In [2]:
import os
import pandas as pd
import re
from datetime import datetime
import json

### Data Collection

In [2]:
# file input and output
data_dir = "articles/"  
output_dir = "output/"

# create the file if it doesn't exist 
os.makedirs(output_dir, exist_ok=True)

In [3]:
# listing out the PDF names 
pdf_files = [f for f in os.listdir(data_dir) if f.endswith('.pdf')]
print(f"There are {len(pdf_files)} PDF files:")
for i, file in enumerate(pdf_files, 1): # start counting from 1 instead of 0 
    print(f"{i}. {file}")

There are 11 PDF files:
1. '1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf
2. _공정한 사교육 기회 만들겠다_… 유명 학원 인강 버젓이 무단 배포 - 아시아경제.pdf
3. 공부보다 사람.pdf
4. 기타강사 성추행.pdf
5. 변형문제 팔아 6개월새 1억 번 학원강사…비결은 '저작권 혁신' - 머니투데이.pdf
6. 웹툰 학원 강사.pdf
7. 정원에서 새로 인생 써내려간 ‘일타 강사’[정성갑의 공간의 재발견]｜동아일보.pdf
8. 족보닷컴 학원 선생님을 위한 감사 이벤트.pdf
9. 최교진 유아 영어학원 유치원 형태 반대.pdf
10. 칠판 앞에선 선생님 급여 앞에선 을.pdf
11. 학원에 문제 팔고 뒷돈받은 교사들.pdf


In [4]:
# test opening one file
def extract_text_from_pdf(file_path):
    """ Extracting text from a PDF file. """
    # opening file path 
    doc = fitz.open(file_path)
    full_text = ""   # empty string to store the text 

    # adding text page by page
    for page in doc:
        text = page.get_text()
        full_text += text
        
    doc.close()   # close the file
    
    return full_text

pdf_1_text = extract_text_from_pdf("articles/'1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf")
pdf_1_text

'홈»  종합\n‘1타 강사’ 정승제→윤혜정, 교육 격자 현실에 ‘깜짝’ (‘공부불안’)\n \n \n105\n0\n공유\n[TV리포트=김진수 기자] 오는 20일(월) 밤 9시 55분, EBS1에서 방송되는 EBS ‘다큐프라임 – 공부불안’의 ‘2부. 전교 1\n등인데, 왜 안되나요?’에서는 지역 교육 격차의 실체를 파헤친다.\n지금 당장 인기상품을 확인하세요!\n국어 1타 윤혜정, 수학 1타 정승제, 배우 봉태규가 직접 교육 현장을 찾아가 학생들과 학부모의 이야기를 들으며 ‘대한\n민국 교육의 민낯’을 보여준다. 이 방송에서는 농촌 지역 일반고의 최상위권 성적을 가진 학생들이 수능 최저 등급을\n맞추지 못해 의대 진학의 꿈이 흔들리는 현실을 조명한다.\n한 교감 선생님이 보낸 편지에서는 내신 최상위권인 학생들도 수능에서 저조해 대입에 실패하는 경우가 많다고 간절\n히 호소한다. 학생들의 심리적 무기력함과 불안감이 심각하게 우려된다는 메시지다. 제작진이 해당 학교로 찾아가 모\n의고사 결과를 확인하면, 영어 성적이 4~5등급으로 나오며 충격을 안긴다.\n리폿@\n스타\n가요\nTV\n영화\n해외\n인터뷰\n이슈\n종합\n엔터\n리뷰\n포토\n김진수 기자\n2025.10.17\n조회수 \xa0\n0\n10/24/25, 5:50 PM\n\'1타 강사\' 정승제→윤혜정, 교육 격자 현실에 \'깜짝\' (\'공부불안\')\nhttps://tvreport.co.kr/hot-issue/article/947207/\n1/3\n서울 자사고와 지방 일반고의 실제 내신 시험지를 비교 분석한 결과, 서울 자사고는 수능 기출문제로 구성된 시험을\n보았으나 지방고는 모든 학생의 수준에 맞춰 쉽게 출제되었다. 학생들은 내신과 수능 대비를 따로 준비해야 하는 이중\n고를 겪고 있었다.\n또한 공교육의 일부는 ‘파행 교육과정’이 문제라고 지적되고 있다. 일부 자사고에서는 고1 때 모든 수학 진도를 끝내도\n록 커리큘럼을 짜놓았다. 정승제는 이러한 교육 방식은 과거 학습 없이 따라갈 수 없

In [5]:
def extract_text_from_folder(file_path): 
    """ Extract all of the texts from PDF files in a folder """
    # creating an empy list to store data from different articles 
    articles = []
    pdf_files = [f for f in os.listdir(file_path) if f.endswith('.pdf')]

    for files in pdf_files: 
        full_path = os.path.join(file_path, files)
        print(f"Processing {files}")
        text_file = extract_text_from_pdf(full_path)

        article_dictionary = {
            "Article Name": files, 
            "Text": text_file
        }
        articles.append(article_dictionary)
        print("Appended File!")
        
    return articles

article_text = extract_text_from_folder("articles/")
print(article_text[0])

Processing '1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf
Appended File!
Processing _공정한 사교육 기회 만들겠다_… 유명 학원 인강 버젓이 무단 배포 - 아시아경제.pdf
Appended File!
Processing 공부보다 사람.pdf
Appended File!
Processing 기타강사 성추행.pdf
Appended File!
Processing 변형문제 팔아 6개월새 1억 번 학원강사…비결은 '저작권 혁신' - 머니투데이.pdf
Appended File!
Processing 웹툰 학원 강사.pdf
Appended File!
Processing 정원에서 새로 인생 써내려간 ‘일타 강사’[정성갑의 공간의 재발견]｜동아일보.pdf
Appended File!
Processing 족보닷컴 학원 선생님을 위한 감사 이벤트.pdf
Appended File!
Processing 최교진 유아 영어학원 유치원 형태 반대.pdf
Appended File!
Processing 칠판 앞에선 선생님 급여 앞에선 을.pdf
Appended File!
Processing 학원에 문제 팔고 뒷돈받은 교사들.pdf
Appended File!
{'Article Name': "'1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf", 'Text': '홈»  종합\n‘1타 강사’ 정승제→윤혜정, 교육 격자 현실에 ‘깜짝’ (‘공부불안’)\n \n \n105\n0\n공유\n[TV리포트=김진수 기자] 오는 20일(월) 밤 9시 55분, EBS1에서 방송되는 EBS ‘다큐프라임 – 공부불안’의 ‘2부. 전교 1\n등인데, 왜 안되나요?’에서는 지역 교육 격차의 실체를 파헤친다.\n지금 당장 인기상품을 확인하세요!\n국어 1타 윤혜정, 수학 1타 정승제, 배우 봉태규가 직접 교육 현장을 찾아가 학생들과 학부모의 이야기를 들으며 ‘대한\n민국 교육의 민낯’을 보여준다. 이 방송에서는 농촌 지역 일반고의 최상위

In [6]:
filename = "extracted_articles.xlsx"

# checking if the file already exists. 
if os.path.exists(filename):
    print(f"{filename} already exists. Skipping save.")
else:
    # convert to dataframe and save it as excel
    df = pd.DataFrame(article_text)
    df.to_excel(filename, index=False, engine='openpyxl')
    print(f"Successfully saved {len(article_text)} articles to {filename}")

extracted_articles.xlsx already exists. Skipping save.


### Data Processing

In [12]:
from google.cloud import translate_v2 as translate
import os

# use the path to the key
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = #####

translate_client = translate.Client()

# testing if my API works 
test_text = "1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안')"
test_result = translate_client.translate(test_text, source_language='ko', target_language='en')
print(f"Test translation: {test_text} → {test_result['translatedText']}")

Test translation: 1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안') → &quot;First-class instructors&quot; Jeong Seung-je and Yoon Hye-jeong are &quot;surprised&quot; by the reality of the educational grid (&quot;study anxiety&quot;).


In [24]:
def extract_dates_from_articles(article_text):
    """Extract only dates from already-extracted Korean articles"""
    
    articles_with_dates = []
    
    for article in article_text:
        print(f"Extracting date from: {article['Article Name']}")
        
        korean_text = article['Text']
        date_found = 'Not found'
        
        # look for date patterns in Korean text
        date_patterns = [
            r'(\d{4})\.(\d{1,2})\.(\d{1,2})',  # 2025.10.17
            r'(\d{4})-(\d{1,2})-(\d{1,2})',    # 2025-10-17
            r'(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일',  # 2025년 10월 17일
            r'입력\s*(\d{4})\.(\d{1,2})\.(\d{1,2})',  # 입력 2025.10.17
            r'승인\s*(\d{4})\.(\d{1,2})\.(\d{1,2})',  # 승인 2025.10.17
        ]
        
        for pattern in date_patterns:
            match = re.search(pattern, korean_text)
            if match:
                groups = match.groups()
                year, month, day = groups[-3], groups[-2], groups[-1]  # Get last 3 groups
                date_found = f"{day}.{month.zfill(2)}.{year}"
                break
        
        # add date to article metadata
        enhanced_article = {
            'Article Name': article['Article Name'],
            'Date': date_found
        }
        
        articles_with_dates.append(enhanced_article)
        print(f"Date: {date_found}")
    
    return articles_with_dates

In [25]:
articles_with_dates = extract_dates_from_articles(article_text)
articles_with_dates

Extracting date from: '1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf
Date: 17.10.2025
Extracting date from: _공정한 사교육 기회 만들겠다_… 유명 학원 인강 버젓이 무단 배포 - 아시아경제.pdf
Date: 17.10.2025
Extracting date from: 공부보다 사람.pdf
Date: 20.10.2025
Extracting date from: 기타강사 성추행.pdf
Date: 16.10.2025
Extracting date from: 변형문제 팔아 6개월새 1억 번 학원강사…비결은 '저작권 혁신' - 머니투데이.pdf
Date: 23.10.2025
Extracting date from: 웹툰 학원 강사.pdf
Date: 23.10.2025
Extracting date from: 정원에서 새로 인생 써내려간 ‘일타 강사’[정성갑의 공간의 재발견]｜동아일보.pdf
Date: 25.09.2025
Extracting date from: 족보닷컴 학원 선생님을 위한 감사 이벤트.pdf
Date: 17.10.2025
Extracting date from: 최교진 유아 영어학원 유치원 형태 반대.pdf
Date: 20.10.2025
Extracting date from: 칠판 앞에선 선생님 급여 앞에선 을.pdf
Date: 11.10.2025
Extracting date from: 학원에 문제 팔고 뒷돈받은 교사들.pdf
Date: 11.10.2025


[{'Article Name': "'1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf",
  'Date': '17.10.2025'},
 {'Article Name': '_공정한 사교육 기회 만들겠다_… 유명 학원 인강 버젓이 무단 배포 - 아시아경제.pdf',
  'Date': '17.10.2025'},
 {'Article Name': '공부보다 사람.pdf', 'Date': '20.10.2025'},
 {'Article Name': '기타강사 성추행.pdf', 'Date': '16.10.2025'},
 {'Article Name': "변형문제 팔아 6개월새 1억 번 학원강사…비결은 '저작권 혁신' - 머니투데이.pdf",
  'Date': '23.10.2025'},
 {'Article Name': '웹툰 학원 강사.pdf', 'Date': '23.10.2025'},
 {'Article Name': '정원에서 새로 인생 써내려간 ‘일타 강사’[정성갑의 공간의 재발견]｜동아일보.pdf',
  'Date': '25.09.2025'},
 {'Article Name': '족보닷컴 학원 선생님을 위한 감사 이벤트.pdf', 'Date': '17.10.2025'},
 {'Article Name': '최교진 유아 영어학원 유치원 형태 반대.pdf', 'Date': '20.10.2025'},
 {'Article Name': '칠판 앞에선 선생님 급여 앞에선 을.pdf', 'Date': '11.10.2025'},
 {'Article Name': '학원에 문제 팔고 뒷돈받은 교사들.pdf', 'Date': '11.10.2025'}]

In [13]:
import html

# translate all articles with html decoding - i want to remove the "&quot" texts and just keep the translated text
translated_articles = []

for i, article in enumerate(article_text):
    print(f"\nTranslating ({i+1}/{len(article_text)}): {article['Article Name']}")
    
    try:
        # translate the article's text
        result = translate_client.translate(
            article['Text'],
            source_language='ko',
            target_language='en'
        )
        
        # decode html entities 
        clean_translated_text = html.unescape(result['translatedText'])
        
        translated_article = {
            'Article Name': article['Article Name'],
            'Original Text': article['Text'],  # original 
            'Translated Text': clean_translated_text  # translated text
        }
        translated_articles.append(translated_article)
        
        print(f"Completed: {article['Article Name']}") # debugging
        
    except Exception as e:
        print(f"Error with {article['Article Name']}: {e}")  # debugging
        continue

print(f"\nSuccessfully translated {len(translated_articles)} articles")


Translating (1/11): '1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf
Completed: '1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf

Translating (2/11): _공정한 사교육 기회 만들겠다_… 유명 학원 인강 버젓이 무단 배포 - 아시아경제.pdf
Completed: _공정한 사교육 기회 만들겠다_… 유명 학원 인강 버젓이 무단 배포 - 아시아경제.pdf

Translating (3/11): 공부보다 사람.pdf
Completed: 공부보다 사람.pdf

Translating (4/11): 기타강사 성추행.pdf
Completed: 기타강사 성추행.pdf

Translating (5/11): 변형문제 팔아 6개월새 1억 번 학원강사…비결은 '저작권 혁신' - 머니투데이.pdf
Completed: 변형문제 팔아 6개월새 1억 번 학원강사…비결은 '저작권 혁신' - 머니투데이.pdf

Translating (6/11): 웹툰 학원 강사.pdf
Completed: 웹툰 학원 강사.pdf

Translating (7/11): 정원에서 새로 인생 써내려간 ‘일타 강사’[정성갑의 공간의 재발견]｜동아일보.pdf
Completed: 정원에서 새로 인생 써내려간 ‘일타 강사’[정성갑의 공간의 재발견]｜동아일보.pdf

Translating (8/11): 족보닷컴 학원 선생님을 위한 감사 이벤트.pdf
Completed: 족보닷컴 학원 선생님을 위한 감사 이벤트.pdf

Translating (9/11): 최교진 유아 영어학원 유치원 형태 반대.pdf
Completed: 최교진 유아 영어학원 유치원 형태 반대.pdf

Translating (10/11): 칠판 앞에선 선생님 급여 앞에선 을.pdf
Completed: 칠판 앞에선 선생님 급여 앞에선 을.pdf

Translating (11/11): 학원에 문제 팔고 뒷돈받은 교사들.pdf
Completed: 학원에 문제 팔고 뒷돈받

In [40]:
# add dates to translated articles 
for trans_article in translated_articles:
    # match by article name and add date
    for date_article in articles_with_dates:
        if trans_article['Article Name'] == date_article['Article Name']:
            trans_article['Date'] = date_article['Date']
            break

In [41]:
def check_for_ocr_errors(translated_articles):
    """Check for common OCR and translation issues"""
    for i, article in enumerate(translated_articles):
        print(f"\n--- Checking Article {i+1}: {article['Article Name']} ---")
        
        text = article['Translated Text']
        
        # checking for HTML entities 
        if '&quot;' in text or '&amp;' in text or '&lt;' in text:
            print("HTML entities found")
        
        # check for broken text from the translation 
        if '�' in text or '□' in text or '○' in text:
            print("Broken/placeholder characters found")
        
        # checking whitespace 
        if '   ' in text or '\n\n\n' in text:
            print("Excessive whitespace detected")
        
        # check for key terms
        key_terms = ['teacher', 'instructor', 'hagwon', 'academy', 'student']
        found_terms = [term for term in key_terms if term.lower() in text.lower()]
        print(f"Found important key terms: {', '.join(found_terms)}")
        
        if len(found_terms) < 2:
            print("Few education-related terms found")

# running the error check 
check_for_ocr_errors(translated_articles)


--- Checking Article 1: '1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf ---
Found important key terms: instructor, student

--- Checking Article 2: _공정한 사교육 기회 만들겠다_… 유명 학원 인강 버젓이 무단 배포 - 아시아경제.pdf ---
Found important key terms: instructor, academy, student

--- Checking Article 3: 공부보다 사람.pdf ---
Found important key terms: teacher, academy, student

--- Checking Article 4: 기타강사 성추행.pdf ---
Found important key terms: instructor, academy, student

--- Checking Article 5: 변형문제 팔아 6개월새 1억 번 학원강사…비결은 '저작권 혁신' - 머니투데이.pdf ---
Found important key terms: teacher, instructor, academy, student

--- Checking Article 6: 웹툰 학원 강사.pdf ---
Found important key terms: instructor, academy

--- Checking Article 7: 정원에서 새로 인생 써내려간 ‘일타 강사’[정성갑의 공간의 재발견]｜동아일보.pdf ---
Found important key terms: instructor
Few education-related terms found

--- Checking Article 8: 족보닷컴 학원 선생님을 위한 감사 이벤트.pdf ---
Found important key terms: teacher, instructor, academy, student

--- Checking Article 9: 최교진 유아 영어학원 유치원 형태 반대.pdf -

#### Saving the translated texts for future purposes

In [42]:
# create dataframe
df = pd.DataFrame(translated_articles)

# save to excel
filename = "translated_articles.xlsx"

# check first
if os.path.exists(filename):
    print(f"{filename} already exists. Skipping save.")
else:
    df.to_excel(filename, index=False, engine='openpyxl')
    print(f"Saved {len(translated_articles)} articles to {filename}")
    print(f"Columns: {list(df.columns)}")

✅ Saved 11 articles to translated_articles.xlsx
Columns: ['Article Name', 'Original Text', 'Translated Text', 'Date']


In [7]:
# load translated excel file
df = pd.read_excel("translated_articles.xlsx")

# convert back to list of dictionaries format
translated_articles = df.to_dict('records')

#### LLM Modeling

In [3]:
# for the purpose of this task, I will be using Anthropic's Claude API since I believe it performs well 
# I also had to create a seperate environment because I had issues initializing the anthropic package. It clashed with google translate
import anthropic

os.environ['ANTHROPIC_API_KEY'] = "#####"

# Initialize the client
client = anthropic.Anthropic()

#### I focused heavily on prompt engineering. I used detailed prompts by trying narrow down the specifics much as possible by providing clear instructions on output examples, filtering instructions and what to avoid. 

In [5]:
def analyze_with_claude(article):
    article_name = article['Article Name']
    article_date = article.get('Date', 'Date not found')
    article_text = article['Translated Text'][:10000]
    
    prompt = f"""
    You are assisting research on occupational stigma toward South Korean hagwon teachers (“학원강사”) who teach academic subjects (math, English, science, SAT, ACT, etc.).

    ARTICLE INFORMATION:
    - Article: {article_name}
    - Date: {article_date}

    GOAL:
    Identify and categorize quotes from the article that show how hagwon teachers are portrayed. Only focus on academic subject teachers.

    FILTERING & INCLUSION INSTRUCTIONS:
    - Only include quotes directly about academic hagwon teachers.
    - Exclude quotes about:
      • Hagwon owners, managers, administrators, non-teaching staff
      • The hagwon industry/organization as a whole
      • Non-academic/recreational teachers (art, music, taekwondo, sports, swimming)
      • Students or parents (even if teachers are mentioned tangentially)
      • Student performance, student behavior, or student outcomes unless the quote explicitly evaluates the teacher
    - Acceptable terms include "hagwon teachers," "instructors," "subject teachers," or role-specific references in academic contexts.
    - Do NOT paraphrase or summarize. Only copy exact textual excerpts.

    Examples of what to EXCLUDE:
    × "Students at hagwons study until midnight" (about students, not teachers)
    × "Parents send their children to the best instructors" (about parent behavior)
    × "Top students seek out star teachers" (focus is on students)
    × "The hagwon industry exploits both teachers and students" (about industry)
    
    Examples of what to INCLUDE:
    ✓ "The instructor showed exceptional dedication to struggling students" (evaluates teacher behavior)
    ✓ "Hagwon teachers often work 12-hour days" (directly about teachers)
    ✓ "Many teachers feel trapped in their positions" (teacher experience/status)

    CATEGORIES AND "LOOK FOR":
    NEGATIVE PORTRAYALS
    A. Social Taint - Association with devalued social groups or "stuck" status of teachers
      • Look for: Characterizations as "stuck in job," failed professionals, low social status associations, trapped in degrading position
      • Example: "middle-aged immigrant man, stuck in the cab industry because there was nothing else"

    B. Moral Taint - Perception of immoral, deceptive, norm-violating, harmful, or lawless practices by teachers.
      • Look for: mentions of teachers engaged in
        - Deceptive/unethical practices: exploitation, greed, misconduct
        - Harm to students or society: dangerous practices, harmful teaching methods
        - Lawlessness: illegal activity, lack of regulation, unethical business practices
      • Be sure to distinguish between portrayals of organizational/owner wrongdoing (EXCLUDE) and portrayals of the teachers' own misconduct.
      • Example: "routinely charged excessive rates", "operates outside regulations"

    C. Physical Taint - Unsanitary, dirty, or physically contaminating associations of teachers or their working environments
      • Look for: Dirty environments, poor hygiene, physically unpleasant working conditions
      • Example: "cramped, poorly ventilated classrooms"

    POSITIVE PORTRAYALS
    E. Moral Value/Pro-social Actions
      • Look for: Descriptions of expertise, helping students, dedication to education, going above and beyond, ethical behavior, positive impact on students
      • Examples: "dedicated to student success," "expert instruction," "transformed struggling students"

    F. Social Status/Recognition
      • Look for: Invitations to political activities, high-profile media exposure in neutral or positive settings, recognition as experts, respected community members, elevated social standing
      • Examples: "invited to speak at education forum," "recognized as leading educator," "sought after for expertise"

    G. Professional Competence
      • Look for: References to skills, qualifications, training, effectiveness, professional development
      • Examples: "highly qualified instructors," "specialized training in pedagogy"

    NEUTRAL PORTRAYALS
    H. Rights Advocacy/Labor Issues
      • Look for: Discussions of teachers as deserving of better treatment or problematization of any maltreatment they face
      • Examples: "hagwon teachers should be protected from exploitative hagwon owners,” “concern for hagwon teachers’ intellectual property”

    I. Descriptive/Matter-of-fact
      • Look for: Neutral descriptions of work, demographics, industry information without evaluative language
      • Examples: "hagwon teachers prepare students for exams," "approximately X number work in the sector"

    STEP-BY-STEP:
    1. Examine the article and extract quotes about academic hagwon teachers per the above framework.
    2. Categorize each quote using the definitions and look-for cues above.
    3. Skip any quote not clearly about academic subject hagwon teachers or fitting the above categories.

    OUTPUT FORMAT:
    For each category, list relevant quotes in the following format:
    [Category Letter]. "English translation of the quote" ({article_date}, {article_name})
    If no relevant quotes, simply write "No result" for that category.

    Example:
    A. Social Taint:
    - "Hagwon teachers are not socially recognized." (13.05.2022, 'Hagwon Teachers’ Realities')

    Article Text (English translation for context):
    {article_text}

    BEFORE SUBMITTING:
    - Double-check each quote is about academic hagwon teachers.
    - Double-check you copied quotes directly and did not paraphrase.
    """
    
    response = client.messages.create(
        model="claude-3-haiku-20240307",  # used haiku, cheapest and fastest
        max_tokens=2000,
        temperature=0.2,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    
    return response.content[0].text

In [8]:
# initialize lists for different categories 
analysis_text = []

# process all articles
for i, article in enumerate(translated_articles):
    print(f"Analyzing article {article['Article Name']}")
    try:
        # claude analysis
        analysis = analyze_with_claude(article)
        analysis_text.append(analysis)
        
        print(f"Completed")
        
    except Exception as e: # debugging
        print(f"Error: {e}")

Analyzing article '1타 강사' 정승제→윤혜정, 교육 격자 현실에 '깜짝' ('공부불안').pdf
Completed
Analyzing article _공정한 사교육 기회 만들겠다_… 유명 학원 인강 버젓이 무단 배포 - 아시아경제.pdf
Completed
Analyzing article 공부보다 사람.pdf
Completed
Analyzing article 기타강사 성추행.pdf
Completed
Analyzing article 변형문제 팔아 6개월새 1억 번 학원강사…비결은 '저작권 혁신' - 머니투데이.pdf
Completed
Analyzing article 웹툰 학원 강사.pdf
Completed
Analyzing article 정원에서 새로 인생 써내려간 ‘일타 강사’[정성갑의 공간의 재발견]｜동아일보.pdf
Completed
Analyzing article 족보닷컴 학원 선생님을 위한 감사 이벤트.pdf
Completed
Analyzing article 최교진 유아 영어학원 유치원 형태 반대.pdf
Completed
Analyzing article 칠판 앞에선 선생님 급여 앞에선 을.pdf
Completed
Analyzing article 학원에 문제 팔고 뒷돈받은 교사들.pdf
Completed


In [9]:
social_taint = []
moral_taint = []
physical_taint = []
moral_value = []
social_status = []
professional_competence = []
rights_advocacy = []
descriptive = []

# dictionary for category mapping
cat_map = {
    "A": social_taint,
    "B": moral_taint,
    "C": physical_taint,
    "E": moral_value,
    "F": social_status,
    "G": professional_competence,
    "H": rights_advocacy,
    "I": descriptive,
}

In [15]:
import re

def parse_analysis(analysis_texts):
    """
    Parse analysis texts that contain categorized quotes.
    Each item in analysis_texts should be a string with categories and quotes.
    """
    categories = {
        "A": [],  # Social Taint
        "B": [],  # Moral Taint
        "C": [],  # Physical Taint
        "E": [],  # Moral Value
        "F": [],  # Social Status
        "G": [],  # Professional Competence
        "H": [],  # Rights Advocacy
        "I": []   # Descriptive
    }
    
    for idx, analysis in enumerate(analysis_texts):
        current_cat = None
        
        # Split by lines
        lines = analysis.split('\n')
        
        for line in lines:
            line = line.strip()
            if not line:
                continue
            
            # detect the category 
            cat_match = re.match(r'^([A-I])\.\s+(.+?):', line)
            if cat_match:
                letter = cat_match.group(1)
                if letter in categories:
                    current_cat = letter
                continue
            
            # skip if no category
            if not current_cat:
                continue
            
            # skip no results
            if 'no result' in line.lower():
                continue
            
            # extracting quotes
            quote_match = re.match(r'^"(.+?)"\s+\((.+?),\s*(.+?\.pdf)\)', line)
            
            if quote_match:
                quote = quote_match.group(1).strip()
                date = quote_match.group(2).strip()
                source = quote_match.group(3).strip()
                
                entry = {
                    'Article': source,
                    'Date': date,
                    'Quote': quote
                }
                categories[current_cat].append(entry)
    
    return categories

results = parse_analysis(analysis_text)

In [16]:
# output folder
out_folder = 'categorized_quotes'
os.makedirs(out_folder, exist_ok=True)

# file naming
code_to_name = {
    'A': 'Social_Taint',
    'B': 'Moral_Taint',
    'C': 'Physical_Taint',
    'E': 'Moral_Value',
    'F': 'Social_Status',
    'G': 'Professional_Competence',
    'H': 'Rights_Advocacy',
    'I': 'Descriptive'
}

# loop and save files as excel files
for code, quotes in results.items():
    if not quotes:
        print(f"No quotes found for category {code} ({code_to_name[code]})")
        continue
    
    df = pd.DataFrame(quotes)
    fname = f"{code}_{code_to_name[code]}.xlsx"
    path = os.path.join(out_folder, fname)
    df.to_excel(path, index=False)
    print(f"Saved {len(quotes)} quotes to {path}")

print(f"\nAll non-empty categories saved into '{out_folder}/'")

Saved 1 quotes to categorized_quotes\A_Social_Taint.xlsx
Saved 2 quotes to categorized_quotes\B_Moral_Taint.xlsx
No quotes found for category C (Physical_Taint)
Saved 11 quotes to categorized_quotes\E_Moral_Value.xlsx
No quotes found for category F (Social_Status)
Saved 3 quotes to categorized_quotes\G_Professional_Competence.xlsx
Saved 4 quotes to categorized_quotes\H_Rights_Advocacy.xlsx
Saved 10 quotes to categorized_quotes\I_Descriptive.xlsx

All non-empty categories saved into 'categorized_quotes/'
