<a href="https://colab.research.google.com/github/sangwonkook/kook/blob/main/%EB%A6%AC%EB%B7%B0_%EC%9A%94%EC%95%BD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# 파일 경로 설정
file_path = '/content/drive/MyDrive/수업진행프로젝트/foreign_worker_reviews_dummy_fixed.csv'

In [10]:
# 1. 라이브러리 설치
!pip install transformers sentencepiece

import pandas as pd
from transformers import pipeline

# 2. 데이터 불러오기
df = pd.read_csv("/content/drive/MyDrive/수업진행프로젝트/foreign_worker_reviews_dummy_fixed.csv")

# 3. 기업별 설문(수치형) 요약: 평균 값 집계
survey_summary = df.groupby("기업명")[["평점", "복지", "임금", "근로환경", "안전", "소통"]].mean().round(2)
print("📊 기업별 평균 점수 요약")
print(survey_summary)

# 4. 리뷰 요약 준비
# 리뷰를 기업별로 합쳐서 요약 진행
company_reviews = df.groupby("기업명")["리뷰내용"].apply(lambda x: " ".join(x)).reset_index()

# 5. 다국어 요약 모델 로드
summarizer = pipeline("summarization", model="facebook/mbart-large-cc25")

# 6. 기업별 리뷰 요약
summaries = []
for _, row in company_reviews.iterrows():
    company = row["기업명"]
    text = row["리뷰내용"]

    # 모델 입력 제한(길이 초과 방지)
    text = text[:1000]

    summary = summarizer(text, max_length=60, min_length=10, do_sample=False)
    summaries.append({"기업명": company, "리뷰요약": summary[0]['summary_text']})

summary_df = pd.DataFrame(summaries)

print("\n📝 리뷰 요약 결과")
print(summary_df)


📊 기업별 평균 점수 요약
           평점    복지    임금  근로환경    안전    소통
기업명                                        
광주물류     3.53  3.35  2.71  2.94  3.00  3.59
광주섬유     3.26  3.22  2.52  3.43  3.26  3.09
광주식품     3.00  3.25  3.44  3.00  3.25  2.88
광주자동차부품  3.21  3.00  3.26  3.05  3.53  3.00
광주전자     2.72  2.64  3.16  3.12  3.16  2.92


Device set to use cpu



📝 리뷰 요약 결과
       기업명                                               리뷰요약
0     광주물류  salary is good but communication is good but c...
1     광주섬유  is not well managed. Thời gian làm việc làm vi...
2     광주식품  gian làm việc làm việc làm việc làm việc làm v...
3  광주자동차부품  andandantsantsantsantsants,但是环境还可以。工资不高,安全,有有有...
4     광주전자  salary is good but communication is good but c...


In [11]:
for idx, row in summary_df.iterrows():
    print(f"기업명: {row['기업명']}")
    print(f"리뷰요약: {row['리뷰요약']}\n")


기업명: 광주물류
리뷰요약: salary is good but communication is good but communication is good but communication is good but communication is good but communication is good but communication is good but communication is good but communication is good but communication is hard. 工资不高,但是环境还可以。 Thời gian làm việc quá dài

기업명: 광주섬유
리뷰요약: is not well managed. Thời gian làm việc làm việc làm việc quá dài nhưng đồng nghiệp thân thiện nghiệp thân thiện nghiệp thân thiện nghiệp thân thiện,工作 not well managed,工作 is not well managed,工作 is not well managed,工作 is not well managed,工作,

기업명: 광주식품
리뷰요약: gian làm việc làm việc làm việc làm việc làm việc làm việc làm việc làm việc quá dài dài dài dài dài dài dài. 工资不高, communication is good but communication is good but communication is good but communication is good but communication is good but communication is good but communication

기업명: 광주자동차부품
리뷰요약: andandantsantsantsantsants,但是环境还可以。工资不高,安全,有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有有

기업명: 광주전자
리뷰요약: salary is g

In [12]:
# ====== 0) 설치 ======
!pip install evaluate bert-score nltk rouge_score

import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

import evaluate
rouge = evaluate.load("rouge")           # ROUGE-1/2/L/Lsum
bertscore = evaluate.load("bertscore")   # BERTScore(F1)

# ====== 1) 데이터 불러오기 ======
# 원본 리뷰 데이터
file_path = '/content/drive/MyDrive/수업진행프로젝트/foreign_worker_reviews_dummy_fixed.csv' # Use the file_path from the first cell
df = pd.read_csv(file_path)

# 리뷰 요약 데이터 (앞에서 만든 summary_df 사용)
# summary_df = pd.DataFrame([...])  # 이미 존재한다고 가정

# 회사별 원문 리뷰 합치기
ref_base = df.groupby("기업명")["리뷰내용"].apply(lambda x: " ".join(map(str, x))).reset_index()
ref_base.rename(columns={"리뷰내용":"원문_통합"}, inplace=True)

# 정답요약 컬럼은 없음 → Lead-3로 대체
merged = summary_df.merge(ref_base, on="기업명", how="left")
merged["정답요약_통합"] = np.nan

# ====== 2) 대용 참조요약(Lead-3) 생성 ======
from nltk.tokenize import sent_tokenize

def lead3(text, max_chars=512):
    if not isinstance(text, str) or text.strip()=="":
        return ""
    sents = sent_tokenize(text)
    lead = " ".join(sents[:3])  # 앞 3문장
    return lead[:max_chars]

# 참조요약 생성
merged["참조요약"] = merged.apply(
    lambda r: r["정답요약_통합"] if isinstance(r["정답요약_통합"], str) and r["정답요약_통합"].strip()!=""
    else lead3(r["원문_통합"]),
    axis=1
)

# 빈 값 정리
merged["리뷰요약"] = merged["리뷰요약"].fillna("")
merged["참조요약"] = merged["참조요약"].fillna("")

# ====== 3) ROUGE, BERTScore 계산 ======
preds = merged["리뷰요약"].tolist()
refs  = merged["참조요약"].tolist()

rouge_res = rouge.compute(predictions=preds, references=refs, use_aggregator=True)
rouge_table = {k: round(v*100, 2) for k,v in rouge_res.items()}

bert_res = bertscore.compute(predictions=preds, references=refs, lang="ko", model_type="bert-base-multilingual-cased")
bert_f1_avg = round(float(np.mean(bert_res["f1"])) * 100, 2)

print("=== 📊 전체 평균 점수 ===")
print(f"ROUGE-1: {rouge_table['rouge1']} | ROUGE-2: {rouge_table['rouge2']} | ROUGE-L: {rouge_table['rougeL']} | ROUGE-Lsum: {rouge_table['rougeLsum']}")
print(f"BERTScore-F1: {bert_f1_avg}")

# ====== 4) 회사별(개별) 점수 산출 ======
def compute_rouge_single(p, r):
    res = rouge.compute(predictions=[p], references=[r], use_aggregator=True)
    return {k: v for k,v in res.items()}  # 0~1

def compute_bertscore_single(p, r):
    res = bertscore.compute(predictions=[p], references=[r], lang="ko", model_type="bert-base-multilingual-cased")
    return float(res["f1"][0])

rows = []
for i, row in merged.iterrows():
    rres = compute_rouge_single(row["리뷰요약"], row["참조요약"])
    bres = compute_bertscore_single(row["리뷰요약"], row["참조요약"])
    rows.append({
        "기업명": row["기업명"],
        "ROUGE-1": round(rres["rouge1"]*100, 2),
        "ROUGE-2": round(rres["rouge2"]*100, 2),
        "ROUGE-L": round(rres["rougeL"]*100, 2),
        "ROUGE-Lsum": round(rres["rougeLsum"]*100, 2),
        "BERTScore-F1": round(bres*100, 2),
        "요약_길이토큰": len(row["리뷰요약"].split()),
        "참조_길이토큰": len(row["참조요약"].split()),
    })

company_scores = pd.DataFrame(rows).sort_values("ROUGE-L", ascending=False)
print("\n=== 🏢 회사별 상세 점수(상위 10) ===")
display(company_scores.head(10))

# ====== 5) 보조 지표 ======
def compression_ratio(summary, source):
    sl = max(1, len(summary.split()))
    sr = max(1, len(source.split()))
    return round(sr/sl, 2)

def new_info_ratio(summary, reference):
    s_tokens = set(summary.split())
    r_tokens = set(reference.split())
    if len(s_tokens)==0:
        return 0.0
    return round(100 * len(s_tokens - r_tokens) / len(s_tokens), 2)

def coverage_ratio(summary, reference):
    s_tokens = set(summary.split())
    r_tokens = set(reference.split())
    if len(r_tokens)==0:
        return 0.0
    return round(100 * len(s_tokens & r_tokens) / len(r_tokens), 2)

extras = []
for i, row in merged.iterrows():
    extras.append({
        "기업명": row["기업명"],
        "압축률(원문/요약)": compression_ratio(row["리뷰요약"], row["원문_통합"]),
        "신정보율(%)": new_info_ratio(row["리뷰요약"], row["참조요약"]),
        "커버리지(%)": coverage_ratio(row["리뷰요약"], row["참조요약"]),
    })

extras_df = pd.DataFrame(extras)
result = company_scores.merge(extras_df, on="기업명", how="left")

print("\n=== 🔎 회사별 종합 리포트(일부) ===")
display(result.head(10))

print("\n[안내]")
print("- 정답요약이 없어 Lead-3(앞 3문장) 기반 대용 참조요약으로 평가했습니다. 공식 보고용보다는 탐색적 지표로만 사용하세요.")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


=== 📊 전체 평균 점수 ===
ROUGE-1: 24.61 | ROUGE-2: 21.73 | ROUGE-L: 24.1 | ROUGE-Lsum: 24.21
BERTScore-F1: 65.21

=== 🏢 회사별 상세 점수(상위 10) ===


Unnamed: 0,기업명,ROUGE-1,ROUGE-2,ROUGE-L,ROUGE-Lsum,BERTScore-F1,요약_길이토큰,참조_길이토큰
1,광주섬유,51.11,47.73,51.11,51.11,72.67,43,22
2,광주식품,28.0,22.45,26.0,26.0,69.96,51,23
0,광주물류,24.56,21.82,24.56,24.56,65.79,46,16
4,광주전자,19.35,16.67,19.35,19.35,62.0,54,16
3,광주자동차부품,0.0,0.0,0.0,0.0,55.62,1,19



=== 🔎 회사별 종합 리포트(일부) ===


Unnamed: 0,기업명,ROUGE-1,ROUGE-2,ROUGE-L,ROUGE-Lsum,BERTScore-F1,요약_길이토큰,참조_길이토큰,압축률(원문/요약),신정보율(%),커버리지(%)
0,광주섬유,51.11,47.73,51.11,51.11,72.67,43,22,3.47,27.78,59.09
1,광주식품,28.0,22.45,26.0,26.0,69.96,51,23,1.9,18.18,40.91
2,광주물류,24.56,21.82,24.56,24.56,65.79,46,16,2.15,53.85,40.0
3,광주전자,19.35,16.67,19.35,19.35,62.0,54,16,2.61,0.0,45.45
4,광주자동차부품,0.0,0.0,0.0,0.0,55.62,1,19,108.0,100.0,0.0



[안내]
- 정답요약이 없어 Lead-3(앞 3문장) 기반 대용 참조요약으로 평가했습니다. 공식 보고용보다는 탐색적 지표로만 사용하세요.
