In [1]:
import re
def normalize_answer(s):
    """간단한 토큰화와 정규화"""
    s = s.lower()  # 소문자 변환
    s = re.sub(r'\b(a|an|the)\b', ' ', s)  # 불필요한 관사 제거
    s = re.sub(r'[^a-z0-9]', ' ', s)  # 알파벳과 숫자 외 제거
    return ' '.join(s.split())  # 공백 정리

def exact_match_score(prediction, ground_truth):
    """예측 답과 실제 답 간의 EM 점수 계산"""
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score_hotpot(prediction, ground_truth):
    """예측 답과 실제 답 간의 F1 점수 계산"""
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    
    common_tokens = set(pred_tokens) & set(gt_tokens)
    num_common = len(common_tokens)
    
    if num_common == 0:
        return 0
    
    precision = num_common / len(pred_tokens)
    recall = num_common / len(gt_tokens)
    
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [10]:
import json
with open("result/hotpot_cnn/hotpot_3000_tt.json", "r", encoding="utf-8") as file:
    dev_data = json.load(file)

In [11]:
from sklearn.metrics import f1_score

result_f1 = []
result_em = []
for dev in dev_data:
    predict = ""
    answer = dev["answer"]
    generated_text = dev["generated_text"].split("**Summary:")[0].replace("**Answer:", "")
    if answer == "yes":
        if answer in generated_text.lower() and "no" not in generated_text.lower():
            generated_text = "yes"
        else:
            generated_text = ""
    elif answer == "no":
        if answer in generated_text.lower() and "yes" not in generated_text.lower():
            generated_text = "no"
        else:
            generated_text = ""
    answer = answer.strip()
    predict = generated_text.strip()
    print(answer)
    print(predict)
    print("----")
    result_f1.append(f1_score_hotpot(answer, predict))
    result_em.append(exact_match_score(predict, answer))

yes
yes
----
Chief of Protocol
United States ambassador
----
Animorphs
Animorphs
----
no
no
----
Greenwich Village, New York City
Greenwich Village, New York City
----
YG Entertainment
YG Entertainment
----
Eenasul Fateh
Aladin
----
Terry Richardson
Terry Richardson
----
yes

----
Kansas Song
Kansas Song
----
David Weissman
David Weissman
----
1999
1999
----
from 1986 to 2013
1945 to 1969
----
9,984
9,984
----
the North Atlantic Conference
Eastern College Athletic Conference-North
----
yes
yes
----
1969 until 1974
1969 until 1974
----
Robert Erskine Childers DSC
Robert Erskine Childers
----
Pedro Rodríguez
Sergio Pérez
----
Sonic
Tigger
----
keyboard function keys
NetSupport Manager
----
Badly Drawn Boy
Badly Drawn Boy
----
World's Best Goalkeeper
World's Best Goalkeeper
----
Barton Lee Hazlewood
Barton Lee Hazlewood
----
1838
1838
----
yes

----
Henry J. Kaiser
Henry J. Kaiser
----
Arena of Khazan
Adventure
----
2000
March 14, 2000
----
Fujioka, Gunma
Fujioka, Gunma
----
Charles Eugèn

In [12]:
# F1 점수와 EM 점수 출력
print("F1 점수: ", sum(result_f1) / len(result_f1))
print("EM 점수: ", sum(result_em) / len(result_em))

F1 점수:  0.6789206349206347
EM 점수:  0.57


In [13]:
len(result_f1)

100

In [21]:
import json
with open("qwen_answer_cnn_50.json", "r", encoding="utf-8") as file:
    dev_data = json.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'qwen_answer_cnn_50.json'

In [29]:
from rouge_score import rouge_scorer

def calculate_rouge(predicted_summary, reference_summary):
    # ROUGE 계산기 생성 (rouge1, rouge2, rougeL을 사용)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # ROUGE 점수 계산
    scores = scorer.score(reference_summary, predicted_summary)
    
    return scores

In [33]:
from sklearn.metrics import f1_score

rouge1 = []
rouge2 = []
rougeL = []
for dev in dev_data:
    predict = ""
    answer = dev["answer"].split("**Summary**")[1].strip()
    generated_text = dev["generated_text"].split("assistant\n")[1]
    if "**Answer**" in generated_text:
        predict = generated_text.split("**Answer**")[1].replace("**Summary**\n", "")
    else:
        predict = generated_text
    
    answer = answer.strip()
    predict = predict.strip()
    print(answer)
    print(predict)
    print("----")
    rouge_scores = calculate_rouge(predict, answer)
    rouge1.append(rouge_scores['rouge1'].fmeasure)
    rouge2.append(rouge_scores['rouge2'].fmeasure)
    rougeL.append(rouge_scores['rougeL'].fmeasure)

Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .
Palestinians joined the International Criminal Court on Wednesday, becoming the 123rd member. The move gives the court jurisdiction over alleged crimes in Palestinian territories.
----
Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field . "She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .
Theia, a dog who was hit by a car, beaten with a hammer and buried. Theia survived. A dog who was hit by a car, beaten with a hammer and buried, has survived. She's now at a veterinary teaching hospital.
----
Mohammad Javad Zarif has spent more time with John Kerry than any other foreign minister . He once participated in a takeover of the Iranian Consulate in San

In [35]:
print(sum(rouge1)/len(rouge1))
print(sum(rouge2)/len(rouge2))
print(sum(rougeL)/len(rougeL))

0.33227131701560736
0.13398218246023427
0.2444843813224157
