In [1]:
import json
# Read the evaluation records from disk; the handle is closed as soon as the
# text has been parsed.
with open("qwen_answer_hotpot_50.json", encoding="utf-8") as file:
    dev_data = json.loads(file.read())

In [16]:
import re
def normalize_answer(s):
    """Lowercase, strip articles and punctuation, and collapse whitespace.

    Args:
        s: Raw answer string.

    Returns:
        The normalized string: lowercase, with the English articles
        "a"/"an"/"the" removed, every run of characters that are not letters
        or digits replaced by a single space, and no leading or trailing
        whitespace. Letters and digits of any script are kept, so an accented
        name such as "Rodríguez" stays a single token.
    """
    import unicodedata

    # Compose combining marks (e.g. "i" + U+0301 -> "í") so a decomposed accent
    # is not mistaken for punctuation and does not split the word in two.
    s = unicodedata.normalize("NFC", s.lower())
    s = re.sub(r'\b(a|an|the)\b', ' ', s)  # drop English articles
    # [\W_] is Unicode-aware for str patterns: it matches everything except
    # letters and digits, unlike an ASCII-only class such as [^a-z0-9].
    s = re.sub(r'[\W_]+', ' ', s)
    return ' '.join(s.split())  # collapse whitespace

def f1_score_hotpot(prediction, ground_truth):
    """Token-level F1 between a predicted answer and the reference answer.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace before comparison.

    Args:
        prediction: Predicted answer text.
        ground_truth: Reference answer text.

    Returns:
        The harmonic mean of token precision and recall as a float in [0, 1];
        0.0 when the two answers share no token (including when either one is
        empty after normalization).
    """
    from collections import Counter

    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()

    # Multiset intersection: a token repeated in both answers counts once per
    # matched occurrence. A plain set intersection would count it once while
    # the denominators below count every occurrence, so two identical answers
    # with a repeated token would score below 1.
    overlap = Counter(pred_tokens) & Counter(gt_tokens)
    num_common = sum(overlap.values())

    if num_common == 0:
        return 0.0

    precision = num_common / len(pred_tokens)
    recall = num_common / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

In [17]:
from sklearn.metrics import f1_score

def _mentions_word(text, word):
    """Return True if `word` occurs in `text` as a whole word, ignoring case."""
    return re.search(r"\b" + re.escape(word) + r"\b", text, flags=re.IGNORECASE) is not None


result = []
for dev in dev_data:
    # Gold answer: the text between the "**Answer**" and "**Summary**" markers.
    answer = dev["answer"].split("**Answer**")[1].split("**Summary**")[0].strip()

    # Keep the segment that follows the first "assistant\n" marker; fall back
    # to the whole text when the marker is missing instead of raising.
    turns = dev["generated_text"].split("assistant\n")
    generated_text = turns[1] if len(turns) > 1 else turns[0]

    if "**Answer**" in generated_text:
        # Cut at "**Summary**" exactly as is done for the gold answer, so that
        # neither the marker nor any summary text leaks into the prediction.
        predict = generated_text.split("**Answer**")[1].split("**Summary**")[0]
    else:
        predict = generated_text

    if answer in ("yes", "no"):
        # Yes/no questions: accept the prediction only when the output
        # mentions the gold label and not the opposite one. Whole-word
        # matching keeps "not", "know", "north", ... from counting as "no".
        opposite = "no" if answer == "yes" else "yes"
        if _mentions_word(generated_text, answer) and not _mentions_word(generated_text, opposite):
            predict = answer
        else:
            predict = ""

    answer = answer.strip()
    predict = predict.strip()
    print(answer)
    print(predict)
    print("----")
    # Argument order follows the signature: (prediction, ground_truth).
    result.append(f1_score_hotpot(predict, answer))

yes

----
Chief of Protocol
Secretary of State for Constitutional Affairs
----
Animorphs
The Hork-Bajir Chronicles
----
no

----
Greenwich Village, New York City
New York City
----
YG Entertainment
YG Entertainment
----
Terry Richardson
Annie Morton
----
yes
yes
----
Kansas Song
The University of Missouri–Kansas City
----
David Weissman
David Weissman
----
1999
1999
----
from 1986 to 2013
1996
----
9,984
9,984
----
the North Atlantic Conference
the Vermont Catamounts men's soccer team currently competes in a conference that was formerly known as the Eastern College Athletic Conference-North from 1979 to 1988 and the North Atlantic Conference from 1988 to 1996
----
yes
yes
----
1969 until 1974
1969–1974
----
Robert Erskine Childers DSC
Robert Erskine Childers
----
Pedro Rodríguez
Sergio Pérez
----
Sonic
Tigger
----
keyboard function keys
Garage Door Opener
----


In [19]:
# Mean of the per-example F1 scores collected in `result`
# (raises ZeroDivisionError when `result` is empty).
sum(result)/len(result)

0.45432539682539685

In [21]:
import json
# Replace `dev_data` with the records stored in the CNN results file; the
# handle is closed as soon as the text has been parsed.
with open("qwen_answer_cnn_50.json", encoding="utf-8") as file:
    dev_data = json.loads(file.read())

In [28]:
!pip install rouge_score

1264.56s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting absl-py (from rouge_score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (pyproject.toml) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c5b71d94707fe0b7f661fa5dda4a1da2bb480d39068a164a09d313b234d8a612
  Stored in directory: /home/rbqlsquf2/.cache/pip/wheels/24/55/6f/ebfc4cb176d1c9665da4e306e1705496206d08215c1acd9dde
Successfully built rouge_score
Installing collected packages: absl-py, rouge_score
Successfully installed absl-py-2.1.0 rouge_score-0.1.2


In [29]:
from rouge_score import rouge_scorer

def calculate_rouge(predicted_summary, reference_summary):
    """Score a predicted summary against its reference with ROUGE.

    Args:
        predicted_summary: Summary text to evaluate.
        reference_summary: Reference text it is compared with.

    Returns:
        Whatever ``RougeScorer.score`` returns for the metrics ``rouge1``,
        ``rouge2`` and ``rougeL``, computed with stemming enabled.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    # The scorer takes the reference first and the prediction second.
    return rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(
        reference_summary, predicted_summary
    )

In [33]:
from sklearn.metrics import f1_score

rouge1, rouge2, rougeL = [], [], []
for dev in dev_data:
    # Reference summary: the text that follows the "**Summary**" marker.
    answer = dev["answer"].split("**Summary**")[1].strip()
    generated_text = dev["generated_text"].split("assistant\n")[1]
    # When the output carries an "**Answer**" marker, keep what follows it and
    # drop the "**Summary**\n" heading; otherwise use the output as it is.
    predict = (
        generated_text.split("**Answer**")[1].replace("**Summary**\n", "")
        if "**Answer**" in generated_text
        else generated_text
    ).strip()

    print(answer, predict, "----", sep="\n")
    rouge_scores = calculate_rouge(predict, answer)
    # Record the F-measure of each metric in its own list.
    for f_scores, metric in ((rouge1, 'rouge1'), (rouge2, 'rouge2'), (rougeL, 'rougeL')):
        f_scores.append(rouge_scores[metric].fmeasure)

Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .
Palestinians joined the International Criminal Court on Wednesday, becoming the 123rd member. The move gives the court jurisdiction over alleged crimes in Palestinian territories.
----
Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field . "She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .
Theia, a dog who was hit by a car, beaten with a hammer and buried. Theia survived. A dog who was hit by a car, beaten with a hammer and buried, has survived. She's now at a veterinary teaching hospital.
----
Mohammad Javad Zarif has spent more time with John Kerry than any other foreign minister . He once participated in a takeover of the Iranian Consulate in San

In [35]:
# Print the mean F-measure of each ROUGE list, one value per line, in the
# order rouge1, rouge2, rougeL.
for f_measures in (rouge1, rouge2, rougeL):
    print(sum(f_measures)/len(f_measures))

0.33227131701560736
0.13398218246023427
0.2444843813224157
