In [27]:
import re
def normalize_answer(s):
    """간단한 토큰화와 정규화"""
    s = s.lower()  # 소문자 변환
    s = re.sub(r'\b(a|an|the)\b', ' ', s)  # 불필요한 관사 제거
    s = re.sub(r'[^a-z0-9]', ' ', s)  # 알파벳과 숫자 외 제거
    return ' '.join(s.split())  # 공백 정리

def exact_match_score(prediction, ground_truth):
    """예측 답과 실제 답 간의 EM 점수 계산"""
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score_hotpot(prediction, ground_truth):
    """예측 답과 실제 답 간의 F1 점수 계산"""
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    
    common_tokens = set(pred_tokens) & set(gt_tokens)
    num_common = len(common_tokens)
    
    if num_common == 0:
        return 0
    
    precision = num_common / len(pred_tokens)
    recall = num_common / len(gt_tokens)
    
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [6]:
import json
with open("result/1105/hotpot_tt_2000.json", "r", encoding="utf-8") as file:
    dev_data = json.load(file)

In [7]:
from sklearn.metrics import f1_score

result_f1 = []
result_em = []
for dev in dev_data:
    predict = ""
    answer = dev["answer"].split("**Summary:")[0].replace("**Answer:", "").replace("<|im_start|>assistant", "")
    generated_text = dev["generated_text"].split("**Summary:")[0].replace("**Answer:", "")
    if answer == "yes":
        if answer in generated_text.lower() and "no" not in generated_text.lower():
            generated_text = "yes"
        else:
            generated_text = ""
    elif answer == "no":
        if answer in generated_text.lower() and "yes" not in generated_text.lower():
            generated_text = "no"
        else:
            generated_text = ""
    answer = answer.strip()
    predict = generated_text.strip()
    print(answer)
    print(predict)
    print("----")
    result_f1.append(f1_score_hotpot(answer, predict))
    result_em.append(exact_match_score(predict, answer))

yes
yes
----
Chief of Protocol
Sericety of State for Constitutional Affairs
----
Animorphs
Animrophs
----
no
yes
----
Greenwich Village, New York City
New York City
----
YG Entertainment
YG Entertainment
----
Eenasul Fateh
Eenasul Gameth
----
3,677 seated
4,000
----
Terry Richardson
Anside Morton
----
yes
yes
----
Kansas Song
Kansas Song
----
David Weissman
David Dewissman
----
1999
1999
----
no
yes
----
from 1986 to 2013
1945 to 1969
----
9,984
9,984
----
the North Atlantic Conference
Eastern Councilamic Association-North
----
yes
yes
----
1969 until 1974
1969 to 1974
----
Robert Erskine Childers DSC
Robert Arksine Childers
----
Pedro Rodríguez
Eudardo Troconis
----
Sonic
The:Herdegos
----
keyboard function keys
Mth Group
----
Badly Drawn Boy
Wolf Alice
----
World's Best Goalkeeper
Gus Williams
----
Barton Lee Hazlewood
LeestHazlewood
----
1838
1838
----
yes
yes
----
Henry J. Kaiser
Henry J. Kaiser
----
Arena of Khazan
Carusaders of Khazan
----
2000
2000
----
Fujioka, Gunma
Finla n
--

In [8]:
# F1 점수와 EM 점수 출력
print("F1 점수: ", sum(result_f1) / len(result_f1))
print("EM 점수: ", sum(result_em) / len(result_em))

F1 점수:  0.413095238095238
EM 점수:  0.28


In [17]:
len(result_f1)

100

# Summary

In [21]:
import json
with open("qwen_answer_cnn_50.json", "r", encoding="utf-8") as file:
    dev_data = json.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'qwen_answer_cnn_50.json'

In [29]:
from rouge_score import rouge_scorer

def calculate_rouge(predicted_summary, reference_summary):
    # ROUGE 계산기 생성 (rouge1, rouge2, rougeL을 사용)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # ROUGE 점수 계산
    scores = scorer.score(reference_summary, predicted_summary)
    
    return scores

In [33]:
from sklearn.metrics import f1_score

rouge1 = []
rouge2 = []
rougeL = []
for dev in dev_data:
    predict = ""
    answer = dev["answer"].split("**Summary**")[1].strip()
    generated_text = dev["generated_text"].split("assistant\n")[1]
    if "**Answer**" in generated_text:
        predict = generated_text.split("**Answer**")[1].replace("**Summary**\n", "")
    else:
        predict = generated_text
    
    answer = answer.strip()
    predict = predict.strip()
    print(answer)
    print(predict)
    print("----")
    rouge_scores = calculate_rouge(predict, answer)
    rouge1.append(rouge_scores['rouge1'].fmeasure)
    rouge2.append(rouge_scores['rouge2'].fmeasure)
    rougeL.append(rouge_scores['rougeL'].fmeasure)

Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .
Palestinians joined the International Criminal Court on Wednesday, becoming the 123rd member. The move gives the court jurisdiction over alleged crimes in Palestinian territories.
----
Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field . "She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .
Theia, a dog who was hit by a car, beaten with a hammer and buried. Theia survived. A dog who was hit by a car, beaten with a hammer and buried, has survived. She's now at a veterinary teaching hospital.
----
Mohammad Javad Zarif has spent more time with John Kerry than any other foreign minister . He once participated in a takeover of the Iranian Consulate in San

In [35]:
print(sum(rouge1)/len(rouge1))
print(sum(rouge2)/len(rouge2))
print(sum(rougeL)/len(rougeL))

0.33227131701560736
0.13398218246023427
0.2444843813224157


# 근거 문장 평가

In [28]:
import json
with open("result/1113_upper/2800.json", "r", encoding="utf-8") as file:
    test_data = json.load(file)

In [29]:
file_path = "data/1029data/hotpot_dev_supporting.json"
with open(file_path, 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

In [30]:
def evaluate_supporting_facts(gold_sp, pred_sp):
    """Supporting facts에 대한 EM, Precision, Recall, F1 점수를 계산하는 함수"""
    # 단일 정수를 리스트로 변환
    gold_sp = [gold_sp] if isinstance(gold_sp, int) else gold_sp
    pred_sp = [pred_sp] if isinstance(pred_sp, int) else pred_sp
    
    # 예측과 정답 집합으로 변환
    gold_set = set(gold_sp)
    pred_set = set(pred_sp)
    
    # True Positives 계산
    tp = len(gold_set & pred_set)
    
    # Precision, Recall 계산
    precision = tp / len(pred_set) if pred_set else 0
    recall = tp / len(gold_set) if gold_set else 0
    
    # F1 점수 계산
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    # Exact Match 계산
    em = 1 if gold_set == pred_set else 0
    
    return em, precision, recall, f1

In [31]:
score = []
all_em_score = []
all_precision_score = []
all_recall_score = []
all_f1_score = []
result_f1 = []
result_em = []
ignore = 0
for dev, data in zip(dev_data, test_data):
    assert dev["_id"] == data["_id"]
    predict = ""
    answer = data["answer"].split("**Summary:")[0].replace("**Answer:", "").replace("<|im_start|>assistant", "").strip()
    generated_text = data["generated_text"].split("**Summary:")[0].replace("**Answer:", "").strip()
    if answer == "yes":
        if answer in generated_text.lower() and "no" not in generated_text.lower():
            generated_text = "yes"
        else:
            generated_text = ""
    elif answer == "no":
        if answer in generated_text.lower() and "yes" not in generated_text.lower():
            generated_text = "no"
        else:
            generated_text = ""
    answer = answer.strip()
    predict = generated_text.strip()
    print(answer)
    print(generated_text)
    print("----")
    result_f1.append(f1_score_hotpot(answer, predict))
    result_em.append(exact_match_score(predict, answer))
    ################################################
    gold_sp = data["gold_sp"]
    pred_sp = data["pred_sp"]
    pred_sp = [x for x in data["pred_sp"] if x != 0]
    em, precision, recall, f1 = evaluate_supporting_facts(gold_sp, pred_sp)
    all_em_score.append(em)
    all_precision_score.append(precision)
    all_recall_score.append(recall)
    all_f1_score.append(f1)
    
    for i in pred_sp:
        if answer == "yes" or answer == "no":
            ignore += 1
            break
        if predict in dev["sent"][i-1]:
            score.append(dev["_id"])
            print(answer)
            print(generated_text)
            print(dev["sent"][i-1])
            print("================")
            break

yes
<|im_end|>
no
----
Chief of Protocol
<|im_end|>
professional acreister
----
Animorphs
<|im_end|>
The Amornorth
----
no
<|im_end|>
no
----
Greenwich Village, New York City
<|im_end|>
Greenwich Village
----
Greenwich Village, New York City
<|im_end|>
Greenwich Village
Adriana Trigiani is an Italian American best-selling author of sixteen books, television writer, film director, and entrepreneur based in Greenwich Village, New York City.
YG Entertainment
<|im_end|>
YG Entertainment
----
YG Entertainment
<|im_end|>
YG Entertainment
Winner (Hangul: 위너), often stylized as WINNER, is a South Korean boy group formed in 2013 by YG Entertainment and debuted in 2014.
Eenasul Fateh
<|im_end|>
Eenasul Hate
----
3,677 seated
<|im_end|>
4,000
----
3,677 seated
<|im_end|>
4,000
The Androscoggin Bank Colisée (formerly Central Maine Civic Center and Lewiston Colisee) is a 4,000 capacity (3,677 seated) multi-purpose arena, in Lewiston, Maine, that opened in 1958.
Terry Richardson
<|im_end|>
Terry Ric

In [32]:
len(score)

38

In [33]:
ignore

0

In [34]:
# F1 점수와 EM 점수 출력
print("F1 점수: ", sum(result_f1) / len(result_f1))
print("EM 점수: ", sum(result_em) / len(result_em))

F1 점수:  0.32729653679653675
EM 점수:  0.0


In [35]:
# F1 점수와 EM 점수 출력
print("all_f1_score 점수: ", sum(all_f1_score) / len(all_f1_score))
print("all_em_score 점수: ", sum(all_em_score) / len(all_em_score))
print("all_precision_score 점수: ", sum(all_precision_score) / len(all_precision_score))
print("all_recall_score 점수: ", sum(all_recall_score) / len(all_recall_score))

all_f1_score 점수:  0.5566190476190469
all_em_score 점수:  0.0
all_precision_score 점수:  0.5166666666666665
all_recall_score 점수:  0.6348333333333332
