In [24]:
import re
def normalize_answer(s):
    """간단한 토큰화와 정규화"""
    s = s.lower()  # 소문자 변환
    s = re.sub(r'\b(a|an|the)\b', ' ', s)  # 불필요한 관사 제거
    s = re.sub(r'[^a-z0-9]', ' ', s)  # 알파벳과 숫자 외 제거
    return ' '.join(s.split())  # 공백 정리

def exact_match_score(prediction, ground_truth):
    """예측 답과 실제 답 간의 EM 점수 계산"""
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score_hotpot(prediction, ground_truth):
    """예측 답과 실제 답 간의 F1 점수 계산"""
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    
    common_tokens = set(pred_tokens) & set(gt_tokens)
    num_common = len(common_tokens)
    
    if num_common == 0:
        return 0
    
    precision = num_common / len(pred_tokens)
    recall = num_common / len(gt_tokens)
    
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [25]:
import json
with open("result/1103+dataup+loss/hotpot_tt_8000.json", "r", encoding="utf-8") as file:
    dev_data = json.load(file)

In [29]:
from sklearn.metrics import f1_score

result_f1 = []
result_em = []
for dev in dev_data:
    predict = ""
    answer = dev["answer"].split("**Summary:")[0].replace("**Answer:", "").replace("<|im_start|>assistant", "").strip()
    generated_text = dev["generated_text"].split("**Summary:")[0].replace("**Answer:", "")
    if answer == "yes":
        if answer in generated_text.lower() and "no" not in generated_text.lower():
            generated_text = "yes"
        else:
            generated_text = ""
    elif answer == "no":
        if answer in generated_text.lower() and "yes" not in generated_text.lower():
            generated_text = "no"
        else:
            generated_text = ""
    answer = answer.strip()
    predict = generated_text.strip()
    print(answer)
    print(predict)
    print("----")
    result_f1.append(f1_score_hotpot(answer, predict))
    result_em.append(exact_match_score(predict, answer))

yes
yes
----
Chief of Protocol
United States ambassador
----
Animorphs
Animorphs
----
no
no
----
Greenwich Village, New York City
Greenwich Village
----
YG Entertainment
YG Entertainment
----
Eenasul Fateh
James P. Comer
----
3,677 seated
4,000
----
Terry Richardson
Terry Richardson
----
yes
yes
----
Kansas Song
Kansas Song
----
David Weissman
David Weissman
----
1999
1994
----
no

----
from 1986 to 2013
1986 to 2013
----
9,984
9,984
----
the North Atlantic Conference
Eastern College Athletic Conference-North
----
yes
yes
----
1969 until 1974
1969–1974
----
Robert Erskine Childers DSC
Robert Erskine Childers
----
Pedro Rodríguez
Sergio Pérez
----
Sonic
Tigger
----
keyboard function keys
Front Row
----
Badly Drawn Boy
Wolf Alice
----
World's Best Goalkeeper
Peter Schmeichel
----
Barton Lee Hazlewood
Lee Hazlewood
----
1838
1838
----
yes
yes
----
Henry J. Kaiser
Henry J. Kaiser
----
Arena of Khazan
Crusaders of Khazan
----
2000
2000
----
Fujioka, Gunma
Japan
----
Charles Eugène
Charles N

In [30]:
# F1 점수와 EM 점수 출력
print("F1 점수: ", sum(result_f1) / len(result_f1))
print("EM 점수: ", sum(result_em) / len(result_em))

F1 점수:  0.684968253968254
EM 점수:  0.56


In [31]:
len(result_f1)

100

# Summary

In [21]:
import json
with open("qwen_answer_cnn_50.json", "r", encoding="utf-8") as file:
    dev_data = json.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'qwen_answer_cnn_50.json'

In [29]:
from rouge_score import rouge_scorer

def calculate_rouge(predicted_summary, reference_summary):
    # ROUGE 계산기 생성 (rouge1, rouge2, rougeL을 사용)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # ROUGE 점수 계산
    scores = scorer.score(reference_summary, predicted_summary)
    
    return scores

In [33]:
from sklearn.metrics import f1_score

rouge1 = []
rouge2 = []
rougeL = []
for dev in dev_data:
    predict = ""
    answer = dev["answer"].split("**Summary**")[1].strip()
    generated_text = dev["generated_text"].split("assistant\n")[1]
    if "**Answer**" in generated_text:
        predict = generated_text.split("**Answer**")[1].replace("**Summary**\n", "")
    else:
        predict = generated_text
    
    answer = answer.strip()
    predict = predict.strip()
    print(answer)
    print(predict)
    print("----")
    rouge_scores = calculate_rouge(predict, answer)
    rouge1.append(rouge_scores['rouge1'].fmeasure)
    rouge2.append(rouge_scores['rouge2'].fmeasure)
    rougeL.append(rouge_scores['rougeL'].fmeasure)

Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .
Palestinians joined the International Criminal Court on Wednesday, becoming the 123rd member. The move gives the court jurisdiction over alleged crimes in Palestinian territories.
----
Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field . "She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .
Theia, a dog who was hit by a car, beaten with a hammer and buried. Theia survived. A dog who was hit by a car, beaten with a hammer and buried, has survived. She's now at a veterinary teaching hospital.
----
Mohammad Javad Zarif has spent more time with John Kerry than any other foreign minister . He once participated in a takeover of the Iranian Consulate in San

In [35]:
print(sum(rouge1)/len(rouge1))
print(sum(rouge2)/len(rouge2))
print(sum(rougeL)/len(rougeL))

0.33227131701560736
0.13398218246023427
0.2444843813224157


# 근거 문장 평가

In [33]:
import json
with open("result/1103+dataup+loss/hotpot_tt_8000.json", "r", encoding="utf-8") as file:
    test_data = json.load(file)

In [34]:
file_path = "data/1029data/hotpot_dev_supporting.json"
with open(file_path, 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

In [44]:
def em_precision_recall(predicted, ground_truth):
    # Exact Match
    em = int(predicted == ground_truth)
    
    # Precision and Recall
    predicted_set = set(predicted)
    ground_truth_set = set(ground_truth)
    
    true_positives = len(predicted_set & ground_truth_set)
    
    precision = true_positives / len(predicted_set) if predicted_set else 0
    recall = true_positives / len(ground_truth_set) if ground_truth_set else 0
    
    return em, precision, recall


In [45]:
score = []
all_em = []
all_precision = []
all_recall = []
ignore = 0
for dev, data in zip(dev_data, test_data):
    assert dev["_id"] == data["_id"]
    predict = ""
    answer = data["answer"].split("**Summary:")[0].replace("**Answer:", "").replace("<|im_start|>assistant", "").strip()
    generated_text = data["generated_text"].split("**Summary:")[0].replace("**Answer:", "")
    if answer == "yes":
        if answer in generated_text.lower() and "no" not in generated_text.lower():
            generated_text = "yes"
        else:
            generated_text = ""
    elif answer == "no":
        if answer in generated_text.lower() and "yes" not in generated_text.lower():
            generated_text = "no"
        else:
            generated_text = ""
    answer = answer.strip()
    predict = generated_text.strip()
    result_f1.append(f1_score_hotpot(answer, predict))
    result_em.append(exact_match_score(predict, answer))
    ################################################
    gold_sp = data["gold_sp"]
    pred_sp = data["pred_sp"]
    em, precision, recall = em_precision_recall(pred_sp, gold_sp)
    all_em.append(em)
    all_precision.append(precision)
    all_recall.append(recall)
    # print(f"EM: {em}, Precision: {precision:.2f}, Recall: {recall:.2f}")
    
    for i in pred_sp:
        if answer == "yes" or answer == "no":
            ignore += 1
            break
        if predict in dev["sent"][i-1]:
            score.append(dev["_id"])
            print(answer)
            print(generated_text)
            print(dev["sent"][i-1])
            print("================")
            break

Greenwich Village, New York City
Greenwich Village

Adriana Trigiani is an Italian American best-selling author of sixteen books, television writer, film director, and entrepreneur based in Greenwich Village, New York City.
David Weissman
David Weissman

The Family Man is a 2000 American romantic comedy-drama film directed by Brett Ratner, written by David Diamond and David Weissman, and starring Nicolas Cage and Téa Leoni.
Robert Erskine Childers DSC
Robert Erskine Childers

His double first cousin and close friend was Robert Erskine Childers.
Fujioka, Gunma
Japan

Kyo (京 , Kyō ) is a Japanese musician, poet and singer-songwriter.
Letters to Cleo
Screaming Trees

Beat Happening/Screaming Trees is an EP and a one-off collaboration between Beat Happening (from Olympia, Washington) and Screaming Trees (from Ellensburg, Washington).
New York City
New York City

Columbia University (Columbia; officially Columbia University in the City of New York), established in 1754, is a private Ivy Lea

In [42]:
len(score)

16

In [43]:
ignore

12

In [38]:
# F1 점수와 EM 점수 출력
print("F1 점수: ", sum(result_f1) / len(result_f1))
print("EM 점수: ", sum(result_em) / len(result_em))

F1 점수:  0.684968253968254
EM 점수:  0.56


In [46]:
# F1 점수와 EM 점수 출력
print("all_em 점수: ", sum(all_em) / len(all_em))
print("all_precision 점수: ", sum(all_precision) / len(all_precision))
print("all_recall 점수: ", sum(all_recall) / len(all_recall))


all_em 점수:  0.0
all_precision 점수:  0.04999999999999999
all_recall 점수:  0.06083333333333333
