In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from scipy.stats import pearsonr
import torch
import numpy as np
import json
import pandas as pd
import re
import unicodedata
from transformers import BertTokenizer, BertPreTrainedModel

In [None]:
def data_preproc(paragrahp:str):
    """
    Clean one sentence before it is tokenized.

    Steps, in order:
    1. Remove every parenthesised span together with its brackets.
    2. Remove e-mail addresses.
    3. Remove web addresses.
    4. Replace every character outside the allowed set with a space and
       collapse runs of whitespace into a single space.
    5. Apply Unicode NFKD normalization.

    Addresses are removed before the character filter because the filter
    turns '@', '-', '_', '=', '#' and '&' into spaces, which would split an
    address into pieces that the address patterns can only partly match.

    NOTE(review): if the model was trained on text cleaned by an earlier
    version of this function, apply the same version to the training data.

    Args:
        paragrahp: raw input text.

    Returns:
        The cleaned, NFKD-normalized text.
    """
    # Strip innermost bracket pairs until none are left. A greedy r'\(.*\)'
    # would also delete the ordinary text between two separate pairs.
    previous = None
    while previous != paragrahp:
        previous = paragrahp
        paragrahp = re.sub(r'\([^()]*\)', '', paragrahp)

    # E-mail addresses go first; the web-address pattern below accepts '@'
    # and would otherwise consume only part of an address.
    email_pattern = r"[a-zA-Z0-9+_.\-]+@[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)+"
    paragrahp = re.sub(email_pattern, "", paragrahp)

    # Raw string: the pattern is unchanged, but the backslashes are no longer
    # invalid string escapes.
    url_pattern = r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"
    paragrahp = re.sub(url_pattern, "", paragrahp)

    # Keep only the allowed punctuation, ASCII letters, digits and Hangul
    # syllables; everything else becomes a space.
    patten = r"[^ .,·?!:'”%/()A-Za-z0-9가-힣+]"
    paragrahp = re.sub(patten, " ", paragrahp)

    # Collapse whitespace runs (including gaps left by the removals above).
    paragrahp = " ".join(paragrahp.split())
    paragrahp = unicodedata.normalize("NFKD", paragrahp)
    return paragrahp

In [None]:
# Read the KLUE-STS dev split; the context manager closes the file afterwards.
with open("./klue-sts-data/klue-sts-v1.1_dev.json", "rt", encoding='utf8') as dev_file:
    dev_data = json.load(dev_file)

In [None]:
# Build the frame from complete columns in one pass. This avoids writing
# strings row by row into an all-NaN float placeholder and lets pandas infer
# each column's dtype from its own values.
dev_df = pd.DataFrame(
    {
        'sentence1': [el['sentence1'] for el in dev_data],
        'sentence2': [el['sentence2'] for el in dev_data],
        'label': [el['labels']['real-label'] for el in dev_data],
    },
    columns=['sentence1', 'sentence2', 'label'],
)

# Clean both sentence columns. Series.map is used per column because
# DataFrame.applymap is deprecated in recent pandas releases.
for col in ('sentence1', 'sentence2'):
    dev_df[col] = dev_df[col].map(data_preproc)

In [None]:
# Vocabulary file used to build the BERT tokenizer.
vocab_path = "./config_files/vocab_snu_subchar12367.txt"
tokenizer_krbert_sub = BertTokenizer.from_pretrained(vocab_path)



In [None]:
# One [sentence1, sentence2] list per dev example, passed to the tokenizer as a batch.
sentence_pairs = dev_df[['sentence1', 'sentence2']].values.tolist()

# Truncate to at most 128 tokens, pad to the longest sequence in the batch,
# and return PyTorch tensors.
tokenizer_options = dict(
    truncation=True,
    padding="longest",
    max_length=128,
    return_tensors="pt",
)
devset_token = tokenizer_krbert_sub(sentence_pairs, **tokenizer_options)

In [None]:
# Load the TorchScript model onto the CPU: the tokenized inputs are never moved
# to another device and the predictions are later converted with .numpy(),
# so the model has to run on the CPU even if it was saved from a GPU.
# eval() returns the module itself and puts it in evaluation mode for scoring.
traced_model = torch.jit.load("./torch_model_final.pt", map_location="cpu").eval()

In [None]:
# Inference only: with autograd disabled no graph is recorded for the forward
# pass over the dev set, and the output does not require grad, so it can be
# converted to NumPy afterwards.
with torch.no_grad():
    result = traced_model(**devset_token)

In [None]:
# Gold similarity scores as a NumPy array, in the same row order as the predictions.
label = dev_df['label'].to_numpy()

In [None]:
# Convert the model output to a flat NumPy vector.
# The isinstance guard keeps this cell safe to re-run: on a second run
# `result` is already an ndarray, which has no .numpy() method.
# detach()/cpu() make the conversion work even if the tensor requires grad
# or lives on another device.
if isinstance(result, torch.Tensor):
    result = result.detach().cpu().numpy()
result = np.asarray(result).flatten()

In [None]:
# pearsonr returns the correlation coefficient and its p-value, in that order.
pearson_stats = pearsonr(label, result.flatten())
print("Pearson r: {:.2f} \nP-value: {:.2e}".format(*pearson_stats))

Pearson r: 0.86 
P-value: 3.16e-154


In [None]:
# Scores of 3 or more count as the positive class (1), everything else as 0.
# The same cut-off is applied to the gold labels and to the predictions.
BINARY_THRESHOLD = 3
bin_lable = np.where(label >= BINARY_THRESHOLD, 1, 0)
bin_result = np.where(result >= BINARY_THRESHOLD, 1, 0)

In [None]:
# F1 of the binarized predictions against the binarized gold labels.
dev_f1 = f1_score(bin_lable, bin_result)
print("F1 score:", dev_f1)

F1 score: 0.7903225806451614


In [None]:
# Read the dev-set score table from disk; its two prediction columns are
# overwritten with the values computed in this notebook further down.
dev_result = pd.read_csv(filepath_or_buffer="./dev_set_score.csv")

In [None]:
# Show the score table as loaded from disk, before the prediction columns are replaced.
dev_result

Unnamed: 0,guid,true_real_label,true_binary_label,predict_real_label,predict_binary_label
0,klue-sts-v1_dev_00000,4.857143,1,4.895295,1
1,klue-sts-v1_dev_00001,1.428571,0,2.828100,0
2,klue-sts-v1_dev_00002,1.285714,0,2.525229,0
3,klue-sts-v1_dev_00003,3.714286,1,4.196025,1
4,klue-sts-v1_dev_00004,2.500000,0,3.240267,1
...,...,...,...,...,...
514,klue-sts-v1_dev_00514,2.200000,0,3.525137,1
515,klue-sts-v1_dev_00515,2.833333,0,3.430123,1
516,klue-sts-v1_dev_00516,0.333333,0,0.453891,0
517,klue-sts-v1_dev_00517,0.333333,0,0.337194,0


In [None]:
# Replace the stored real-valued predictions with this run's flattened model output.
# NOTE(review): assumes the CSV rows are in the same order as the dev JSON examples — confirm.
dev_result['predict_real_label'] = result

In [None]:
# Replace the stored binary predictions with this run's thresholded model output.
dev_result['predict_binary_label'] = bin_result

In [None]:
# Write the updated score table; index=False keeps the row index out of the file.
dev_result.to_csv(
    '기업과제3_4팀_dev_set_score.csv',
    index=False,
    encoding='utf8',
)