In [None]:
# !pip install --upgrade pip
# !pip install evaluate
# !pip install sentence_transformers

In [None]:
import torch
import sys
import numpy as np
 
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load the pre-trained GPT-2 weights. Use the GPU when one is present and fall
# back to the CPU otherwise, so loading does not crash on machines without CUDA.
# (Wrapping the load in torch.no_grad() had no effect on later inference, so it
# is not used here; gradient tracking has to be disabled where the model is called.)
GPT2_model = GPT2LMHeadModel.from_pretrained('gpt2')
GPT2_model = GPT2_model.to('cuda' if torch.cuda.is_available() else 'cpu')
# Inference only: put layers such as dropout into evaluation mode.
GPT2_model.eval()
# Load the matching pre-trained tokenizer (vocabulary).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 
def GPT2_score(sentence):
    """Return the GPT-2 perplexity of ``sentence``.

    The sentence is encoded with the module-level ``tokenizer`` and passed to
    ``GPT2_model`` with the token ids used as both inputs and labels; the loss
    the model returns is exponentiated.

    Args:
        sentence: Text to score.

    Returns:
        ``exp(loss)`` as a NumPy floating-point value.

    Note:
        Very short inputs are not special-cased here; callers in this notebook
        skip empty strings before calling.
    """
    # Inputs longer than the model's position table cannot be embedded, so only
    # the first ``n_positions`` tokens are scored instead of crashing.
    token_ids = tokenizer.encode(sentence)[:GPT2_model.config.n_positions]
    # Build the batch on whatever device the model lives on rather than
    # assuming CUDA is available.
    input_ids = torch.tensor([token_ids], device=GPT2_model.device)
    # Scoring only: without no_grad the forward pass records an autograd graph
    # that is never used.
    with torch.no_grad():
        loss = GPT2_model(input_ids, labels=input_ids)[0]
    return np.exp(loss.cpu().detach().numpy())
#print(GPT2_score(sentence='Humans have many basic needs  and one of them is to have an environment that can sustain their lives.'))

In [None]:
# The metrics package is imported under an alias because this notebook later
# defines its own `evaluate()` function. With a plain `import evaluate`,
# re-running this cell would rebind `evaluate` to the module and break every
# later call to the function.
import evaluate as hf_evaluate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# BLEU metric object used by the evaluation function.
bleu = hf_evaluate.load("bleu")
# Sentence encoder whose embeddings are compared with cosine similarity.
sim_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
import pickle

In [None]:
def similarity(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Each sentence is encoded separately with the module-level ``sim_model``;
    the two embeddings are compared with ``cosine_similarity`` and the single
    entry of the resulting 1x1 matrix is returned.
    """
    embeddings = [sim_model.encode(text) for text in (sentence1, sentence2)]
    # cosine_similarity expects 2-D inputs, so each embedding is passed as a
    # one-row list.
    score_matrix = cosine_similarity(embeddings[:1], embeddings[1:])
    return score_matrix[0][0]

def _load_pickled_items(path, limit):
    """Unpickle the sequence stored at ``path`` and keep its first ``limit`` items."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # only point this at files you produced yourself.
    with open(path, 'rb') as f:
        return pickle.load(f)[:limit]


def evaluate(preds, refs, max_samples=1000, with_ppl=False):
    """Score pickled predictions against pickled references.

    Note: this function shares its name with the `evaluate` metrics package
    imported earlier in the notebook, so ``bleu`` must already be loaded before
    this definition takes over the name.

    Args:
        preds: Path to a pickle file holding the sequence of predicted sentences.
        refs: Path to a pickle file holding the reference sentences, aligned
            item by item with the predictions.
        max_samples: Only the first ``max_samples`` items of each file are
            scored (``None`` scores everything). Defaults to 1000.
        with_ppl: When true, also compute the mean ``GPT2_score`` over the
            non-empty predictions. This is off by default because it is
            expensive and the two-value callers never used it.

    Returns:
        ``(mean_similarity, [bleu1, bleu2, bleu3, bleu4])`` by default, or
        ``(mean_similarity, mean_perplexity, [bleu1, ..., bleu4])`` when
        ``with_ppl`` is true. ``mean_perplexity`` is NaN if every prediction
        is empty.

    Raises:
        ValueError: If the two files yield different numbers of items, or no
            items at all.
    """
    predictions = _load_pickled_items(preds, max_samples)
    references = _load_pickled_items(refs, max_samples)

    # Fail early with a clear message instead of an IndexError in the loop or
    # a ZeroDivisionError when averaging.
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references ({len(references)}) "
            "must contain the same number of items"
        )
    if len(references) == 0:
        raise ValueError("no sentence pairs to evaluate")

    sim_scores = [similarity(pred, ref) for pred, ref in zip(predictions, references)]
    mean_sim = sum(sim_scores) / len(sim_scores)

    # The BLEU metric takes a list of references per prediction.
    wrapped_refs = [[ref] for ref in references]
    bleus = [
        bleu.compute(predictions=predictions, references=wrapped_refs, max_order=order)['bleu']
        for order in range(1, 5)
    ]

    if not with_ppl:
        return mean_sim, bleus

    # Empty predictions cannot be scored by the language model, so skip them.
    ppl_scores = [GPT2_score(sentence=pred) for pred in predictions if len(pred) != 0]
    mean_ppl = sum(ppl_scores) / len(ppl_scores) if ppl_scores else float('nan')
    return mean_sim, mean_ppl, bleus

In [None]:
# evaluate() returns (mean similarity, BLEU list); bind the names in that order
# so `sim` and `bleus` really hold what they say.
sim, bleus = evaluate(
    'politeness_transfer/direct/trg_direct_lists.pkl',
    'politeness_transfer/direct/src_direct_lists.pkl',
)
print(sim)
print(bleus)

In [None]:
# evaluate() returns (mean similarity, BLEU list); bind the names in that order
# so `sim` and `bleus` really hold what they say.
sim, bleus = evaluate(
    'politeness_transfer/direct/pred_direct_lists.pkl',
    'politeness_transfer/direct/src_direct_lists.pkl',
)
print(sim)
print(bleus)

In [None]:
# evaluate() returns (mean similarity, BLEU list); bind the names in that order
# so `sim` and `bleus` really hold what they say.
sim, bleus = evaluate(
    'politeness_transfer/direct/pred_direct_lists.pkl',
    'politeness_transfer/direct/trg_direct_lists.pkl',
)
print(sim)
print(bleus)

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'politeness_transfer/synthetic_test_polite_tagandgen.pkl',
    'politeness_transfer/direct/src_direct_lists.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'politeness_transfer/synthetic_test_polite_tagandgen.pkl',
    'politeness_transfer/direct/trg_direct_lists.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# evaluate() returns (mean similarity, BLEU list); bind the names in that order
# so `sim` and `bleus` really hold what they say.
sim, bleus = evaluate(
    'politeness_transfer/gen/pred_gen_lists.pkl',
    'politeness_transfer/direct/src_direct_lists.pkl',
)
print(sim)
print(bleus)

In [None]:
# evaluate() returns (mean similarity, BLEU list); bind the names in that order
# so `sim` and `bleus` really hold what they say.
sim, bleus = evaluate(
    'politeness_transfer/gen/pred_gen_lists.pkl',
    'politeness_transfer/direct/trg_direct_lists.pkl',
)
print(sim)
print(bleus)

In [None]:
# evaluate() returns (mean similarity, BLEU list); bind the names in that order
# so `sim` and `bleus` really hold what they say.
sim, bleus = evaluate(
    'politeness_transfer/direct/P_0_test_polite_direct.pkl',
    'polite_multicls_classifier/P_0_test.pkl',
)
print(sim)
print(bleus)

In [None]:
# evaluate() returns (mean similarity, BLEU list); bind the names in that order
# so `sim` and `bleus` really hold what they say.
sim, bleus = evaluate(
    'politeness_transfer/gen/P_0_test_polite_gen.pkl',
    'polite_multicls_classifier/P_0_test.pkl',
)
print(sim)
print(bleus)

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune-polite/polite_dlg_responses_direct_pred_lists.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Bind the BLEU list to `bleus`, the name that is printed below. Previously the
# result went to `res_bleu` and the cell printed a stale `bleus` left over from
# an earlier cell.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/dlg_responses_direct_pred_lists.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim)
print(bleus)

In [None]:
# Bind the BLEU list to `bleus`, the name that is printed below. Previously the
# result went to `res_bleu` and the cell printed a stale `bleus` left over from
# an earlier cell.
sim, bleus = evaluate(
    'dialog/daily_dialog/without-finetune/dlg_responses_wo_pred_lists.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim)
print(bleus)

In [None]:
# evaluate() returns (mean similarity, BLEU list); bind the names in that order
# so `sim` and `bleus` really hold what they say.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune-polite/polite_dlg_responses_direct_pred_lists.pkl',
    'dialog/daily_dialog/responses/test_polite_res_direct.pkl',
)
print(sim)
print(bleus)

In [None]:
# evaluate() returns (mean similarity, BLEU list); bind the names in that order
# so `sim` and `bleus` really hold what they say.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/dlg_responses_direct_pred_lists.pkl',
    'dialog/daily_dialog/responses/test_polite_res_direct.pkl',
)
print(sim)
print(bleus)

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/responses/test_polite_res_direct.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/responses/tag-gen/test_polite_res.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/DialoGPT_finetune_lastmodel.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/DialoGPT_finetune_lastmodel_polite.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/GPT2_finetune_last.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# evaluate(preds, refs) returns two values (mean similarity, BLEU list), so
# unpacking three names raised ValueError. The perplexity is computed here
# instead, following the average that evaluate() leaves commented out.
preds_path = 'dialog/daily_dialog/finetune/GPT2_finetune_last_polite.pkl'
sim, bleus = evaluate(preds_path, 'dialog/daily_dialog/responses/test_res.pkl')
with open(preds_path, 'rb') as f:
    # Same 1000-item cap that evaluate() applies; empty predictions are skipped
    # because they cannot be scored.
    scored_preds = [p for p in pickle.load(f)[:1000] if len(p) != 0]
ppl_values = [GPT2_score(sentence=p) for p in scored_preds]
PPL_Score = sum(ppl_values) / len(ppl_values) if ppl_values else float('nan')
print(sim)
print(PPL_Score)
print(bleus)

In [None]:
# evaluate(preds, refs) returns two values (mean similarity, BLEU list);
# unpacking three names here raised ValueError. The perplexity print was
# already disabled in this cell, so only the two returned values are used.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/GPT2_finetune_last_polite.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim)
print(bleus)

In [None]:
# resps = list()
# with open('dialog/daily_dialog/responses/test_res.pkl', 'rb') as f:
#         resps = pickle.load(f)
# print(len(resps))

# with open('dialog/daily_dialog/responses/test_res.txt', 'w') as f:
#     for line in resps:
#         f.write(f"{line}\n")

# with open(r"dialog/daily_dialog/responses/test_res.txt", 'r') as fp:
#     lines = len(fp.readlines())
#     print(lines)

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/without-finetune/dlg_responses_wo_pred_lists_GPT2.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/without-finetune/dlg_responses_wo_pred_lists_dialoGPT.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/GPT2_finetune_best.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/GPT2_finetune_polite_best.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/DialoGPT_finetune_best.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/DialoGPT_finetune_polite_best.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune/GPT2_finetune_tag-gen_polite.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')

In [None]:
# Score the pickled predictions against the pickled references, then show the
# mean similarity followed by the BLEU list.
sim, bleus = evaluate(
    'dialog/daily_dialog/finetune-polite/Blenderbot_polite_pred_tag-gen.pkl',
    'dialog/daily_dialog/responses/test_res.pkl',
)
print(sim, bleus, sep='\n')