In [None]:
import gdown 
import os.path
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
import pandas as pd

In [None]:
# Relative paths to a local model weights file and its config.
# NOTE(review): no visible cell reads these two names -- the model and tokenizer
# are loaded by the 't5-base' identifier instead. Confirm whether they are still
# needed.
datamodel = 'model/pytorch_model.bin'
config_file = 'model/config.json'

In [None]:
def set_seed(seed):
  """Seed PyTorch's random number generators.

  Always seeds the default generator via ``torch.manual_seed``; when CUDA is
  available, additionally seeds every CUDA device.

  Args:
    seed: Integer seed value handed to PyTorch.
  """
  torch.manual_seed(seed)
  if not torch.cuda.is_available():
    # CPU-only environment: nothing further to seed.
    return
  torch.cuda.manual_seed_all(seed)

# Fix the RNG state before the model is created.
set_seed(42)

# Both the model and its tokenizer come from the same pretrained checkpoint,
# so the identifier is named once and reused.
pretrained_name = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(pretrained_name)
tokenizer = T5Tokenizer.from_pretrained(pretrained_name)

In [None]:
def _t5_generate_text(prompt):
    """Run one text-to-text prompt through the notebook's T5 model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: The prompt to feed to the model; converted with ``str()`` first.

    Returns:
        The decoded text of the first generated sequence, as a string.
    """
    input_ids = tokenizer.encode(str(prompt), return_tensors='pt')
    outputs = model.generate(input_ids=input_ids)
    # Strip special tokens (e.g. <pad>, </s>) so the caller receives only the
    # predicted text rather than text wrapped in tokenizer markers.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def similarity(x):
    """Generate the model's answer for a prepared similarity prompt.

    Args:
        x: The full prompt text (task prefix included); converted with ``str()``.

    Returns:
        The decoded model output as a string.
    """
    return _t5_generate_text(x)


def cola(y):
    """Generate the model's answer for a prepared CoLA (acceptability) prompt.

    Args:
        y: The full prompt text (task prefix included); converted with ``str()``.

    Returns:
        The decoded model output as a string.
    """
    return _t5_generate_text(y)

In [None]:
# Load the synthetic paraphrase pairs; the first CSV column is used as the index.
data = pd.read_csv("synthetic_data/synthetic_paraphrases.csv", index_col=0)

# Build one text-to-text prompt per row for each task. T5's STS-B format puts a
# space after each "sentenceN:" label, matching the CoLA prompt below.
data["Similarity_Prep"] = 'stsb sentence1: ' + data['Inputs'] + ' sentence2: ' + data['Paraphrase'] + ' </s>'
data["Cola_Prep"] = 'cola sentence: ' + data['Paraphrase'] + ' </s>'
data.shape

In [None]:
# Score every prepared prompt, one model call per row, keeping row order.
similarity_values = list(map(similarity, data['Similarity_Prep']))
cola_values = list(map(cola, data['Cola_Prep']))

In [None]:
# The score lists are in the same row order as `data`, so attach data's index
# directly. Building the frames with a default 0..n-1 index and reindexing
# afterwards is a label lookup: any index that is not exactly 0..n-1 would
# yield NaN or misaligned scores.
pd_similarity_value = pd.DataFrame(similarity_values, index=data.index)
pd_cola_value = pd.DataFrame(cola_values, index=data.index)

# Each score frame has a single unnamed column (label 0), so the combined frame
# ends with two columns labelled 0: similarity first, then CoLA.
sim_vals = pd.concat([data, pd_similarity_value], axis=1)
sim_cola_vals = pd.concat([sim_vals, pd_cola_value], axis=1)
sim_cola_vals.head()

In [None]:
# Display the column labels of the combined frame before picking export columns.
sim_cola_vals.columns

In [None]:
# Label 0 occurs twice in sim_cola_vals (one unnamed column per concatenated
# score frame), so selecting it returns both columns in concat order --
# similarity, then CoLA. That is why three selectors produce four columns.
# .copy() makes `export` an independent frame, so later modifications do not
# raise SettingWithCopyWarning.
export = sim_cola_vals[["Inputs", "Paraphrase", 0]].copy()
export.columns = ["Inputs", "Paraphrase", 'Similarity', 'Acceptable']

In [None]:
# Order rows by the 'Acceptable' column, descending. Rebinding the sorted result
# (rather than sorting in place) avoids SettingWithCopyWarning on a frame that
# was derived from another frame by column selection.
export = export.sort_values(by=['Acceptable'], ascending=False)
export.head()

In [None]:
# Write the evaluated paraphrases to disk. With to_csv's defaults the frame's
# index is written as the first CSV column.
export.to_csv('synthetic_data/synthetic_paraphrases_evaluated.csv')