### Checking CPU


In [None]:
# Print the CPU details of the current runtime (shell command via IPython `!`).
!lscpu

### Mounting Drive


In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

### Imports

In [None]:
# Install the third-party packages this notebook needs on a fresh runtime.
# NOTE(review): versions are not pinned, so a re-run may pull newer releases
# than the ones this notebook was written against — consider pinning.
!pip install transformers datasets fastt5

In [None]:
import pandas as pd
import time
import torch
from tqdm.auto import tqdm
from fastT5 import export_and_get_onnx_model, get_onnx_model
from transformers import T5ForConditionalGeneration,T5Tokenizer

### Loading Data

In [None]:
# Read the original QA/paragraph pairs, discard the columns that are not
# needed downstream, and expose `theme_y` under the plain name `theme`.
train = (
    pd.read_csv('/content/drive/MyDrive/Inter_IIT/Datasets/qa_paras.csv')
    .drop(columns=['paragraph_id', 'theme_x'])
    .rename(columns={'theme_y': 'theme'})
)

In [None]:
# Load the generated QA pairs that will be appended after `train`.
gen_data = pd.read_csv('/content/drive/MyDrive/Inter_IIT/Datasets/generated_data.csv')

# Number the generated rows so they continue directly after the rows of
# `train`. The range is derived from the actual frame sizes: a hard-coded
# range(944, 5500) makes `set_index` raise as soon as the CSV row count
# changes, and produces overlapping or gapped labels if `train` is not
# exactly 944 rows long.
index = pd.RangeIndex(start=len(train), stop=len(train) + len(gen_data))
gen_data = gen_data.set_index(index)

# Drop the answer-span/id columns, then keep exactly the columns of `train`
# (in the same order) so the two frames can be stacked.
gen_data = gen_data.drop(columns=['ans_start', 'ans_end', 'id'])
gen_data = gen_data[train.columns]

In [None]:
# Stack the original and generated rows into one frame. `ignore_index=True`
# relabels the result 0..n-1 regardless of the labels the inputs carry, which
# is what the later `df[column][i]` lookups with `i in range(df.shape[0])`
# rely on.
df = pd.concat([train, gen_data], axis=0, ignore_index=True)

### Loading Model

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
# NOTE(review): the active model below is the ONNX one; confirm that moving
# input tensors to a CUDA `device` is acceptable for it on a GPU runtime.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Drive folder holding the previously exported ONNX model files.
onnx_model_path = "/content/drive/MyDrive/Inter_IIT/Models/models_paraphrase"
model_name = 'ramsrigouthamg/t5_paraphraser'
# One-time export step (kept for reference): converts the checkpoint to ONNX.
# Not needed while the exported files already exist under `onnx_model_path`.
# model = export_and_get_onnx_model('ramsrigouthamg/t5_paraphraser')
# Load the saved, quantized ONNX model from `onnx_model_path`.
model = get_onnx_model(model_name, onnx_models_path=onnx_model_path, quantized=True)
# Tokenizer for the same checkpoint as `model_name`.
tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_paraphraser')
# Alternative for running on GPU: load the PyTorch checkpoint instead of the
# ONNX model and move it to `device`.
# model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser')
# model.to(device)


### Paraphrasing Questions


In [None]:
def generate_ques(sentence, max_len=256, num_return_sequences=5):
    """Sample paraphrases of ``sentence`` from the loaded paraphrasing model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        sentence: Text to paraphrase (a ``str``).
        max_len: ``max_length`` forwarded to ``model.generate``.
        num_return_sequences: Number of candidates requested per call.

    Returns:
        A list of decoded candidates in generation order, without exact
        duplicates and without any candidate that equals ``sentence`` when
        compared case-insensitively. The list may be empty.
    """
    text = "paraphrase: " + sentence + " </s>"

    # `padding="max_length"` / `truncation=True` are the explicit replacements
    # for the deprecated `pad_to_max_length` flag (which the original passed a
    # truthy string): pad to 512 tokens and cut longer inputs at 512.
    encoding = tokenizer(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    # Sampling-based generation (top-k / top-p); results vary between calls.
    sampled_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        do_sample=True,
        max_length=max_len,
        top_k=120,
        top_p=0.98,
        early_stopping=True,
        num_return_sequences=num_return_sequences,
    )

    original_lower = sentence.lower()
    final_outputs = []
    for output in sampled_outputs:
        sent = tokenizer.decode(
            output, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        # Keep only new candidates that actually differ from the input.
        if sent.lower() != original_lower and sent not in final_outputs:
            final_outputs.append(sent)

    return final_outputs

In [None]:
# Parallel output columns: one entry per generated paraphrase, each paired
# with the answer, paragraph and theme of the question it was derived from.
ques = []
ans = []
para = []
theme = []

# Walk the rows positionally instead of looking labels up with a running
# counter (`df[col][i]`), so the loop does not depend on the index of `df`
# being exactly 0..n-1.
rows = zip(df['question'], df['answer'], df['paragraph'], df['theme'])
for question, answer, paragraph, row_theme in tqdm(rows, total=df.shape[0]):
    for paraphrase in generate_ques(question):
        ques.append(paraphrase)
        ans.append(answer)
        para.append(paragraph)
        theme.append(row_theme)

In [None]:
# Assemble the collected lists into a table (columns in the order question,
# answer, paragraph, theme) and save it to Drive without the index column.
data = dict(zip(('question', 'answer', 'paragraph', 'theme'), (ques, ans, para, theme)))
dataframe = pd.DataFrame(data)
dataframe.to_csv(
    '/content/drive/MyDrive/Inter_IIT/Datasets/paraphrased_data.csv',
    index=False,
)