In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

In [2]:
df = pd.read_csv("{path to train data}")
df_val = pd.read_csv("{path to val data}")
df_test = pd.read_csv("{path to test data}")

In [None]:
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

In [None]:
!pip install -U transformers[torch]
!pip install -U huggingface_hub
! pip install evaluate
!pip install bert_score
!pip install git+https://github.com/google-research/bleurt.git

In [5]:
import torch
from torch.utils.data import Dataset, random_split, DataLoader
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel, DataCollatorForLanguageModeling #Seq2SeqTrainingArguments, Seq2SeqTrainer
import accelerate
import evaluate
from torch import nn
from transformers.trainer_pt_utils import get_parameter_names
import re

In [None]:
bert = evaluate.load("bertscore")
bleurt = evaluate.load("bleurt", module_type="metric", checkpoint="BLEURT-20")

In [7]:
torch.manual_seed(42)
from transformers import set_seed
set_seed(42)

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>', sep_token='<|sep|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

In [9]:
datacollator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False)

In [10]:
class QADataset(Dataset):
    def __init__(self, data, tokenizer):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for index, row in data.iterrows():
            encodings_dict = tokenizer('<|startoftext|>' + row["question"] + '<|sep|>' + row["answer"] + '<|endoftext|>', truncation=True, max_length=300, padding="max_length", return_tensors='pt')
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx]

In [11]:
class ValQADataset(Dataset):
    def __init__(self, data, tokenizer):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for index, row in data.iterrows():
          encodings_dict = tokenizer('<|startoftext|>' + row["question"] + '<|sep|>', truncation=True, max_length=120, padding="max_length", return_tensors='pt')
          if((encodings_dict['input_ids'][0][119] != torch.Tensor(np.array([50259]))) and (encodings_dict['input_ids'][0][119] != torch.Tensor(np.array([50258])))):
            encodings_dict['input_ids'][0] = torch.cat((encodings_dict['input_ids'][0][np.r_[:119]],torch.Tensor(np.array([50258]))),0)
          self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
          self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx]

In [12]:
class TestQADataset(Dataset):
    def __init__(self, data, tokenizer):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for index, row in data.iterrows():
          encodings_dict = tokenizer('<|startoftext|>' + row["question"] + '<|sep|>', truncation=True, max_length=120, return_tensors='pt')
          self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
          self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx]

In [None]:
train_dataset = QADataset(df, tokenizer)
val_dataset = ValQADataset(df_val,tokenizer)

In [14]:
def compute_metrics(eval_pred, eval_dataset, df):
    decoded_preds = []
    references = [df['answer1'], df['answer2']]
    refs = []
    bleurt_preds = []
    bleurt_refs = []

    for i, sample_input in enumerate(eval_dataset):
        temp_input = sample_input[0][sample_input[0] !=torch.Tensor(np.array([50259]))]
        temp_input = temp_input[None, :]
        #metric_outputs = model.generate(temp_input.cuda(), min_new_tokens = 60, max_new_tokens = 200, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
        metric_outputs = model.generate(temp_input.cuda(), min_new_tokens = 60, max_new_tokens = 200, penalty_alpha=0.6, top_k=4, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
        decoded_preds.append(tokenizer.decode(metric_outputs[0]))

    final_preds =[]
    for j in range(len(decoded_preds)):
        lst = decoded_preds[j].split('<|sep|>')
        if (len(lst) >= 2):
            temp_decoded = lst[1].replace("<|endoftext|>","")
            final_preds.append(temp_decoded.split('___')[0])
            bleurt_preds.append(temp_decoded.split('___')[0])
            bleurt_preds.append(temp_decoded.split('___')[0])
        else:
          print('{}:{}'.format(j, lst[0]))
        temp_refs = []
        for k in range(len(references)):
            temp_refs.append(references[k][j])
            bleurt_refs.append(references[k][j])
        refs.append(temp_refs)

    results_bert = bert.compute(predictions=final_preds, references=refs, lang="en")
    results_bleurt = bleurt.compute(predictions=bleurt_preds, references=bleurt_refs)
    
    precision_sum = 0
    recall_sum = 0
    f1_sum = 0
    bleurt_sum = 0
    length_bert = len(results_bert['precision'])
    for i in range(length_bert):
        precision_sum += results_bert['precision'][i]
        recall_sum += results_bert['recall'][i]
        f1_sum += results_bert['f1'][i]
        bleurt_sum += results_bleurt['scores'][i]

    return {'bert_precision': precision_sum/length_bert, 'bert_recall' : recall_sum/length_bert, 'bert_f1' : f1_sum/length_bert, 'bleurt' : bleurt_sum/length_bert}

In [15]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/output",
    num_train_epochs=15,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    eval_steps = 400,
    save_steps=800,
    warmup_steps=500,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    weight_decay=0.01,
    #metric_for_best_model = "bleurt",
    metric_for_best_model = "bert_f1",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    save_total_limit = 1
    )

In [16]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [17]:
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, data_collator=datacollator, compute_metrics = lambda pred: compute_metrics(pred, val_dataset, df_val), preprocess_logits_for_metrics = preprocess_logits_for_metrics)

In [None]:
# wandb token can be given here. 
#import wandb
#wandb.login(key = " {{token here}}")


In [None]:
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="{project name}")

In [None]:
trainer.train()

In [21]:
trainer.save_model()

In [None]:
test_dataset = TestQADataset(df_test,tokenizer)

In [23]:
temp_output = []
for i, sample_input in enumerate(test_dataset):
    if(len(sample_input[0]) == 119 and sample_input[0][119] != torch.Tensor(np.array([50258]))):
        sample_input[0] = torch.cat((sample_input[0][np.r_[:119]],torch.Tensor(np.array([50258]))),0)
    #sample_output = model.generate(sample_input.cuda(), min_new_tokens = 60, max_new_tokens = 200, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    sample_output = model.generate(sample_input.cuda(), min_new_tokens = 60, max_new_tokens = 200, penalty_alpha=0.6, top_k=4, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    temp_output.append(tokenizer.decode(sample_output[0]))
final_output =[]
for j in range(len(temp_output)):
    lst = temp_output[j].split('<|sep|>')
    if (len(lst) >= 2):
        temp_decoded = lst[1].replace("<|endoftext|>","")
        final_output.append(temp_decoded.split('___')[0])
    else:
        final_output.append("")

In [24]:
df_test["Generated"] = final_output
df_test.to_csv('/kaggle/working/GeneratedAnswer.csv')

In [25]:
pred = df_test['Generated']
ref = [df_test['answer1'], df_test['answer2']]

In [26]:
def score_evaluate(predictions, references):
    preds = []
    refs = []
    bleurt_preds = []
    bleurt_refs = []
    for i in range(len(predictions)):
        preds.append(predictions[i])
        bleurt_preds.append(predictions[i])        
        bleurt_preds.append(predictions[i])
        temp_refs = []
        for j in range(len(references)):
            temp_refs.append(references[j][i])
            bleurt_refs.append(references[j][i])
        refs.append(temp_refs)
    
    results_bleurt = bleurt.compute(predictions=bleurt_preds, references=bleurt_refs)

    results_bert = bert.compute(predictions=preds, references=refs, lang="en")
    
    precision_sum = 0
    recall_sum = 0
    f1_sum = 0
    bleurt_sum = 0
    length_bert = len(results_bert['precision'])
    for i in range(length_bert):
        precision_sum += results_bert['precision'][i]
        recall_sum += results_bert['recall'][i]
        f1_sum += results_bert['f1'][i]
        bleurt_sum += results_bleurt['scores'][i]
    
    print("Bert precision Score : " + str(precision_sum/length_bert))
    print("Bert recall Score : " + str(recall_sum/length_bert))
    print("Bert f1 Score : " + str(f1_sum/length_bert))
    print("Bleurt Score : " + str(bleurt_sum/length_bert))

In [27]:
score_evaluate(pred,ref)

Bert precision Score : 0.8952824686964352
Bert recall Score : 0.8968712404370308
Bert f1 Score : 0.8953523640831311
Bleurt Score : -0.27816072137405473
