In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# (The file-listing loop this comment used to introduce is not present in this cell; only the `os` import below remains.)

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train / validation / test splits as DataFrames.
# NOTE: the three paths below are unfilled placeholders; replace them with real CSV paths before running.
# Later cells read the "Email" and "Subject" columns from every frame, and additionally
# "Ann0", "Ann1" and "Ann2" (extra reference subjects) from df_val and df_test.
df = pd.read_csv("{path to train file}")
df_val = pd.read_csv("{path to val file}")
df_test = pd.read_csv("{path to test file}")

In [None]:
!pip install -U transformers[torch]
!pip install -U huggingface_hub
! pip install evaluate
!pip install sacrebleu
!pip install rouge_score
!pip install -U nltk
#!pip install bitsandbytes

In [None]:
import torch
from torch.utils.data import Dataset, random_split, DataLoader
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel, DataCollatorForLanguageModeling #Seq2SeqTrainingArguments, Seq2SeqTrainer
import accelerate
import evaluate
#import bitsandbytes
from torch import nn
from transformers.trainer_pt_utils import get_parameter_names
import re

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
# Load the three text-generation metrics from the `evaluate` library.
# They are used as notebook-level globals by compute_metrics (validation) and score_evaluate (test).
sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

In [None]:
# Seed PyTorch's random number generator with a fixed value.
# NOTE(review): only torch is seeded in the cells shown; numpy / Python `random` are not.
torch.manual_seed(42)

In [None]:
# Load the distilgpt2 tokenizer and declare the special tokens used to frame every sample:
#   <|startoftext|> email <|sep|> subject <|endoftext|>   (+ <|pad|> for padding).
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>', sep_token='<|sep|>')
# Load the pretrained model onto the GPU, then resize its token embeddings to the tokenizer's
# current vocabulary size so that any special tokens added above get an embedding row.
model = GPT2LMHeadModel.from_pretrained('distilgpt2').cuda()
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Batch collator handed to the Trainer; mlm=False turns off masked-language-model masking.
datacollator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False)

In [None]:
class EmailSubjectDataset(Dataset):
    """Training samples framed as ``<|startoftext|> email <|sep|> subject <|endoftext|>``.

    Every row of ``data`` is tokenized once in the constructor and kept in memory.

    Args:
        data: DataFrame with string columns "Email" and "Subject".
        tokenizer: Hugging Face tokenizer; it is called with a (prompt, target)
            text pair and ``return_tensors='pt'``.
        max_length: Token length every sample is truncated / padded to.

    Indexing returns the ``input_ids`` tensor of one sample exactly as produced
    by the tokenizer (a single-row 2-D tensor when ``return_tensors='pt'``).
    Attention masks are collected in ``attn_masks`` but are not returned by
    ``__getitem__``.
    """

    def __init__(self, data, tokenizer, max_length=250):
        self.input_ids = []
        self.attn_masks = []
        # Never populated; kept so the public attributes stay unchanged.
        self.labels = []
        for _, row in data.iterrows():
            # Encode prompt and target as a pair and truncate ONLY the prompt.
            # Truncating the concatenated string instead would cut the
            # '<|sep|>' + subject + '<|endoftext|>' tail off long emails, leaving
            # training samples with nothing to learn the subject from.
            # NOTE(review): if the target alone exceeds max_length the tokenizer
            # cannot honour the limit — confirm this cannot happen for this data.
            encodings_dict = tokenizer(
                '<|startoftext|>' + row["Email"],
                '<|sep|>' + row["Subject"] + '<|endoftext|>',
                truncation="only_first",
                max_length=max_length,
                padding="max_length",
                return_tensors='pt',
            )
            # The tokenizer already returns tensors; re-wrapping them in
            # torch.tensor() only copies them and emits a warning per row.
            self.input_ids.append(encodings_dict['input_ids'])
            self.attn_masks.append(encodings_dict['attention_mask'])

    def __len__(self):
        """Number of tokenized samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``input_ids`` tensor of sample ``idx``."""
        return self.input_ids[idx]

In [None]:
class ValEmailSubjectDataset(Dataset):
    """Validation prompts framed as ``<|startoftext|> email <|sep|>``, padded to a fixed length.

    Args:
        data: DataFrame with a string column "Email".
        tokenizer: Hugging Face tokenizer exposing ``pad_token_id`` and
            ``sep_token_id``; it is called with ``return_tensors='pt'``.
        max_length: Token length every prompt is truncated / padded to.

    Indexing returns the ``input_ids`` tensor of one prompt exactly as produced
    by the tokenizer (a single-row 2-D tensor when ``return_tensors='pt'``).
    Attention masks are collected in ``attn_masks`` but are not returned by
    ``__getitem__``.
    """

    def __init__(self, data, tokenizer, max_length=250):
        self.input_ids = []
        self.attn_masks = []
        # Never populated; kept so the public attributes stay unchanged.
        self.labels = []
        # Take the ids from the tokenizer instead of hard-coding 50259 / 50258.
        pad_id = tokenizer.pad_token_id
        sep_id = tokenizer.sep_token_id
        for _, row in data.iterrows():
            encodings_dict = tokenizer(
                '<|startoftext|>' + row["Email"] + '<|sep|>',
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_tensors='pt',
            )
            ids = encodings_dict['input_ids']
            last_id = ids[0, -1].item()
            # A final token that is neither padding nor the separator means the
            # prompt was truncated and lost its trailing <|sep|>; overwrite the
            # last position so the prompt still ends with the separator.
            if last_id != pad_id and last_id != sep_id:
                ids[0, -1] = sep_id
            # The tokenizer already returns tensors; no torch.tensor() re-wrap needed.
            self.input_ids.append(ids)
            self.attn_masks.append(encodings_dict['attention_mask'])

    def __len__(self):
        """Number of tokenized prompts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``input_ids`` tensor of prompt ``idx``."""
        return self.input_ids[idx]

In [None]:
class TestEmailSubjectDataset(Dataset):
    """Test prompts framed as ``<|startoftext|> email <|sep|>``, truncated but NOT padded.

    Args:
        data: DataFrame with a string column "Email".
        tokenizer: Hugging Face tokenizer; it is called with ``return_tensors='pt'``.
        max_length: Maximum token length of a prompt (longer prompts are truncated).

    Indexing returns the ``input_ids`` tensor of one prompt exactly as produced
    by the tokenizer (a single-row 2-D tensor when ``return_tensors='pt'``).
    A truncated prompt is stored as-is; this class does not re-append the
    separator. Attention masks are collected in ``attn_masks`` but are not
    returned by ``__getitem__``.
    """

    def __init__(self, data, tokenizer, max_length=250):
        self.input_ids = []
        self.attn_masks = []
        # Never populated; kept so the public attributes stay unchanged.
        self.labels = []
        for _, row in data.iterrows():
            encodings_dict = tokenizer(
                '<|startoftext|>' + row["Email"] + '<|sep|>',
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            )
            # The tokenizer already returns tensors; re-wrapping them in
            # torch.tensor() only copies them and emits a warning per row.
            self.input_ids.append(encodings_dict['input_ids'])
            self.attn_masks.append(encodings_dict['attention_mask'])

    def __len__(self):
        """Number of tokenized prompts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``input_ids`` tensor of prompt ``idx``."""
        return self.input_ids[idx]

In [None]:
# Tokenize the training rows (email + subject) and the validation rows (email-only prompts) up front.
train_dataset = EmailSubjectDataset(df, tokenizer)
val_dataset = ValEmailSubjectDataset(df_val,tokenizer)

In [None]:
# Hyper-parameters for the Hugging Face Trainer.
# Evaluation and checkpointing both run once per epoch, and load_best_model_at_end together with
# metric_for_best_model asks the Trainer to restore the checkpoint with the best "rougeL"
# (one of the keys returned by compute_metrics) when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy` —
# confirm against the installed version, since the install cell upgrades transformers.
training_args = TrainingArguments(
    output_dir="/kaggle/working/output",
    overwrite_output_dir=True,
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    #eval_steps = 2,
    eval_steps = 400,  # NOTE(review): presumably unused while both strategies below are "epoch" — confirm
    save_steps=800,    # NOTE(review): same remark as eval_steps
    warmup_steps=500,
    load_best_model_at_end=True,
    #evaluation_strategy="steps",
    #save_strategy = "steps",
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    weight_decay=0.01,
    metric_for_best_model = "rougeL",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True
    )

# Split the parameters into a weight-decayed group and a non-decayed group.
# get_parameter_names(model, [nn.LayerNorm]) is expected to list the parameters that do not
# belong to LayerNorm modules; bias terms are then excluded by name as well.
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        "weight_decay": training_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]

# Adam settings mirrored from the training arguments.
optimizer_kwargs = {
    "betas": (training_args.adam_beta1, training_args.adam_beta2),
    "eps": training_args.adam_epsilon,
}
optimizer_kwargs["lr"] = training_args.learning_rate

# Optional 8-bit Adam. The bitsandbytes install and import are commented out elsewhere in this
# notebook, so referencing the module unconditionally raised NameError on a fresh kernel.
# Import it here and fall back to None when it is not installed; the Trainer built later does not
# receive this optimizer anyway (its `optimizers=` argument is commented out).
try:
    import bitsandbytes
except ImportError:
    bitsandbytes = None

if bitsandbytes is not None:
    adam_bnb_optim = bitsandbytes.optim.Adam8bit(optimizer_grouped_parameters, **optimizer_kwargs)
else:
    adam_bnb_optim = None

In [None]:
def compute_metrics(eval_pred, eval_dataset, df):
    """Generate a subject for every evaluation prompt and score it against the references.

    Args:
        eval_pred: Prediction object handed over by the Trainer. It is not used;
            subjects are re-generated with ``model.generate`` instead.
        eval_dataset: Iterable of prompt tensors; ``item[0]`` must be a 1-D row of
            token ids (padding is stripped before generation).
        df: DataFrame aligned row by row with ``eval_dataset``; its "Subject",
            "Ann0", "Ann1" and "Ann2" columns supply the reference subjects.

    Returns:
        dict with the keys 'bleu', 'rouge1', 'rouge2', 'rougeL' and 'meteor'.

    Relies on the notebook-level ``model``, ``tokenizer``, ``sacrebleu``,
    ``rouge`` and ``meteor`` objects.
    """
    # Ask the tokenizer for the padding id instead of hard-coding 50259.
    pad_id = tokenizer.pad_token_id
    # Plain lists give positional access, so a non-default DataFrame index
    # cannot misalign (or break) the pairing of predictions and references.
    references = [list(df[column]) for column in ('Subject', 'Ann0', 'Ann1', 'Ann2')]

    decoded_preds = []
    for sample_input in eval_dataset:
        prompt = sample_input[0]
        if pad_id is not None:
            prompt = prompt[prompt != pad_id]
        prompt = prompt[None, :]  # restore the batch dimension expected by generate()
        metric_outputs = model.generate(prompt.cuda(), min_new_tokens = 4, max_new_tokens = 12, num_beams=5, early_stopping=True, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
        decoded_preds.append(tokenizer.decode(metric_outputs[0]))

    final_preds = []
    refs = []
    for j, decoded in enumerate(decoded_preds):
        parts = decoded.split('<|sep|>')
        if len(parts) >= 2:
            final_preds.append(parts[1].replace("<|endoftext|>", ""))
        else:
            # Keep exactly one prediction per sample. Skipping it (as before) left
            # final_preds shorter than refs and shifted every later prediction
            # onto the wrong references.
            final_preds.append("")
        refs.append([column_values[j] for column_values in references])

    results_sacrebleu = sacrebleu.compute(predictions=final_preds, references=refs, lowercase = True)

    results_rouge = rouge.compute(predictions=final_preds, references=refs)

    results_meteor = meteor.compute(predictions=final_preds, references=refs)

    return {'bleu': results_sacrebleu['score'], 'rouge1' : results_rouge['rouge1'], 'rouge2' : results_rouge['rouge2'], 'rougeL' : results_rouge['rougeL'], 'meteor' : results_meteor['meteor']}

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Reduce the logits to their arg-max token ids before the Trainer stores them.

    The stock Trainer may leak memory; keeping only the predicted ids is a
    workaround that avoids holding on to many tensors that are never needed.

    Returns a ``(predicted_ids, labels)`` tuple; ``labels`` is passed through untouched.
    """
    return logits.argmax(dim=-1), labels

In [None]:
# Assemble the Trainer from the model, arguments, datasets and collator defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=datacollator,
    # The Trainer passes a single prediction object; the validation prompts and
    # their reference DataFrame are bound here.
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, val_dataset, df_val),
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    #optimizers=(adam_bnb_optim, None),
)

In [None]:
# import wandb
# wandb.login(key = "{your token here}")

In [None]:
# Run fine-tuning with the configuration assembled above.
trainer.train()

In [None]:
# Persist the trained model (no path is given, so the Trainer chooses the location —
# presumably training_args.output_dir; confirm).
trainer.save_model()

In [None]:
#trainedmodel = GPT2LMHeadModel.from_pretrained("/kaggle/input/trainedmodel").cuda()
#trainedmodel.resize_token_embeddings(len(tokenizer))

In [None]:
# Tokenize the test emails into unpadded prompts and show how many there are.
test_dataset = TestEmailSubjectDataset(df_test,tokenizer)
len(test_dataset)

In [None]:
# Generate a subject line for every test prompt, then cut the generated subject out of the decoded text.
sep_id = tokenizer.sep_token_id  # taken from the tokenizer instead of hard-coding 50258
max_prompt_len = 250             # must match the max_length used when test_dataset was built

temp_output = []
for sample_input in test_dataset:
    # A prompt that fills the whole window without ending in <|sep|> was truncated and lost its
    # separator; overwrite the last token so generation still starts right after a separator.
    # This is an in-place integer write (no float tensors, no torch.cat).
    if sample_input.shape[-1] == max_prompt_len and sample_input[0, -1].item() != sep_id:
        sample_input[0, -1] = sep_id
    sample_output = model.generate(sample_input.cuda(), min_new_tokens = 4, max_new_tokens = 12, num_beams=5, early_stopping=True, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    temp_output.append(tokenizer.decode(sample_output[0]))

final_output = []
for decoded in temp_output:
    parts = decoded.split('<|sep|>')
    if len(parts) >= 2:
        # Everything after the first separator is the generated subject.
        final_output.append(parts[1].replace("<|endoftext|>", ""))
    else:
        # Keep one entry per test row so final_output lines up with df_test.
        final_output.append("")

In [None]:
# Attach the generated subjects to the test frame (adds a "Generated" column to df_test in place)
# and write the result to the Kaggle working directory.
df_test["Generated"] = final_output
df_test.to_csv('/kaggle/working/Generated.csv')

In [None]:
#df_test = pd.read_csv("/kaggle/input/generatedoutput/Generated.csv")

In [None]:
# Predictions and the four reference columns, arranged the way score_evaluate expects:
# `ref` is a list of columns, so ref[j][i] is the j-th reference for the i-th test row.
pred = df_test['Generated']
ref = [df_test['Subject'], df_test['Ann0'], df_test['Ann1'], df_test['Ann2']]

In [None]:
def score_evaluate(predictions, references):
    """Print BLEU, ROUGE-1/2/L and METEOR for ``predictions`` against multiple references.

    Args:
        predictions: Sequence (list or pandas Series) with one generated subject per sample.
        references: List of sequences, one per reference source; ``references[j]``
            holds the j-th reference for every sample, in the same order as
            ``predictions``.

    Returns:
        None. The scores are printed.

    Relies on the notebook-level ``sacrebleu``, ``rouge`` and ``meteor`` metric objects.
    """
    # Materialize everything as plain lists so the pairing below is positional.
    # Indexing a pandas Series with `series[i]` is label-based and misaligns
    # (or raises) as soon as the frame carries a non-default index.
    preds = list(predictions)
    reference_lists = [list(reference) for reference in references]
    refs = [[reference[i] for reference in reference_lists] for i in range(len(preds))]

    results_sacrebleu = sacrebleu.compute(predictions=preds, references=refs, lowercase = True)
    print("Bleu Score : " + str(results_sacrebleu['score']))

    results_rouge = rouge.compute(predictions=preds, references=refs)
    print("Rouge1 Score : " + str(results_rouge['rouge1']))
    print("Rouge2 Score : " + str(results_rouge['rouge2']))
    print("RougeL Score : " + str(results_rouge['rougeL']))

    results_meteor = meteor.compute(predictions=preds, references=refs)
    print("Meteor Score : " + str(results_meteor['meteor']))

In [None]:
# Print the test-set scores for the generated subjects.
score_evaluate(pred,ref)