In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-data/new_dataset/Dataset.csv


In [2]:
pip install language_tool_python

Collecting language_tool_python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language_tool_python
Successfully installed language_tool_python-2.7.1
[0mNote: you may need to restart the kernel to use updated packages.


# Loading Required Libraries

In [3]:
import torch
from torch.utils.data import DataLoader,Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import spacy
from sklearn.model_selection import train_test_split
import torch.nn as nn
import language_tool_python
from tqdm import tqdm



# Reading Data Into Pandas Dataframe

In [4]:
df = pd.read_csv("/kaggle/input/nlp-data/new_dataset/Dataset.csv")
df.rename(columns = {"0":"transcript","1":"summary"},inplace = True)

In [5]:
df.head(5)

Unnamed: 0,transcript,summary
0,"My name is Eric, diligent from shifts in Moria...","Eric, a recent graduate from Stanford Univers..."
1,Thank you very much.I hope you understand me.I...,This text discusses the difficult task of link...
2,Good afternoon everyone.My name is Yoshimi Cla...,"Yoshimi Clara, the Secretary General of Jay MC..."
3,"OK, so this is a tutorial tutorial on the new ...","This tutorial is about Velebit 5.0, an open so..."
4,"Everybody, so my name is Vicki and.For the nex...",This work looks at the issue of duplicate inst...


# Splitting Data into Train, Test , Validation

In [6]:
train_df,test_df = train_test_split(df,test_size=0.2)

In [7]:
train,val = train_test_split(train_df,test_size = 0.2)

# Converted the Pandas Dataframe into Dictionary

In [8]:
train_data = train.to_dict('records')
valid_data = val.to_dict('records')

In [9]:
len(train_data),len(valid_data)

(1276, 319)

# Loading the T5 Model

In [10]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# To convert Train & Validation Dataset into Pytorch Dataset

In [11]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        item = self.data[index]
        input_text = item['transcript']
        target_text = item['summary']
        return {'transcript': input_text, 'summary': target_text}

# To Convert the Tokenized Data into Pytorch Dataset

In [12]:
class CustomDataset1(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        item = self.data[index]
        input_id = item['input_ids']
        input_mask = item['attention_mask']
        target_id = item['target_ids']
        return {'input_ids': input_id, 'attention_mask': input_mask , 'labels': target_id}

# Function to Tokenize the Train & Valid Dataset

In [13]:
def preprocess_function(data):
    input_text = data['transcript']
    target_text = data['summary']
    # Tokenize the input and target text
    input_tokens = tokenizer.encode_plus(
        input_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    target_tokens = tokenizer.encode_plus(
        target_text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return {
        'input_ids': input_tokens['input_ids'].squeeze(),
        'attention_mask': input_tokens['attention_mask'].squeeze(),
        'target_ids': target_tokens['input_ids'].squeeze(),
    }


# Grammaticality Loss Function

In [14]:
tool = language_tool_python.LanguageTool('en-US')
def grammaticality_loss_function(output_logits):
    output_sentences = tokenizer.batch_decode(torch.argmax(output_logits, dim=-1), skip_special_tokens=True)
    loss = 0.0
    cnt = 0
    for output in output_sentences:
        matches = tool.check(output)
        num_errors = len(matches)
        cnt += len(output.split())
        loss += num_errors
    loss /= cnt    
    return loss

# Customized Trainer which includes compute loss function

In [15]:
class CustomTrainer(Trainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset, 
            batch_size=self.args.train_batch_size, 
            collate_fn=self.data_collator, 
            shuffle=True
        )

    def get_eval_dataloader(self,eval_dataset):
        return DataLoader(
            self.eval_dataset, 
            batch_size=self.args.eval_batch_size, 
            collate_fn=self.data_collator
        )

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs['labels']
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=inputs['labels'])
        logits = outputs.logits
        
        # compute custom loss
        grammaticality_loss = grammaticality_loss_function(logits.view(-1, self.model.config.vocab_size))
        
        # compute cross entropy loss
        ce_loss_fct = nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        ce_loss = ce_loss_fct(logits.view(-1, model.module.config.vocab_size), labels.view(-1))
        
        # combine losses
        total_loss = grammaticality_loss*0.1 + ce_loss
        
        return (total_loss, outputs) if return_outputs else total_loss


# Converting the Train Dataset into Pytorch Dataset and then Tokenizing it

In [16]:
# Preprocess the data
train_dataset = CustomDataset(train_data)
train_dict_list = list(train_dataset)
train_dict_list = [preprocess_function(example) for example in train_dict_list]
train_dict_list = [{'input_ids': torch.tensor(example['input_ids']),
                    'attention_mask': torch.tensor(example['attention_mask']),
                    'target_ids': torch.tensor(example['target_ids'])} for example in train_dict_list]
train_dataset = CustomDataset1(train_dict_list)

  train_dict_list = [{'input_ids': torch.tensor(example['input_ids']),
  'attention_mask': torch.tensor(example['attention_mask']),
  'target_ids': torch.tensor(example['target_ids'])} for example in train_dict_list]


# Converting the Validation Dataset into Pytorch Dataset and then Tokenizing it

In [17]:
# Preprocess the data
valid_dataset = CustomDataset(valid_data)
valid_dict_list = list(valid_dataset)
valid_dict_list = [preprocess_function(example) for example in valid_dict_list]
valid_dict_list = [{'input_ids': torch.tensor(example['input_ids']),
                    'attention_mask': torch.tensor(example['attention_mask']),
                    'target_ids': torch.tensor(example['target_ids'])} for example in valid_dict_list]
valid_dataset = CustomDataset1(valid_dict_list)

  valid_dict_list = [{'input_ids': torch.tensor(example['input_ids']),
  'attention_mask': torch.tensor(example['attention_mask']),
  'target_ids': torch.tensor(example['target_ids'])} for example in valid_dict_list]


# Defining Training Arguments

In [18]:
training_args = TrainingArguments(
    output_dir='model_save',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy="epoch",
    save_total_limit=5,
    learning_rate=1e-4,
    lr_scheduler_type='linear', 
    warmup_steps=0,
    dataloader_num_workers=4,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

In [19]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [20]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset = valid_dataset,
    data_collator=data_collator,
    tokenizer = tokenizer,
)

In [21]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
trainer.model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

# Training the Model

In [22]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
0,2.3983,2.07503
2,2.0524,2.043393
2,1.9491,2.035646




TrainOutput(global_step=477, training_loss=2.1334996253439464, metrics={'train_runtime': 940.1929, 'train_samples_per_second': 4.072, 'train_steps_per_second': 0.507, 'total_flos': 2323783310376960.0, 'train_loss': 2.1334996253439464, 'epoch': 2.99})

# Saving the best Model depending on Evaluation Loss

In [23]:
# Save the best model
trainer.save_model(training_args.output_dir)

# Loading the Trained Model

In [24]:
# load the tokenizer
trained_tokenizer = T5Tokenizer.from_pretrained('./model_save')

# load the model
trained_model = T5ForConditionalGeneration.from_pretrained('./model_save', 
                                                  state_dict=torch.load('./model_save/pytorch_model.bin'))

In [25]:
trained_model = trained_model.to(device)

# Function To Generate Summaries

In [26]:
def generate_summary(model,tokenizer,input_text):
    # Tokenize the input text
    input_ids = tokenizer.encode(
        input_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = input_ids.to(device)

    summary_ids = model.generate(input_ids, num_beams=4, max_length=128, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

# Generating Summaries for Test Data using Trained Model

In [27]:
# Generate the model's summaries for the test data
reference_summaries = []
generated_summaries = []
for i in tqdm(range(test_df.shape[0])):
    generated_summary = generate_summary(trained_model, trained_tokenizer,test_df.iloc[i]["transcript"])
    generated_summaries.append(generated_summary)
    reference_summaries.append(test_df.iloc[i]["summary"])

100%|██████████| 399/399 [15:34<00:00,  2.34s/it]


# Example 1 :-

# Reference Summary

In [40]:
reference_summaries[123]

'The speaker is not a fan of summer and prefers autumn. They were excited to see people doing autumn art work on Instagram and their art usually has earth tones. They were looking for a sword image to get an idea of what a handle would look like and then copy and paste it. They also mentioned how they had accidentally locked their keyboard by holding down shift too long and had to figure out how to fix it.'

# Generated Summary

In [28]:
generated_summaries[123]

'This text is about a person who is not a big fan of summer and is excited to start doing fall art work this year. They are looking up images of a sword with a handguard and are trying to get an idea of what a handle would look like. They also mention that they accidentally locked their keyboard by holding down shift for 8 or 10 seconds and it took them so long to figure out how to fix it.'

# Example 2 :-

# Reference Summary

In [41]:
reference_summaries[0]

"This text is about creating a pet character in Illustrator. The author wants to use the Unite tool to create a simple shape, delete the pieces they don't need, and use a clipping mask to put the shape inside the body. Then they will create legs using a rectangle and the Mirror Me tool. Finally, they will draw an ellipse and use the pen tool to create a droplet."

# Generated Summary

In [38]:
generated_summaries[0]

"This text is about creating shapes for a project. The author is using the Unite tool to cut out half of the circle and create the legs for the pet and fire drops. They are using the mirror me tool to create the legs and fire drops. They are also using the Unite tool to delete the pieces they don't need."

In [32]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[0mNote: you may need to restart the kernel to use updated packages.


# Calculating Rouge Score for Test Data

In [33]:
from rouge import Rouge

rouge = Rouge()

# Calculate ROUGE scores
scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)

print(scores)

{'rouge-1': {'r': 0.3505143293478747, 'p': 0.4519901850493025, 'f': 0.3885615926694993}, 'rouge-2': {'r': 0.13400064123920188, 'p': 0.1783929203554601, 'f': 0.15038960843226018}, 'rouge-l': {'r': 0.3232046444216759, 'p': 0.41772087908719113, 'f': 0.3586954440681843}}
