In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-final-data/new_dataset/Dataset.csv


In [2]:
pip install language_tool_python

Collecting language_tool_python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language_tool_python
Successfully installed language_tool_python-2.7.1
Note: you may need to restart the kernel to use updated packages.


# Loading Required Libraries

In [3]:
import torch
from torch.utils.data import DataLoader,Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import spacy
from sklearn.model_selection import train_test_split
import torch.nn as nn
import language_tool_python
from tqdm import tqdm

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Reading Data Into Pandas Dataframe

In [4]:
df = pd.read_csv("/kaggle/input/nlp-final-data/new_dataset/Dataset.csv")
df.rename(columns = {"0":"transcript","1":"summary"},inplace = True)

In [5]:
df.head(5)

Unnamed: 0,transcript,summary
0,"My name is Eric, diligent from shifts in Moria...","Eric, a recent graduate from Stanford Univers..."
1,Thank you very much.I hope you understand me.I...,This text discusses the difficult task of link...
2,Good afternoon everyone.My name is Yoshimi Cla...,"Yoshimi Clara, the Secretary General of Jay MC..."
3,"OK, so this is a tutorial tutorial on the new ...","This tutorial is about Velebit 5.0, an open so..."
4,"Everybody, so my name is Vicki and.For the nex...",This work looks at the issue of duplicate inst...


# Splitting Data into Train, Test , Validation

In [6]:
train_df,test_df = train_test_split(df,test_size=0.2)

In [7]:
train,val = train_test_split(train_df,test_size = 0.2)

# Converted the Pandas Dataframe into Dictionary

In [8]:
train_data = train.to_dict('records')
valid_data = val.to_dict('records')

In [9]:
len(train_data),len(valid_data)

(1276, 319)

# Loading the T5 Model

In [10]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
# Freeze all layers except the last 2 layers
for param in model.parameters():
    param.requires_grad = False
for param in model.encoder.block[-2:].parameters():
    param.requires_grad = True
for param in model.decoder.block[-2:].parameters():
    param.requires_grad = True

# To convert Train & Validation Dataset into Pytorch Dataset

In [12]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        item = self.data[index]
        input_text = item['transcript']
        target_text = item['summary']
        return {'transcript': input_text, 'summary': target_text}

# To Convert the Tokenized Data into Pytorch Dataset

In [13]:
class CustomDataset1(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        item = self.data[index]
        input_id = item['input_ids']
        input_mask = item['attention_mask']
        target_id = item['target_ids']
        return {'input_ids': input_id, 'attention_mask': input_mask , 'labels': target_id}

# Function to Tokenize the Train & Valid Dataset

In [14]:
def preprocess_function(data):
    input_text = data['transcript']
    target_text = data['summary']
    # Tokenize the input and target text
    input_tokens = tokenizer.encode_plus(
        input_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    target_tokens = tokenizer.encode_plus(
        target_text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return {
        'input_ids': input_tokens['input_ids'].squeeze(),
        'attention_mask': input_tokens['attention_mask'].squeeze(),
        'target_ids': target_tokens['input_ids'].squeeze(),
    }


# Grammaticality Loss Function

In [15]:
tool = language_tool_python.LanguageTool('en-US')
def grammaticality_loss_function(output_logits):
    output_sentences = tokenizer.batch_decode(torch.argmax(output_logits, dim=-1), skip_special_tokens=True)
    loss = 0.0
    cnt = 0
    for output in output_sentences:
        matches = tool.check(output)
        num_errors = len(matches)
        cnt += len(output.split())
        loss += num_errors
    loss /= cnt    
    return loss

# Customized Trainer which includes compute loss function

In [16]:
class CustomTrainer(Trainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset, 
            batch_size=self.args.train_batch_size, 
            collate_fn=self.data_collator, 
            shuffle=True
        )

    def get_eval_dataloader(self,eval_dataset):
        return DataLoader(
            self.eval_dataset, 
            batch_size=self.args.eval_batch_size, 
            collate_fn=self.data_collator
        )

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs['labels']
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=inputs['labels'])
        logits = outputs.logits
        
        # compute custom loss
        grammaticality_loss = 0
        grammaticality_loss = grammaticality_loss_function(logits.view(-1, self.model.config.vocab_size))
        
        # compute cross entropy loss
        ce_loss_fct = nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        ce_loss = ce_loss_fct(logits.view(-1, model.module.config.vocab_size), labels.view(-1))
        
        # combine losses
        total_loss = grammaticality_loss*0.1 + ce_loss
        
        return (total_loss, outputs) if return_outputs else total_loss


# Converting the Train Dataset into Pytorch Dataset and then Tokenizing it

In [17]:
# Preprocess the data
train_dataset = CustomDataset(train_data)
train_dict_list = list(train_dataset)
train_dict_list = [preprocess_function(example) for example in train_dict_list]
train_dict_list = [{'input_ids': torch.tensor(example['input_ids']),
                    'attention_mask': torch.tensor(example['attention_mask']),
                    'target_ids': torch.tensor(example['target_ids'])} for example in train_dict_list]
train_dataset = CustomDataset1(train_dict_list)

  train_dict_list = [{'input_ids': torch.tensor(example['input_ids']),
  'attention_mask': torch.tensor(example['attention_mask']),
  'target_ids': torch.tensor(example['target_ids'])} for example in train_dict_list]


# Converting the Validation Dataset into Pytorch Dataset and then Tokenizing it

In [18]:
# Preprocess the data
valid_dataset = CustomDataset(valid_data)
valid_dict_list = list(valid_dataset)
valid_dict_list = [preprocess_function(example) for example in valid_dict_list]
valid_dict_list = [{'input_ids': torch.tensor(example['input_ids']),
                    'attention_mask': torch.tensor(example['attention_mask']),
                    'target_ids': torch.tensor(example['target_ids'])} for example in valid_dict_list]
valid_dataset = CustomDataset1(valid_dict_list)

  valid_dict_list = [{'input_ids': torch.tensor(example['input_ids']),
  'attention_mask': torch.tensor(example['attention_mask']),
  'target_ids': torch.tensor(example['target_ids'])} for example in valid_dict_list]


# Defining Training Arguments

In [19]:
training_args = TrainingArguments(
    output_dir='model_save',
    num_train_epochs=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy="epoch",
    save_total_limit=5,
    learning_rate=1e-4,
    lr_scheduler_type='linear', 
    warmup_steps=0,
    dataloader_num_workers=4,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

In [20]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [21]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset = valid_dataset,
    data_collator=data_collator,
    tokenizer = tokenizer,
)

In [22]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
trainer.model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

# Training the Model

In [23]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
0,3.0217,2.38815
2,2.4727,2.247518
2,2.3914,2.213656
4,2.3212,2.19371
4,2.2959,2.171991
6,2.2416,2.138334
6,2.21,2.128771
8,2.1686,2.12859
8,2.1783,2.124144
9,2.1474,2.126884




TrainOutput(global_step=1590, training_loss=2.3452552459524862, metrics={'train_runtime': 2468.2715, 'train_samples_per_second': 5.17, 'train_steps_per_second': 0.644, 'total_flos': 7745944367923200.0, 'train_loss': 2.3452552459524862, 'epoch': 9.97})

# Saving the best Model depending on Evaluation Loss

In [24]:
# Save the best model
trainer.save_model(training_args.output_dir)

# Loading the Trained Model

In [25]:
# load the tokenizer
trained_tokenizer = T5Tokenizer.from_pretrained('./model_save')

# load the model
trained_model = T5ForConditionalGeneration.from_pretrained('./model_save', 
                                                  state_dict=torch.load('./model_save/pytorch_model.bin'))

In [26]:
trained_model = trained_model.to(device)

# Function To Generate Summaries

In [27]:
def generate_summary(model,tokenizer,input_text):
    # Tokenize the input text
    input_ids = tokenizer.encode(
        input_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = input_ids.to(device)

    summary_ids = model.generate(input_ids, num_beams=4, max_length=128, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

# Generating Summaries for Test Data using Trained Model

In [28]:
# Generate the model's summaries for the test data
reference_summaries = []
generated_summaries = []
for i in tqdm(range(test_df.shape[0])):
    generated_summary = generate_summary(trained_model, trained_tokenizer,test_df.iloc[i]["transcript"])
    generated_summaries.append(generated_summary)
    reference_summaries.append(test_df.iloc[i]["summary"])

100%|██████████| 399/399 [13:36<00:00,  2.05s/it]


# Example 1 :-

# Reference Summary

In [40]:
reference_summaries[10]

'This presentation is about a project called Euclid, in which a consortium of four partners will develop and implement a curriculum to train practitioners in using linked data. The consortium consists of two small-medium enterprises (SMEs) and two academic partners. One of the SMEs, OnToText, is a company that develops repository and RDF store which is graph database used in a number of industry projects. The other SME, See I Research, works closely with the Semantic Technology Institute Association, which brings together 40 institutions worldwide. The academic partners are KIT, an academic institution with a record in the semantic web area and the Open University, the largest online distance learning university. The project seeks to develop living learning materials and an ebook that will be released on iTunes U and other channels. The materials will cover fundamentals of linked data and topics not currently covered by any training curriculum. The project began in May and will go on f

# Generated Summary

In [41]:
generated_summaries[10]

'The consortium will develop and implement a curriculum to train practitioners in using linked data and semantic technologies. The consortium consists of two SMS and two academic partners. The consortium consists of two companies based in Vienna that work closely together with an Association called SGI International which organizes a wide variety of events. The consortium will provide the knowledge and background required in order to develop curriculum training materials that are useful for large audiences. The consortium will also include the Open University, which has over 2000 students and 50,000 students. The consortium will also provide the knowledge and background of the curriculum.'

# Example 2 :-

# Reference Summary

In [45]:
reference_summaries[6]

'The man is creating a mock up T-shirt on Photoshop. He is using displacement map and image mode grayscale to blur the image and then creating a path around the object. He is also adding adjustment layers to tone down the details of the shirt. He suggests to select a little bit beyond the shirt outlines to avoid having white lines of unused part of the shirt.'

# Generated Summary

In [46]:
generated_summaries[6]

'This text is about creating a displacement map with grayscale filter blur and Gaussian blur. The text is about creating a path around the object to create a mockup of the shirt. The text also mentions how to select a little beyond the shirt outlines to create a white line. The text also mentions how to tone down the details using different adjustment layers.'

In [33]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


# Calculating Rouge Score for Test Data

In [34]:
from rouge import Rouge

rouge = Rouge()

# Calculate ROUGE scores
scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)

print(scores)

{'rouge-1': {'r': 0.3082220133357383, 'p': 0.48033190241680696, 'f': 0.3655593353838794}, 'rouge-2': {'r': 0.11069361702882946, 'p': 0.17909887451621012, 'f': 0.13220770059350098}, 'rouge-l': {'r': 0.2881786147180402, 'p': 0.45022032772213255, 'f': 0.34208033746040084}}


In [37]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13
Note: you may need to restart the kernel to use updated packages.


# Calculating BERTScore for Test Data

In [38]:
from bert_score import score

# Calculate BERTScore
pt_score = score(cands=generated_summaries, refs=reference_summaries, lang="en")

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# Calculate average precision, recall, and F1 score
avg_precision = pt_score[0].mean().item()
avg_recall = pt_score[1].mean().item()
avg_f1 = pt_score[2].mean().item()
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1 Score:", avg_f1)

Average Precision: 0.8707130551338196
Average Recall: 0.8627409338951111
Average F1 Score: 0.866637110710144
