In [1]:
import pandas as pd
import numpy as np
import os
import requests
import re

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# df = pd.read_csv("/hindi-train/hindi_train.csv")

/kaggle/input/dataset/english_train (2).csv
/kaggle/input/dataset/EnglishNews_test.csv


In [3]:
df_train = pd.read_csv('/kaggle/input/dataset/english_train (2).csv', low_memory = False)
df_test = pd.read_csv('/kaggle/input/dataset/EnglishNews_test.csv')

In [4]:
df_train.shape

(28347, 233)

In [5]:
df_test.shape

(2895, 3)

In [6]:
df_train = df_train[['Summary','Article']]

In [7]:
df_train

Unnamed: 0,Summary,Article
0,Barbie Movie Review: Ryan Gosling shines the b...,Barbie Movie Review: One mention of Barbie and...
1,A source close to the film told News18 exclusi...,"The highly anticipated Gadar 2, starring Sunny..."
2,Kartik Aaryan was spotted flying in economy cl...,"Kartik Aaryan, who is gearing up for the relea..."
3,Abbasi had said that it was important to make ...,"Indian woman, Anju who travelled to Pakistan t..."
4,Flash floods and landslides caused by heavy do...,"Himachal Pradesh, one of the worst hit states ..."
...,...,...
28342,A massive earthquake and tsunami in 2011 destr...,FILE - Tanks storing treated radioactive water...
28343,"Over 680 tourists from France, Thailand, the N...",Villagers clear debris caused by an earthquake...
28344,"Seif al-Islam, the son and one-time heir appar...","Seif al-Islam, the son and one-time heir appar..."
28345,Most members of the Thai youth football team r...,Thai BoysMost members of the Thai youth footba...


In [8]:
df_test = df_test[['Article']]
df_test

Unnamed: 0,Article
0,Chief Minister Mamata Banerjee-led West Bengal...
1,If you are a Xiaomi Mi A3 user whose unit has ...
2,Har Ghar Tiranga is Prime Minister Narendra M...
3,VidaHero MotoCorp is gearing up to expand its ...
4,The Delhi High Court has permitted St Stephen'...
...,...
2890,"In 'How to Win a Man,' an essay published in 1..."
2891,Right before the launch of the OnePlus 7 and 7...
2892,Yashasvi Jaiswal had a terrific run in the 202...
2893,Punjab Congress chief Navjot Singh Sidhu has q...


In [9]:
!pip install transformers -q
!pip install wandb -q

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [11]:
!pip install SentencePiece -q

In [12]:
import torch.nn.functional as F
from transformers import  T5Tokenizer , T5ForConditionalGeneration



In [13]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [14]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }


In [15]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        labels = y[:, 1:].clone().detach()
        labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=labels)
        loss = outputs[0]

        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%10==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [16]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask,
                max_length=75,
                num_beams=4,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
df_train

Unnamed: 0,Summary,Article
0,Barbie Movie Review: Ryan Gosling shines the b...,Barbie Movie Review: One mention of Barbie and...
1,A source close to the film told News18 exclusi...,"The highly anticipated Gadar 2, starring Sunny..."
2,Kartik Aaryan was spotted flying in economy cl...,"Kartik Aaryan, who is gearing up for the relea..."
3,Abbasi had said that it was important to make ...,"Indian woman, Anju who travelled to Pakistan t..."
4,Flash floods and landslides caused by heavy do...,"Himachal Pradesh, one of the worst hit states ..."
...,...,...
28342,A massive earthquake and tsunami in 2011 destr...,FILE - Tanks storing treated radioactive water...
28343,"Over 680 tourists from France, Thailand, the N...",Villagers clear debris caused by an earthquake...
28344,"Seif al-Islam, the son and one-time heir appar...","Seif al-Islam, the son and one-time heir appar..."
28345,Most members of the Thai youth football team r...,Thai BoysMost members of the Thai youth footba...


In [19]:
df_train.rename(columns = {'Article':'ctext', 'Summary':'text'}, inplace = True)
df_train

Unnamed: 0,text,ctext
0,Barbie Movie Review: Ryan Gosling shines the b...,Barbie Movie Review: One mention of Barbie and...
1,A source close to the film told News18 exclusi...,"The highly anticipated Gadar 2, starring Sunny..."
2,Kartik Aaryan was spotted flying in economy cl...,"Kartik Aaryan, who is gearing up for the relea..."
3,Abbasi had said that it was important to make ...,"Indian woman, Anju who travelled to Pakistan t..."
4,Flash floods and landslides caused by heavy do...,"Himachal Pradesh, one of the worst hit states ..."
...,...,...
28342,A massive earthquake and tsunami in 2011 destr...,FILE - Tanks storing treated radioactive water...
28343,"Over 680 tourists from France, Thailand, the N...",Villagers clear debris caused by an earthquake...
28344,"Seif al-Islam, the son and one-time heir appar...","Seif al-Islam, the son and one-time heir appar..."
28345,Most members of the Thai youth football team r...,Thai BoysMost members of the Thai youth footba...


In [20]:
df_train.ctext = 'summarize: ' + df_train.ctext
df_train

Unnamed: 0,text,ctext
0,Barbie Movie Review: Ryan Gosling shines the b...,summarize: Barbie Movie Review: One mention of...
1,A source close to the film told News18 exclusi...,"summarize: The highly anticipated Gadar 2, sta..."
2,Kartik Aaryan was spotted flying in economy cl...,"summarize: Kartik Aaryan, who is gearing up fo..."
3,Abbasi had said that it was important to make ...,"summarize: Indian woman, Anju who travelled to..."
4,Flash floods and landslides caused by heavy do...,"summarize: Himachal Pradesh, one of the worst ..."
...,...,...
28342,A massive earthquake and tsunami in 2011 destr...,summarize: FILE - Tanks storing treated radioa...
28343,"Over 680 tourists from France, Thailand, the N...",summarize: Villagers clear debris caused by an...
28344,"Seif al-Islam, the son and one-time heir appar...","summarize: Seif al-Islam, the son and one-time..."
28345,Most members of the Thai youth football team r...,summarize: Thai BoysMost members of the Thai y...


In [21]:
print(df_train.head())

                                                text  \
0  Barbie Movie Review: Ryan Gosling shines the b...   
1  A source close to the film told News18 exclusi...   
2  Kartik Aaryan was spotted flying in economy cl...   
3  Abbasi had said that it was important to make ...   
4  Flash floods and landslides caused by heavy do...   

                                               ctext  
0  summarize: Barbie Movie Review: One mention of...  
1  summarize: The highly anticipated Gadar 2, sta...  
2  summarize: Kartik Aaryan, who is gearing up fo...  
3  summarize: Indian woman, Anju who travelled to...  
4  summarize: Himachal Pradesh, one of the worst ...  


In [22]:
%pip install -q wandb
import wandb
wandb.login()

Note: you may need to restart the kernel to use updated packages.


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [23]:
wandb.init(project="English_T5_KAGGLE")

config = wandb.config         
config.TRAIN_BATCH_SIZE = 2  
config.VALID_BATCH_SIZE = 2    
config.TRAIN_EPOCHS = 8       
config.VAL_EPOCHS = 8 
config.LEARNING_RATE = 5e-5    
config.SEED = 42               
config.MAX_LEN = 512
config.SUMMARY_LEN = 75 


torch.manual_seed(config.SEED) 
np.random.seed(config.SEED)
torch.backends.cudnn.deterministic = True


tokenizer = T5Tokenizer.from_pretrained("t5-base")

[34m[1mwandb[0m: Currently logged in as: [33mashutoshguptanitk[0m ([33mnitk_ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [24]:
df_test = pd.read_csv('/kaggle/input/dataset/EnglishNews_test.csv')
df_test

Unnamed: 0,id,Article,Heading
0,EnglishNews_test_0,Chief Minister Mamata Banerjee-led West Bengal...,"After Durga Puja, West Bengal Govt Eyes UNESCO..."
1,EnglishNews_test_1,If you are a Xiaomi Mi A3 user whose unit has ...,Is Your Xiaomi Mi A3 Bricked Because of The Fa...
2,EnglishNews_test_2,Har Ghar Tiranga is Prime Minister Narendra M...,Har Ghar Tiranga Movement: Right Time for Indi...
3,EnglishNews_test_3,VidaHero MotoCorp is gearing up to expand its ...,Hero MotoCorp Plans to Expand its Electric Two...
4,EnglishNews_test_4,The Delhi High Court has permitted St Stephen'...,Delhi HC Permits St Stephen's College to Hold ...
...,...,...,...
2890,EnglishNews_test_2995,"In 'How to Win a Man,' an essay published in 1...",How to Find Love Right Now? 9 Coaches Explain ...
2891,EnglishNews_test_2996,Right before the launch of the OnePlus 7 and 7...,"OnePlus 6T Available for Rs 27,999 During the ..."
2892,EnglishNews_test_2997,Yashasvi Jaiswal had a terrific run in the 202...,Yashasvi Jaiswal has Hunger to Win & be the Be...
2893,EnglishNews_test_2998,Punjab Congress chief Navjot Singh Sidhu has q...,Sidhu Quits as Punjab Cong Chief Amid Dispute ...


In [25]:
df_test = df_test[['Article']]

In [26]:
df_test.rename(columns = {'Article':'ctext'}, inplace = True)
df_test.ctext = 'summarize: ' + df_test.ctext

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.rename(columns = {'Article':'ctext'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.ctext = 'summarize: ' + df_test.ctext


In [27]:
df_test

Unnamed: 0,ctext
0,summarize: Chief Minister Mamata Banerjee-led ...
1,summarize: If you are a Xiaomi Mi A3 user whos...
2,summarize: Har Ghar Tiranga is Prime Minister...
3,summarize: VidaHero MotoCorp is gearing up to ...
4,summarize: The Delhi High Court has permitted ...
...,...
2890,"summarize: In 'How to Win a Man,' an essay pub..."
2891,summarize: Right before the launch of the OneP...
2892,summarize: Yashasvi Jaiswal had a terrific run...
2893,summarize: Punjab Congress chief Navjot Singh ...


In [28]:
df_test.insert(1, "text", "x")

In [29]:
df_test

Unnamed: 0,ctext,text
0,summarize: Chief Minister Mamata Banerjee-led ...,x
1,summarize: If you are a Xiaomi Mi A3 user whos...,x
2,summarize: Har Ghar Tiranga is Prime Minister...,x
3,summarize: VidaHero MotoCorp is gearing up to ...,x
4,summarize: The Delhi High Court has permitted ...,x
...,...,...
2890,"summarize: In 'How to Win a Man,' an essay pub...",x
2891,summarize: Right before the launch of the OneP...,x
2892,summarize: Yashasvi Jaiswal had a terrific run...,x
2893,summarize: Punjab Congress chief Navjot Singh ...,x


In [30]:
# Create of Dataset and Dataloader

# train_size = 0.8
train_dataset=df_train
val_dataset=df_test
# train_dataset = train_dataset.reset_index(drop=True)



train_dataset.columns = train_dataset.columns.str.strip()


#  create dataset ready to use in  Dataloader
training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

In [31]:
print(training_set.ctext)

0        summarize: Barbie Movie Review: One mention of...
1        summarize: The highly anticipated Gadar 2, sta...
2        summarize: Kartik Aaryan, who is gearing up fo...
3        summarize: Indian woman, Anju who travelled to...
4        summarize: Himachal Pradesh, one of the worst ...
                               ...                        
28342    summarize: FILE - Tanks storing treated radioa...
28343    summarize: Villagers clear debris caused by an...
28344    summarize: Seif al-Islam, the son and one-time...
28345    summarize: Thai BoysMost members of the Thai y...
28346    summarize: Karachi to award citizens who ident...
Name: ctext, Length: 28347, dtype: object


In [32]:
val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

# parameters fo dataloaders
train_params = {
    'batch_size': config.TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': config.VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Create  Dataloaders 
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)



model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [33]:
# Defining the optimizer that 
val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

# parameters fo dataloaders
train_params = {
    'batch_size': config.TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': config.VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Create  Dataloaders 
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)



model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  6.305567741394043
Epoch: 0, Loss:  2.2358288764953613
Epoch: 0, Loss:  1.8008522987365723
Epoch: 0, Loss:  1.077731728553772
Epoch: 0, Loss:  1.1595935821533203
Epoch: 0, Loss:  0.5874297022819519
Epoch: 0, Loss:  1.1014891862869263
Epoch: 0, Loss:  1.946222186088562
Epoch: 0, Loss:  0.5399002432823181
Epoch: 0, Loss:  0.7453762888908386
Epoch: 0, Loss:  0.48694029450416565
Epoch: 0, Loss:  0.6612539291381836
Epoch: 0, Loss:  1.2375530004501343
Epoch: 0, Loss:  4.995579242706299
Epoch: 0, Loss:  0.24000582098960876
Epoch: 0, Loss:  0.24401283264160156
Epoch: 0, Loss:  0.5715259313583374
Epoch: 0, Loss:  0.2478499412536621
Epoch: 0, Loss:  0.4537789523601532
Epoch: 0, Loss:  0.22707505524158478
Epoch: 0, Loss:  0.8625078797340393
Epoch: 0, Loss:  0.2336285412311554
Epoch: 0, Loss:  0.41125425696372986
Epoch: 0, Loss:  0.18087361752986908
Epoch: 0, Loss:  0.717706561088562
Epoch: 0, Loss:  0.3642994463443756
Epoch: 0, Loss:  1.9040443897247314
Epoch: 0, Loss:  1.27495825

In [35]:
model.save_pretrained("/kaggle/working/")
model.save_pretrained('/kaggle/working/my_model')

In [36]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
model.push_to_hub('saliq7/T5_English')

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saliq7/T5_English/commit/e40220fd744385497d2f857087551ac9965f96f2', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='e40220fd744385497d2f857087551ac9965f96f2', pr_url=None, pr_revision=None, pr_num=None)

In [38]:
df = pd.read_csv('predictions_english_t5.csv')

In [40]:
df['Generated Text'][0]

'a team headed by South 24-Parganas district magistrate Sumit Gupta is doing the groundwork to prepare a dossier that will be sent to UNESCO to apply for the prestigious title'

In [41]:
df_test['ctext'][0]

"summarize: Chief Minister Mamata Banerjee-led West Bengal government is all set to apply for UNESCO's intangible heritage tag for the Gangasagar Mela&amp;#8211; the century-old fair held at the southern tip of the Bay of Bengal. The Hindu pilgrimage is considered the second largest human congregation after the Kumbh Mela.According to sources in the state government, a team headed by South 24-Parganas district magistrate Sumit Gupta is doing the groundwork to prepare a dossier that will be sent to UNESCO to apply for the prestigious title.Speaking to News18, DM Sumit Gupta said, &amp;#8220;This mela is huge with cultural quotient and historical significance. We are now working on the dossier, we are doing research in every way. We hope this will be approved .'In 2023, the Gangasagar mela is scheduled to be held from January 8 to January 14, and the government is planning to videograph the upcoming pilgrimage to attach it to the dossier sent to the United Nation&amp;#8217;s agency.  The

In [42]:
col = df['Generated Text']

In [50]:
df_final = df_test.copy()

In [51]:
df_final.drop(['text'], axis =1, inplace =True)

In [52]:
df_final

Unnamed: 0,ctext
0,summarize: Chief Minister Mamata Banerjee-led ...
1,summarize: If you are a Xiaomi Mi A3 user whos...
2,summarize: Har Ghar Tiranga is Prime Minister...
3,summarize: VidaHero MotoCorp is gearing up to ...
4,summarize: The Delhi High Court has permitted ...
...,...
2890,"summarize: In 'How to Win a Man,' an essay pub..."
2891,summarize: Right before the launch of the OneP...
2892,summarize: Yashasvi Jaiswal had a terrific run...
2893,summarize: Punjab Congress chief Navjot Singh ...


In [56]:
df_final['Summary Generated'] = col

In [57]:
df_final

Unnamed: 0,ctext,Summary Generated
0,summarize: Chief Minister Mamata Banerjee-led ...,a team headed by South 24-Parganas district ma...
1,summarize: If you are a Xiaomi Mi A3 user whos...,Xiaomi has confirmed that all bricked Mi A3 ph...
2,summarize: Har Ghar Tiranga is Prime Minister...,"During his Independence Day speech in 2021, Pr..."
3,summarize: VidaHero MotoCorp is gearing up to ...,", the company plans to introduce new entry-lev..."
4,summarize: The Delhi High Court has permitted ...,"Besides the pleas of the two colleges, the cou..."
...,...,...
2890,"summarize: In 'How to Win a Man,' an essay pub...",For more-modern and less-demonic relationship ...
2891,summarize: Right before the launch of the OneP...,The OnePlus 6T runs on the Qualcomm Snapdragon...
2892,summarize: Yashasvi Jaiswal had a terrific run...,In an exclusive conversation with News18 Crick...
2893,summarize: Punjab Congress chief Navjot Singh ...,Navjot Singh Sidhu has quit from his post amid...


In [58]:
df_final.to_csv('Eng_final_output.csv')