In [38]:
!pip install matplotlib pandas transformers datasets torch scikit-learn


Defaulting to user installation because normal site-packages is not writeable


In [39]:
import matplotlib.pyplot as plt, pandas as pd
import nltk

import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Sai
[nltk_data]     Teja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sai
[nltk_data]     Teja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [40]:
df=pd.read_csv('Dataset/train data.csv')


In [41]:
df.describe()

Unnamed: 0,id,article,highlights
count,1000,1000,1000
unique,1000,1000,1000
top,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
freq,1,1,1


In [42]:
df.head(10)

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."
5,5ed5e3fbd235a8046cd3b87f4a1aa51b856c8ec3,This is the moment that a crew of firefighters...,Giant pig fell into the swimming pool at his h...
6,6394f51b120ceb3da5e7b53dd5167fc4cf80b514,The amount of time people spend listening to B...,Figures show that while millions still tune in...
7,98be9b2d558c17df8a13597195957a7c8587ddcd,"(CNN)So, you'd like a ""Full House"" reunion and...","Show will return with a one-hour special, foll..."
8,57f68638739c3a1de8d9922b389d6ded39977012,"At 11:20pm, former world champion Ken Doherty ...",Reanne Evans faced Ken Doherty in World Champi...
9,20778c35c19d741cc182719de336d71e1a0b228e,A gang of six men have been jailed for a total...,Gang have been jailed for a total of 31 years ...


In [43]:
print('Article:\n\n'+df.article[0]+'\n\n'+'-'*len(df.article[0])+'\n\nSummary:\n\n'+df.highlights[0])

Article:

Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee. 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting 

## Data Preperation 

In [44]:
from sklearn.model_selection import train_test_split
train_df,test_df=train_test_split(df.drop(columns=['id']),test_size=0.2,random_state=42)

## Pre Processing

### Normal Pre-Processing

In [45]:
def normal_processing(text:str):
    print('Normal processing\n\n'+text)
    #lower case
    lower_text=text.lower()

    #punctuation removing
    special_words_removed=re.sub(f'[{re.escape(string.punctuation)}]','',string=lower_text)

    #tokenizing
    tokenized_words=nltk.word_tokenize(special_words_removed)
    
    #remove stopwords
    stopwords_set=set(stopwords.words('english'))
    removed_stopwrods=[ word for word in tokenized_words if word not in stopwords_set ]

    #stemming and lemmatization
    stemmer=PorterStemmer()
    lemmatizer=WordNetLemmatizer()
    stemmed_words=[stemmer.stem(word) for word in removed_stopwrods] 
    lemmaized_words=[lemmatizer.lemmatize(word) for word in stemmed_words]


    return lemmaized_words



### Bert Pre-Processing / Tokenizer

In [46]:
from transformers import BertTokenizer

# iniating tokenizer
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')


def berttokenize(text:str):
    # print('\nBert processing\n\n'+text)
    return tokenizer(text,padding=True,truncation=True)

    

#### Comparison

In [47]:
print(normal_processing("Experts question if  packed out planes are putting passengers at risk!"))
print(berttokenize("Experts question if  packed out planes are putting passengers at risk!"))

Normal processing

Experts question if  packed out planes are putting passengers at risk!
['expert', 'question', 'pack', 'plane', 'put', 'passeng', 'risk']
{'input_ids': [101, 8519, 3160, 2065, 8966, 2041, 9738, 2024, 5128, 5467, 2012, 3891, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


## Encoding DataSet

In [113]:
from datasets import Dataset
dataset=Dataset.from_pandas(train_df)
dataset

Dataset({
    features: ['article', 'highlights', '__index_level_0__'],
    num_rows: 800
})

In [160]:
def preprocess_function(text):
    inputs=tokenizer(text['article'],padding='max_length',truncation=True,max_length=512)
    
    labels=tokenizer(text['highlights'],padding='max_length',truncation=True,max_length=512).input_ids
    inputs['labels']=labels

    
    return inputs

In [161]:
tokenized=dataset.map(preprocess_function,batched=True,remove_columns=dataset.column_names)

Map: 100%|██████████| 800/800 [00:12<00:00, 63.46 examples/s]


In [167]:
print(len(tokenized['input_ids'][0])==len(tokenized['labels'][0]))

True


In [49]:
train_enc=berttokenize(train_df['article'].tolist())
train_label_enc=berttokenize(train_df['highlights'].tolist())
# test_enc=berttokenize(test_df['articles'].tolist())
# tsst_label_enc=berttokenize(test_df['highlights'].tolist())

In [89]:
for i,j in train_label_enc.items():
    k=0
    if k==1:
        break
    k+=1
    print(j[0])

[101, 18301, 2038, 2623, 2008, 2035, 11500, 2551, 2005, 2068, 2442, 3713, 2394, 1012, 3627, 2001, 2028, 1997, 1996, 4443, 5918, 2005, 1037, 2047, 8329, 2082, 1999, 10958, 4160, 19062, 1012, 17237, 2038, 2036, 4692, 2000, 8970, 2394, 2653, 14148, 2005, 11500, 1012, 2021, 2750, 1996, 2375, 2108, 4844, 1010, 1037, 12401, 16053, 2832, 2965, 7327, 1011, 4738, 11500, 2024, 2145, 2108, 4846, 2302, 2394, 5852, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [91]:
for ids in range(1):
    item={key:val[ids] for key,val in train_enc.items()}
    item['labels']=train_label_enc['input_ids'][ids]
    print(item)

{'input_ids': [101, 2394, 6827, 1024, 2023, 16983, 15833, 6254, 17472, 1996, 2047, 18301, 8329, 2082, 1998, 7201, 1996, 2653, 4443, 5918, 2005, 2052, 1011, 2022, 11500, 1012, 17671, 3554, 2005, 1996, 5499, 2110, 7404, 2177, 1999, 7795, 2031, 2623, 2008, 2035, 11500, 2551, 1999, 2752, 2104, 2037, 2491, 2442, 3713, 2394, 1011, 2242, 1996, 17237, 2145, 8440, 1005, 1056, 3107, 1012, 8669, 3488, 2000, 2330, 1037, 2082, 1997, 8329, 1999, 18301, 1005, 2139, 13743, 3007, 10958, 4160, 19062, 1010, 1996, 4654, 7913, 23738, 2015, 2036, 21362, 1996, 7823, 4443, 5918, 2005, 17464, 2966, 3667, 1012, 2426, 2068, 2024, 3513, 2008, 17362, 2024, 2053, 2062, 2084, 2423, 2086, 1997, 2287, 1010, 2442, 2022, 5627, 2000, 2147, 5973, 2503, 1996, 3700, 4758, 2011, 1996, 4654, 7913, 23738, 2015, 1010, 1998, 2442, 3713, 19376, 2394, 1012, 1996, 3732, 3627, 2003, 2242, 2008, 2130, 1996, 17237, 2038, 2664, 3929, 2404, 1999, 2173, 1010, 2007, 11500, 2040, 4738, 1999, 2647, 3032, 2145, 2583, 2000, 7438, 5022, 2302, 

## Creating customdataset class

In [103]:
import torch

class SummaryDataset(torch.utils.data.Dataset):
    def __init__(self,articles,summary):
        self.encodings=articles
        self.labels=summary
    def __getitem__(self,ids):
        item={key:torch.tensor(val[ids]) for key,val in self.encodings.items()}
        item['labels']=torch.tensor(self.labels['input_ids'][ids])
        return item
    def __len__(self):
        return len(self.labels)

In [104]:
# train_dataset=SummaryDataset(train_enc,train_label_enc)
# eval_dataset=SummaryDataset(test_enc,test_df['highlights'].tolist())

In [105]:
print(train_dataset.__len__())


3


In [54]:
! pip install wandb


Defaulting to user installation because normal site-packages is not writeable


In [139]:
import os,wandb
os.environ["WANDB_NOTEBOOK_NAME"] = "TextsummarizerModel.ipynb"
wandb.login()



True

In [140]:
from transformers import BertForSequenceClassification, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [168]:


training_arg = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    weight_decay=0.01,
    logging_dir='./logs',
)




In [169]:
print(train_dataset.__getitem__(0))

{'input_ids': tensor([  101,  2394,  6827,  1024,  2023, 16983, 15833,  6254, 17472,  1996,
         2047, 18301,  8329,  2082,  1998,  7201,  1996,  2653,  4443,  5918,
         2005,  2052,  1011,  2022, 11500,  1012, 17671,  3554,  2005,  1996,
         5499,  2110,  7404,  2177,  1999,  7795,  2031,  2623,  2008,  2035,
        11500,  2551,  1999,  2752,  2104,  2037,  2491,  2442,  3713,  2394,
         1011,  2242,  1996, 17237,  2145,  8440,  1005,  1056,  3107,  1012,
         8669,  3488,  2000,  2330,  1037,  2082,  1997,  8329,  1999, 18301,
         1005,  2139, 13743,  3007, 10958,  4160, 19062,  1010,  1996,  4654,
         7913, 23738,  2015,  2036, 21362,  1996,  7823,  4443,  5918,  2005,
        17464,  2966,  3667,  1012,  2426,  2068,  2024,  3513,  2008, 17362,
         2024,  2053,  2062,  2084,  2423,  2086,  1997,  2287,  1010,  2442,
         2022,  5627,  2000,  2147,  5973,  2503,  1996,  3700,  4758,  2011,
         1996,  4654,  7913, 23738,  2015,  1010, 

In [170]:
from transformers import Trainer

trainer = Trainer(
    model=model,                        
    args=training_arg,
    train_dataset=tokenized,
    eval_dataset=tokenized,
    
    
)

trainer.train()

  0%|          | 0/267 [13:49<?, ?it/s]
  0%|          | 0/267 [00:00<?, ?it/s]

ValueError: Expected input batch_size (3) to match target batch_size (1536).

In [111]:
train_dataset.__getitem__(1)

{'input_ids': tensor([  101,  3097, 14658,  2085,  4070,  2005,  2340,  2566,  9358,  1997,
          2155,  7435,  1010,  2047,  3275,  2265,  1012,  1996,  2193,  1997,
         14658,  2013,  6917,  2003,  2039,  2011,  2340,  2566,  9358,  1999,
          1037,  5476,  2004,  1996, 17237,  5363,  2000,  4337,  1037, 15843,
          1997,  3095,  1012, 15040,  2085,  4070,  2005,  2471,  2028,  2155,
          3460,  1999,  2274,  2348,  1999,  2070,  2752,  1996, 10817,  2003,
          2062,  2084,  2048,  1011, 12263,  1012,  3026,  7435,  2360,  1996,
          2193,  2097,  4125,  2582,  2058,  1996,  2279,  2261,  2086,  2138,
          1996, 17237,  2003,  2061,  2460,  1997,  2188, 16523, 12384, 19960,
          6558,  1012,  4481,  2013,  1996,  2740,  1998,  2591,  2729,  2592,
          2803,  2265,  2008,  2570,  1012,  1014,  2566,  9358,  1997,  2783,
         14658,  4227,  2037, 15644,  6931,  1010,  2039,  2013,  2539,  1012,
          1022,  2566,  9358,  1999,  2