In [50]:
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [51]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [52]:

df = pd.read_csv('/kaggle/input/assignment-4/Reviews.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  568454 non-null  int64 
 1   Summary     568427 non-null  object
 2   Text        568454 non-null  object
dtypes: int64(1), object(2)
memory usage: 13.0+ MB


In [53]:

df.dropna(inplace=True)  

df['Text'] = df['Text'].apply(lambda x: ' '.join(x.split()[:98]))

df['training'] = df['Text'] + ' TL;DR ' + df['Summary']
df_train=df[['Summary','Text','training']][:75]
df_test= df[['Summary','Text','training']][75:100]

df=df_train

In [54]:
max_length = 100
df["training"][0]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most. TL;DR Good Quality Dog Food'

In [55]:


def preprocess_text(text):

    text = text.lower()

    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    stop_words = stopwords.words('english')
    text = ' '.join([word for word in text.split() if word not in stop_words])
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])

    return text

preprocess_text(df["training"][0])

'bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better tldr good qualiti dog food'

In [56]:

 
class GPT2ReviewDataset(Dataset):  
    def __init__(self, tokenizer, reviews, max_len):
        self.max_length = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.end_token_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        for review in self.reviews:
            
            k=preprocess_text(review).split()
            idx = k.index('tldr')
            k[idx] = ' TL;DR '
            if(len(k)>115):
                k=k[:114]
                k=" ".join(k)
            else:
                d=len(k)
                k=" ".join(k)
                for i in range(114-d):
                    k+="<|endoftext|>"
#             print(len(k.split()))
            tokenized = self.tokenizer.encode(k + "<|endoftext|>")
#             print(len(tokenized))
#             break
            tokenized = self.pad_truncate(tokenized)   
            self.result.append(torch.tensor(tokenized))

    def __len__(self):
        return len(self.result)


    def __getitem__(self, item):
        return self.result[item]

    def pad_truncate(self, encoded_text):
        padding_length = 4
        text_length = len(encoded_text) - padding_length
        if text_length < self.max_length:
            padding = [self.end_token_id] * (self.max_length - text_length)
            return encoded_text + padding
        elif text_length > self.max_length:
            return encoded_text[:self.max_length + 3] + [self.end_token_id]
        else:
            return encoded_text


In [57]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [58]:
reviews_dataset = GPT2ReviewDataset(tokenizer, df['training'], 120)

In [59]:
# len(reviews_dataset[100])

In [60]:
# print(tokenizer.decode(reviews_dataset[0]))
# print(df['training'][2])
print(reviews_dataset[0])

tensor([   65,  2917,  1750,  9204,   460,  3290,  2057,  1720,  1043,   922,
         4140,  8846,  1720,   804,   588, 20798,  1429,  6174,  8508,  1365,
         2248, 40368,   957,   624,    72,  5763,  1720,  1365,   220, 24811,
           26,  7707,   220,   922,  4140,  8846,  3290,  2057, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256])


In [61]:
dataloader = DataLoader(reviews_dataset, batch_size=32, shuffle=True, drop_last= True)
def train(model, optimizer, dl, epochs):    
    for epoch in range(epochs):
        for idx, batch in enumerate(dl):
             with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                batch = batch.to(device)
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()
                if idx % 100 == 0:
                    print("Loss is : %f "%(loss))
                    
optimizer = torch.optim.AdamW(params = model.parameters(), lr=5e-5)
mode=model.to(device)
epoch=10
                    
train(model,optimizer, dataloader, epoch )

Loss is : 11.484254 
Loss is : 4.203943 
Loss is : 2.575060 
Loss is : 2.503411 
Loss is : 2.430969 
Loss is : 2.398445 
Loss is : 2.348293 
Loss is : 2.508475 
Loss is : 2.511341 
Loss is : 2.222166 


In [62]:
import torch
path = "model.pt"  

torch.save(model.state_dict(), path)




In [63]:


path = "/kaggle/working/model.pt"  
loaded_model = torch.load(path)


In [64]:
import torch
import numpy as np


def sample_next_word(probabilities):
    probabilities = torch.softmax(probabilities, dim=-1)
    normalized_probs = probabilities / torch.sum(probabilities)
    return np.random.choice(len(probabilities), 1, p=normalized_probs.cpu().detach().numpy())[0]


def model_infer(model, tokenizer, review, max_length=15):
    encoded_review = tokenizer.encode(review)
    generated_sequence = encoded_review.copy()
    initial_input = torch.tensor(generated_sequence).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(initial_input)
        last_logits = output.logits[0, -1]
        generated_sequence.append(sample_next_word(last_logits))

        for _ in range(max_length - 1):
            input_ids = torch.tensor(generated_sequence).unsqueeze(0).to(device)
            output = model(input_ids)
            last_logits = output.logits[0, -1]
            next_word_id = sample_next_word(last_logits)

            if next_word_id == tokenizer.eos_token_id:
                break
            else:
                generated_sequence.append(next_word_id)

    return tokenizer.decode(generated_sequence)


In [71]:
k=["I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. TL;DR Good Quality Dog Food", "Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as Jumbo. TL;DR Not as Advertised"]
samples = [review.split('TL;DR')[0] for review in k]
    
for review in samples:

    summary = model_infer(model, tokenizer, review + " TL;DR ").split(" TL;DR ")[1].strip()

    print("Summaries: "+ str(summary) +"\n")

Summaries: will present with good dog good deck will enjoy any dog



In [66]:
test_summaries=df_test['Summary'].values
test_text=df_test['Text'].values
generated_summaries = []
for i in range(len(test_text)):
    summry= model_infer(model, tokenizer, test_text[i] + " TL;DR ").split(" TL;DR ")[1].strip()
    generated_summaries.append(summry)


In [67]:
for i in range(5):
    print(test_summaries[i])
    print(generated_summaries[i])

No Tea Flavor
nice worth every second wasted
Good
--------------------
Taste great
1.5/
Order only in cold weather
<|endoftext|>
this is the best
rye cream with find  lemoni cant t put over coat blown over coat


In [68]:
pip install rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4c216144cd590e4e4ec1b20360e02d42387451c4f49f9e16a8399bc0fed4575f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [69]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2
Note: you may need to restart the kernel to use updated packages.


In [70]:
from rouge_score import rouge_scorer

rough_1=[]
rough_2=[]
rough_l=[]
rouge_1_precisions = []
rouge_1_recalls = []
rouge_1_f1_scores = []
rouge_2_precisions = []
rouge_2_recalls = []
rouge_2_f1_scores = []
rouge_l_precisions = []
rouge_l_recalls = []
rouge_l_f1_scores = []

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
for i in range(len(test_summaries)):
    scores = scorer.score(test_summaries[i], generated_summaries[i])
    rouge_1_precisions.append(scores['rouge1'][0])
    rouge_1_recalls.append(scores['rouge1'][1])
    rouge_1_f1_scores.append(scores['rouge1'][2])
    
    rouge_2_precisions.append(scores['rouge2'][0])
    rouge_2_recalls.append(scores['rouge2'][1])
    rouge_2_f1_scores.append(scores['rouge2'][2])
    
    rouge_l_precisions.append(scores['rougeL'][0])
    rouge_l_recalls.append(scores['rougeL'][1])
    rouge_l_f1_scores.append(scores['rougeL'][2])
    
    
    
rouge_1_precision_mean = sum(rouge_1_precisions) / len(rouge_1_precisions)
rouge_1_recall_mean = sum(rouge_1_recalls) / len(rouge_1_recalls)
rouge_1_f1_score_mean = sum(rouge_1_f1_scores) / len(rouge_1_f1_scores)

rouge_2_precision_mean = sum(rouge_2_precisions) / len(rouge_2_precisions)
rouge_2_recall_mean = sum(rouge_2_recalls) / len(rouge_2_recalls)
rouge_2_f1_score_mean = sum(rouge_2_f1_scores) / len(rouge_2_f1_scores)

rouge_l_precision_mean = sum(rouge_l_precisions) / len(rouge_l_precisions)
rouge_l_recall_mean = sum(rouge_l_recalls) / len(rouge_l_recalls)
rouge_l_f1_score_mean = sum(rouge_l_f1_scores) / len(rouge_l_f1_scores)

print("ROUGE-1: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(rouge_1_precision_mean, rouge_1_recall_mean, rouge_1_f1_score_mean))
print("ROUGE-2: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(rouge_2_precision_mean, rouge_2_recall_mean, rouge_2_f1_score_mean))
print("ROUGE-L: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(rouge_l_precision_mean, rouge_l_recall_mean, rouge_l_f1_score_mean))


ROUGE-1: Precision: 0.07, Recall: 0.11, F1-Score: 0.08
ROUGE-2: Precision: 0.01, Recall: 0.01, F1-Score: 0.01
ROUGE-L: Precision: 0.06, Recall: 0.10, F1-Score: 0.07
