In [15]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv("./Dataset/Reviews.csv").sample(frac = 0.005, random_state=1)

df.dropna(subset=['Text', 'Summary'], inplace=True)

# Function to clean text
def clean_text(text):
    text = re.sub(r'\[^A-Za-z0-9\]+', '', str(text))

    words = word_tokenize(text.lower())

    lemmatizer = WordNetLemmatizer()

    stop = set(stopwords.words('english'))

    words = [lemmatizer.lemmatize(word) for word in words if word not in stop]

    return ' '.join(words)

df['Text'] = df['Text'].apply(clean_text)
df['Summary'] = df['Summary'].apply(clean_text)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
df['Text']

288312    love cherry pie lara bar . best tasty bar suga...
431726    melitta cafe collection blanc et noir coffee s...
110311    girl absolutely loved tuna . heaven could n't ...
91855     vendor fast dependable . tea simply best way r...
338855    update - 8/9/2010 < br / > lot happen couple m...
                                ...                        
465528    's taken year lose 64 pound triscuits big part...
477980    'm embarrassed admit got suckered 5 star revie...
537044     cat love treat . : ) shake pack , come running .
51434     bought accident local supermarket . surprise ,...
426008    husband love coffee -- think 's best hazelnut ...
Name: Text, Length: 2842, dtype: object

In [17]:
df['Summary']

288312                                   cherry pie larabar
431726                                       melitta coffee
110311                                          great treat
91855                                         daily calming
338855                              best canned artichoke !
                                ...                        
465528    triscuits , diet coke weight loss ... perfect ...
477980                                      emperor clothes
537044                                      absolutely love
51434                               new favorite soup ! ! !
426008                                           fabulous !
Name: Summary, Length: 2842, dtype: object

In [18]:
df.to_csv('processed.csv')

In [19]:
import pandas as pd
df = pd.read_csv('processed.csv')

In [None]:
df['Text']

In [None]:
df['Summary']

In [20]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset


tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [21]:
from sklearn.model_selection import train_test_split


train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)


In [22]:
from torch.utils.data import Dataset, DataLoader

class ReviewSummaryDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.summaries = summaries
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]

        # Encode the pairs of text and summaries
        encoded_pair = self.tokenizer(text, summary,
                                      max_length=self.max_length,
                                      truncation=True,
                                      padding='max_length',
                                      return_tensors='pt')

        # These are input_ids and attention_mask from tokenizer output
        input_ids = encoded_pair['input_ids'].squeeze()
        attention_mask = encoded_pair['attention_mask'].squeeze()

        return input_ids, attention_mask

# Initialize dataset
dataset = ReviewSummaryDataset(train_data['Text'].tolist(), train_data['Summary'].tolist(), tokenizer)


In [24]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import DataLoader, Dataset


# Set the pad_token to the eos_token (This is a common practice with GPT models)
tokenizer.pad_token = tokenizer.eos_token

# Assuming train_data is defined somewhere
dataset = ReviewSummaryDataset(train_data['Text'].tolist(), train_data['Summary'].tolist(), tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Training parameters
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 4

# Training loop
model.train()

for epoch in range(epochs):
    for input_ids, attention_mask in dataloader:
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch}, Loss: {loss.item()}")




Epoch 0, Loss: 0.8897305727005005
Epoch 0, Loss: 0.7620732188224792
Epoch 0, Loss: 0.9272246956825256
Epoch 0, Loss: 0.42982128262519836
Epoch 0, Loss: 1.1102238893508911
Epoch 0, Loss: 0.7465527057647705
Epoch 0, Loss: 0.3906932473182678
Epoch 0, Loss: 1.5323435068130493
Epoch 0, Loss: 0.8804509043693542
Epoch 0, Loss: 0.6071149706840515
Epoch 0, Loss: 0.9521011114120483
Epoch 0, Loss: 1.5150409936904907
Epoch 0, Loss: 1.1015095710754395
Epoch 0, Loss: 0.3567397892475128
Epoch 0, Loss: 1.065793752670288
Epoch 0, Loss: 0.7670884728431702
Epoch 0, Loss: 0.5085272192955017
Epoch 0, Loss: 0.30241021513938904
Epoch 0, Loss: 0.6155779957771301
Epoch 0, Loss: 0.6609938144683838
Epoch 0, Loss: 0.659330427646637
Epoch 0, Loss: 1.003301978111267
Epoch 0, Loss: 0.7726998329162598
Epoch 0, Loss: 0.8593854904174805
Epoch 0, Loss: 0.7058529853820801
Epoch 0, Loss: 0.5360128283500671
Epoch 0, Loss: 0.44785502552986145
Epoch 0, Loss: 0.7726408243179321
Epoch 0, Loss: 1.175550103187561
Epoch 0, Loss: 

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Epoch 0, Loss: 0.8728443384170532
Epoch 0, Loss: 1.3296620845794678
Epoch 0, Loss: 1.2725915908813477
Epoch 0, Loss: 0.34090882539749146
Epoch 0, Loss: 0.8621825575828552
Epoch 0, Loss: 1.2213082313537598
Epoch 0, Loss: 0.6771134734153748
Epoch 0, Loss: 0.45646196603775024
Epoch 0, Loss: 0.5207059979438782
Epoch 0, Loss: 1.512495756149292
Epoch 0, Loss: 0.9341555237770081
Epoch 0, Loss: 0.6085497140884399


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Epoch 0, Loss: 0.6474109888076782
Epoch 0, Loss: 1.7097753286361694
Epoch 0, Loss: 0.7227051258087158
Epoch 0, Loss: 1.0151420831680298
Epoch 0, Loss: 0.8050028085708618
Epoch 0, Loss: 0.7631082534790039
Epoch 0, Loss: 1.2348319292068481
Epoch 0, Loss: 0.6027070879936218
Epoch 0, Loss: 0.42905429005622864
Epoch 0, Loss: 0.6361421942710876
Epoch 0, Loss: 0.5606024265289307
Epoch 0, Loss: 0.5195904970169067
Epoch 0, Loss: 0.852591335773468
Epoch 0, Loss: 0.5479235649108887
Epoch 0, Loss: 0.3071536421775818
Epoch 0, Loss: 0.9668232202529907
Epoch 0, Loss: 0.7878618240356445
Epoch 0, Loss: 0.9611565470695496
Epoch 0, Loss: 0.37482860684394836
Epoch 0, Loss: 0.7207053303718567
Epoch 0, Loss: 0.9004090428352356
Epoch 0, Loss: 0.3457772135734558
Epoch 0, Loss: 0.7521377205848694
Epoch 0, Loss: 0.7005221843719482
Epoch 0, Loss: 0.3331531286239624
Epoch 0, Loss: 0.7282260060310364


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader
from rouge import Rouge

# Assuming the model and tokenizer are already loaded and the test_data prepared
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()  # Set the model to evaluation mode

class ReviewSummaryDataset(torch.utils.data.Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.summaries = summaries
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoded_pair = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        input_ids = encoded_pair['input_ids'].squeeze(0)
        attention_mask = encoded_pair['attention_mask'].squeeze(0)
        return input_ids, attention_mask

# Load the test data as an instance of the ReviewSummaryDataset
test_dataset = ReviewSummaryDataset(test_data['Text'].tolist(), test_data['Summary'].tolist(), tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

def generate_summaries(model, dataloader):
    model.eval()
    summaries = []
    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            outputs = model.generate(input_ids, attention_mask=attention_mask)
            summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
            summaries.append(summary)
    return summaries

# Generate summaries
generated_summaries = generate_summaries(model, test_dataloader)

# Actual summaries
actual_summaries = test_data['Summary'].tolist()

# Calculate ROUGE scores
def calculate_rouge(hypotheses, references):
    rouge = Rouge()
    scores = rouge.get_scores(hypotheses, references, avg=True)
    return scores

rouge_scores = calculate_rouge(generated_summaries, actual_summaries)
print("ROUGE scores:", rouge_scores)
