In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
reviews_csv = '/content/drive/MyDrive/Reviews.csv'

In [None]:
import pandas as pd
review_df= pd.read_csv(reviews_csv)
review_df.shape

(568454, 10)

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Text cleaning and preprocessing function
def clean_and_preprocess(text):
  if type(text) != str:
    return ""
  else:
    # Convert text to lowercase
    text = str(text).lower()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a string
    processed_text = ' '.join(tokens)
    return processed_text

In [None]:
# Clean and preprocess the 'Text' column
review_df['Text'] = review_df['Text'].apply(clean_and_preprocess)

In [None]:
# Clean and preprocess the 'Summary' column
review_df['Summary'] = review_df['Summary'].apply(clean_and_preprocess)

In [None]:
review_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,good quality dog food,i have bought several of the vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,not a advertised,product arrived labeled a jumbo salted peanuts...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,delight say it all,this is a confection that ha been around a few...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,cough medicine,if you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,great taffy,great taffy at a great price there wa a wide a...


In [None]:
import re
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch.optim as optim

In [None]:
review_df.Text.values[:5]

array(['i have bought several of the vitality canned dog food product and have found them all to be of good quality the product look more like a stew than a processed meat and it smell better my labrador is finicky and she appreciates this product better than most',
       'product arrived labeled a jumbo salted peanutsthe peanut were actually small sized unsalted not sure if this wa an error or if the vendor intended to represent the product a jumbo',
       'this is a confection that ha been around a few century it is a light pillowy citrus gelatin with nut in this case filbert and it is cut into tiny square and then liberally coated with powdered sugar and it is a tiny mouthful of heaven not too chewy and very flavorful i highly recommend this yummy treat if you are familiar with the story of c lewis the lion the witch and the wardrobe this is the treat that seduces edmund into selling out his brother and sister to the witch',
       'if you are looking for the secret ingredient in 

In [None]:
review_df.Summary.values[:5]

array(['good quality dog food', 'not a advertised', 'delight say it all',
       'cough medicine', 'great taffy'], dtype=object)

**Dropping NANs**

In [None]:
review_df.dropna(subset=['Text', 'Summary'], inplace=True)

Setting the device for gpu processing

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
review_df['input_model'] = review_df['Text'] + " TL;DR " + review_df['Summary']

In [None]:
review_df['input_model'].values[:5]

array(['i have bought several of the vitality canned dog food product and have found them all to be of good quality the product look more like a stew than a processed meat and it smell better my labrador is finicky and she appreciates this product better than most TL;DR good quality dog food',
       'product arrived labeled a jumbo salted peanutsthe peanut were actually small sized unsalted not sure if this wa an error or if the vendor intended to represent the product a jumbo TL;DR not a advertised',
       'this is a confection that ha been around a few century it is a light pillowy citrus gelatin with nut in this case filbert and it is cut into tiny square and then liberally coated with powdered sugar and it is a tiny mouthful of heaven not too chewy and very flavorful i highly recommend this yummy treat if you are familiar with the story of c lewis the lion the witch and the wardrobe this is the treat that seduces edmund into selling out his brother and sister to the witch TL;DR d

Finding model length

In [None]:
length = sum([len(review.split()) for review in review_df.input_model.values])/len(review_df)
length

84.2845700795491

Since model average length is 85.3, we can take a max length as 100.

In [None]:
max_len = 100

Taking a sample to reduce model training time

In [None]:
review_df = review_df.sample(40000)
review_df = review_df.input_model.values.tolist()
len(review_df)

40000

In [None]:
review_df[:5]

['im a long time user of the cytomax product and this life up to all the previous cytomax drink i have purchased good for anything that is going to put you to your endurance limit TL;DR good stuff',
 'i wa so happy to get the opportunity to try this fine bacon at my church last year me and my family were ready and eagerly waiting for the world to end n a harold camping said but now i am glad that this did not happen i had freeze dried meat but nothing to match the deliciousness of yoders canned bacon me and my family have been doing disaster drill and every time it we have a yoders bacon buffet for all the participant even the woman folk have been getting in the spirit by trying to out do each others recipe even though we are waiting on jesus hell i am willing to bet the on the mayan any reason to eat yoders is good enough for me TL;DR we are ready for the mayan apocalypse',
 'well in another review i mentioned in passing that this brand green tea and lemon is better and i stand by tha

# **Model Training**

In [None]:
from google.colab import userdata
userdata.get('HuggingFace')

'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'

In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Send model to device and initialize optimizer

In [None]:
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

Find the number of tokens used by the designator " TL;DR ":

In [None]:
tokenizer.encode(" TL;DR ")
extra_length = len(tokenizer.encode(" TL;DR "))

**Custom Dataset**

In [None]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, reviews, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.results = []

        for review in self.reviews:
            # Encode the text using tokenizer.encode(). We add EOS at the end
            tokenized_ = self.tokenizer.encode(review + self.eos, truncation=True, max_length=self.max_len)

            # Padding/truncating the encoded sequence to max_len
            padded_ = self.padding_truncate(tokenized_)

            # Creating a tensor and adding to the result
            self.results.append(torch.tensor(padded_, dtype=torch.long))

    def __len__(self):
        return len(self.results)


    def __getitem__(self, item):
        return self.results[item]

    def padding_truncate(self, name):
        name_length = len(name) - extra_length
        if name_length < self.max_len:
            difference = self.max_len - name_length
            results = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            results = name[:self.max_len + 3]+[self.eos_id]
        else:
            results = name
        return results

In [None]:
dataSet = CustomDataset(tokenizer, review_df, max_len)

**Data Splitting (75:25)**

In [None]:
from torch.utils.data.dataset import random_split

# Calculate lengths for 75% train and 25% test split
total_count = len(dataSet)
train_cnt = int(0.75 * total_count)
test_cnt = total_count - train_cnt

# Perform the split
train_dataSet, test_dataSet = random_split(dataSet, [train_cnt, test_cnt])


In [None]:
from torch.utils.data import DataLoader

# Create DataLoader for training set
train_Loader = DataLoader(train_dataSet, batch_size=32, shuffle=True, drop_last=True)

# Create DataLoader for testing set
test_Loader = DataLoader(test_dataSet, batch_size=32, shuffle=False, drop_last=True)

**Fine Tuning**

In [None]:
def fine_tuning(model, optimizer, dl, epochs_cnt):
    for epoch in range(epochs_cnt):
        for idx, batch in enumerate(dl):
             with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                batch = batch.to(device)
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()
                if idx % 100 == 0:
                    print("loss: %f, %d"%(loss, idx))

In [None]:
fine_tuning(model=model, optimizer=optimizer, dl=train_Loader, epochs_cnt=3)

loss: 8.384645, 0
loss: 2.856281, 100
loss: 2.356760, 200
loss: 2.354845, 300
loss: 2.636759, 400
loss: 2.438997, 500
loss: 2.247795, 600
loss: 2.478539, 700
loss: 2.488614, 800
loss: 2.199543, 900
loss: 1.954966, 0
loss: 1.992734, 100
loss: 2.107363, 200
loss: 2.210031, 300
loss: 2.074909, 400
loss: 2.221991, 500
loss: 2.143896, 600
loss: 1.879197, 700
loss: 1.806239, 800
loss: 1.928483, 900
loss: 2.083845, 0
loss: 1.851848, 100
loss: 1.957015, 200
loss: 1.709024, 300
loss: 1.783751, 400
loss: 1.622078, 500
loss: 1.772318, 600
loss: 1.762184, 700
loss: 1.715217, 800
loss: 1.846312, 900


In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/my_GPT2_state_dict.pth')

In [54]:
from transformers import GPT2LMHeadModel

# Initialize the model architecture first
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Load the state dict
model.load_state_dict(torch.load('/content/drive/MyDrive/my_GPT2_state_dict.pth', map_location=torch.device('cpu')))

<All keys matched successfully>

In [55]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

In [None]:
def evaluate_fun(model, test_loader):
    model.eval()  # Put the model in evaluation mode
    totalLoss = 0
    with torch.no_grad():  # No gradients needed
        for batch in test_loader:
            batch = batch.to(device)
            output = model(batch, labels=batch)
            loss = output.loss
            totalLoss += loss.item()
    averageLoss = totalLoss / len(test_loader)
    print(f"Average Test Loss: {averageLoss}")

evaluate_fun(model, test_Loader)

Average Test Loss: 2.299412643298125


In [56]:
def topk(probs, n=9):
    # The scores are initially softmaxed to convert to probabilities
    probs = torch.softmax(probs, dim= -1)

    # PyTorch has its own topk method, which we use here
    tokensProb_, topIx = torch.topk(probs, k=n)

    # The new selection pool (9 choices) is normalized
    tokensProb_ = tokensProb_ / torch.sum(tokensProb_)

    # Send to CPU for numpy handling
    tokensProb_ = tokensProb_.cpu().detach().numpy()

    # Make a random choice from the pool based on the new prob distribution
    choice = np.random.choice(n, 1, p = tokensProb_)
    tokenId = topIx[choice][0]

    return int(tokenId)

In [57]:
def model_inference(model, tokenizer, review, max_length=15):
    # Preprocess the init token (task designator)
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.set_grad_enabled(False):
        # Feed the init token to the model
        output = model(initial_input)

        # Flatten the logits at the final time step
        logits = output.logits[0,-1]

        # Make a top-k choice and append to the result
        result.append(topk(logits))

        # For max_length times:
        for _ in range(max_length):
            # Feed the current sequence to the model and make a choice
            input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0,-1]
            res_id = topk(logits)

            # If the chosen token is EOS, return the result
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else: # Append to the sequence
                result.append(res_id)
    # IF no EOS is generated, return after the max_len
    return tokenizer.decode(result)

In [62]:
review_text =  "the product is fresh and well packed very good for use a a oil for salad while making them tasty it is very good for health too"

In [63]:
given_summary = "good product"

In [67]:
print(review_text)
gen_summary = model_inference(model, tokenizer, review_text + " TL;DR ").split(" TL;DR ")[1].strip()
print("Generated Summary: "+ str(gen_summary) +"\n")

the product is fresh and well packed very good for use a a oil for salad while making them tasty it is very good for health too
Generated Summary: very good quality



**Rouge Score**

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [68]:
from rouge import Rouge

rouge = Rouge()
scores = rouge.get_scores(gen_summary, given_summary, avg=False)[0] # Get scores for this one pair

# Formatting and printing the results
print(f"ROUGE-1: Precision: {scores['rouge-1']['p']:.2f}, Recall: {scores['rouge-1']['r']:.2f}, F1-Score: {scores['rouge-1']['f']:.2f}")
print(f"ROUGE-2: Precision: {scores['rouge-2']['p']:.2f}, Recall: {scores['rouge-2']['r']:.2f}, F1-Score: {scores['rouge-2']['f']:.2f}")
print(f"ROUGE-L: Precision: {scores['rouge-l']['p']:.2f}, Recall: {scores['rouge-l']['r']:.2f}, F1-Score: {scores['rouge-l']['f']:.2f}")

ROUGE-1: Precision: 0.33, Recall: 0.50, F1-Score: 0.40
ROUGE-2: Precision: 0.00, Recall: 0.00, F1-Score: 0.00
ROUGE-L: Precision: 0.33, Recall: 0.50, F1-Score: 0.40
