In [23]:
!pip install transformers



In [2]:
import re
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch.optim as optim

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [4]:
reviews = pd.read_csv('archive/Reviews.csv')
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
reviews.Text.values[:5]


array(['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
       'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
       'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
    

In [6]:
reviews.Summary.values[:5]

array(['Good Quality Dog Food', 'Not as Advertised',
       '"Delight" says it all', 'Cough Medicine', 'Great taffy'],
      dtype=object)

In [7]:
from sklearn.model_selection import train_test_split
reviews = reviews.sample(frac=0.01, random_state=42)
reviews, test_reviews = train_test_split(reviews, test_size=0.25, random_state=42)

In [8]:
len(reviews)


4263

In [9]:
reviews['model_input'] = reviews['Text'] + " TL;DR " + reviews['Summary']
reviews['model_input'].values[:3]

array(['This flavor tastes very good. Great product!!! I subscribed to receive one box every 6 months, very convenient and great service. TL;DR Great product, tastes good',
       "I liked the Planter's snack bars.  Basically it is a larger and higher calorie version of a regular granola bar with the added joy of having fat planter's peanuts on top.<br /><br />As other have said, it is not a diet bar and is probably a bit messier than your average granola bar which are always nice to have in the car.  However it is tasty and sweet and depending on what you are looking for in a granola bar (did I just type that) this should satisfy both your sweet and salty cravings. TL;DR Excellent Protein Snack",
       "Nowhere in the description does it mention anything about this coffee being flavored. I'm still not sure what flavor it is. Some type of nut. I wrongly assumed that Jamaican coffee would be similar to the Blue Mountain coffee which Jamaica is known for! How silly of me to assume that!

In [10]:
avg_length = sum([len(review.split()) for review in reviews.model_input.values])/len(reviews)
avg_length

84.54351395730706

In [11]:
max_length = 100

In [12]:
reviews = reviews.sample(100)
reviews = reviews.model_input.values.tolist()
len(reviews)

100

In [13]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained("gpt2")



In [14]:
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

In [15]:
tokenizer.encode(" TL;DR ")

[24811, 26, 7707, 220]

In [16]:
extra_length = len(tokenizer.encode(" TL;DR "))


In [17]:
class ReviewDataset(Dataset):  
    def __init__(self, tokenizer, reviews, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        for review in self.reviews:
            # Encode the text using tokenizer.encode(). We add EOS at the end
            tokenized = self.tokenizer.encode(review + self.eos)
            
            # Padding/truncating the encoded sequence to max_len 
            padded = self.pad_truncate(tokenized)            

            # Creating a tensor and adding to the result
            self.result.append(torch.tensor(padded))

    def __len__(self):
        return len(self.result)


    def __getitem__(self, item):
        return self.result[item]

    def pad_truncate(self, name):
        name_length = len(name) - extra_length
        if name_length < self.max_len:
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            result = name[:self.max_len + 3]+[self.eos_id] 
        else:
            result = name
        return result

In [18]:
dataset = ReviewDataset(tokenizer, reviews, max_length)


In [19]:
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, drop_last=True)


In [20]:
def train(model, optimizer, dl, epochs):    
    for epoch in range(epochs):
        for idx, batch in enumerate(dl):
             with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                batch = batch.to(device)
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()
                if idx % 100 == 0:
                    print("loss: %f, %d"%(loss, idx))

In [21]:
train(model=model, optimizer=optimizer, dl=dataloader, epochs=1)

loss: 5.044612, 0


In [22]:
def topk(probs, n=9):
    # The scores are initially softmaxed to convert to probabilities
    probs = torch.softmax(probs, dim= -1)
    
    # PyTorch has its own topk method, which we use here
    tokensProb, topIx = torch.topk(probs, k=n)
    
    # The new selection pool (9 choices) is normalized
    tokensProb = tokensProb / torch.sum(tokensProb)

    # Send to CPU for numpy handling
    tokensProb = tokensProb.cpu().detach().numpy()

    # Make a random choice from the pool based on the new prob distribution
    choice = np.random.choice(n, 1, p = tokensProb)
    tokenId = topIx[choice][0]

    return int(tokenId)

In [23]:
def model_infer(model, tokenizer, review, max_length=15):
    # Preprocess the init token (task designator)
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.set_grad_enabled(False):
        # Feed the init token to the model
        output = model(initial_input)

        # Flatten the logits at the final time step
        logits = output.logits[0,-1]

        # Make a top-k choice and append to the result
        result.append(topk(logits))

        # For max_length times:
        for _ in range(max_length):
            # Feed the current sequence to the model and make a choice
            input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0,-1]
            res_id = topk(logits)

            # If the chosen token is EOS, return the result
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else: # Append to the sequence 
                result.append(res_id)
    # IF no EOS is generated, return after the max_len
    return tokenizer.decode(result)

In [24]:
sample_reviews = [review.split(" TL;DR ")[0] for review in random.sample(reviews, 5)]
sample_reviews

["I use it to put in a protein shake. One scoop that's all it takes. Less calories than others. It is very reasonable ! I will buy again !",
 "I like drinking something other than sports drinks to get my electrolytes in after a workout, so I don't mind that this is higher in sugar and calories (110, I believe) than the other Zico waters.<br /><br />Even if you don't like the taste of regular coconut water, you should try this! It makes for a great dairy-free milk (in fact, I would compare the taste to chocolate almond milk).",
 'These wafer have a very bright lemony filling and dont taste like they are gluten free cookies. I have to keep them away from my kids!',
 'I ordered 18 bags of York Pieces for wedding favors, half of the bags were melted and Amazon has a no return policy on food items. I would not order food products from a online vendor again.',
 'This is an excellent bottled tea. It has no sugar at all but it has a great taste and the jasmine seems to give it a natural sweet 

In [25]:
for review in sample_reviews:
    print(review)
    summary = model_infer(model, tokenizer, review + " TL;DR ").split(" TL;DR ")[1].strip()
    print("Summaries: "+ str(summary) +"\n")

I use it to put in a protein shake. One scoop that's all it takes. Less calories than others. It is very reasonable ! I will buy again !
Summaries: Good product!

I like drinking something other than sports drinks to get my electrolytes in after a workout, so I don't mind that this is higher in sugar and calories (110, I believe) than the other Zico waters.<br /><br />Even if you don't like the taste of regular coconut water, you should try this! It makes for a great dairy-free milk (in fact, I would compare the taste to chocolate almond milk).
Summaries: 

These wafer have a very bright lemony filling and dont taste like they are gluten free cookies. I have to keep them away from my kids!
Summaries: This has a good gluten free cookie

I ordered 18 bags of York Pieces for wedding favors, half of the bags were melted and Amazon has a no return policy on food items. I would not order food products from a online vendor again.
Summaries: <|endoftext|>

This is an excellent bottled tea. It 

In [26]:
test_reviews = test_reviews.sample(10)
print(len(test_reviews))

10


In [27]:
from rouge_score import rouge_scorer

# Define a function to calculate ROUGE scores
def calculate_rouge_scores(model, tokenizer, test_reviews):
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {"rouge1": {"precision": 0, "recall": 0, "fmeasure": 0},
                    "rouge2": {"precision": 0, "recall": 0, "fmeasure": 0},
                    "rougeL": {"precision": 0, "recall": 0, "fmeasure": 0}}

    for idx, row in test_reviews.iterrows():
        review_text = row['Text']
        actual_summary = row['Summary']
        generated_summary = model_infer(model, tokenizer, review_text + " TL;DR ").split(" TL;DR ")[1].strip()
        scores = rouge_scorer_instance.score(generated_summary, actual_summary)
        for metric, values in scores.items():
            rouge_scores[metric]["precision"] += values.precision
            rouge_scores[metric]["recall"] += values.recall
            rouge_scores[metric]["fmeasure"] += values.fmeasure

    num_reviews = len(test_reviews)
    for metric in rouge_scores:
        rouge_scores[metric]["precision"] /= num_reviews
        rouge_scores[metric]["recall"] /= num_reviews
        rouge_scores[metric]["fmeasure"] /= num_reviews

    return rouge_scores

# Calculate ROUGE scores for the test_reviews DataFrame
rouge_scores = calculate_rouge_scores(model, tokenizer, test_reviews)
print("ROUGE Scores:")
for metric, scores in rouge_scores.items():
    print(f"{metric}: Precision: {scores['precision']}, Recall: {scores['recall']}, F1-Score: {scores['fmeasure']}")


ROUGE Scores:
rouge1: Precision: 0.06666666666666667, Recall: 0.04, F1-Score: 0.05
rouge2: Precision: 0.05, Recall: 0.025, F1-Score: 0.03333333333333333
rougeL: Precision: 0.06666666666666667, Recall: 0.04, F1-Score: 0.05
