# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import string
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from transformers import Trainer, TrainingArguments, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import spacy

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Load Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
reviews_df = pd.read_csv('/content/drive/MyDrive/Reviews.csv')

In [None]:
reviews_df = reviews_df.dropna(subset=['Summary'])

In [None]:
reviews_df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [None]:
reviews_df.shape

(568427, 10)

In [None]:
reviews_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

# Preprocess Data

In [None]:
def remove_html_tags(text):
    if isinstance(text, str):
        clean = re.compile('<.*?>')
        return re.sub(clean, '', text)
    else:
        return ''

In [None]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text, max_length, tokenizer):
    if text is None:
        return [tokenizer.pad_token_id] * max_length

    # Remove HTML tags and punctuation
    text = remove_html_tags(text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text using SpaCy
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Convert tokens to IDs using GPT2 tokenizer
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Replace None values with padding token ID
    input_ids = [token_id if token_id is not None else tokenizer.pad_token_id for token_id in input_ids]

    # Pad sequences to ensure equal length
    input_ids = input_ids[:max_length] + [tokenizer.pad_token_id] * (max_length - len(input_ids))

    return input_ids

In [None]:
reviews_df = reviews_df[['Text', 'Summary']]

# Reduce the Data size

In [None]:
data_size = int(0.05 * len(reviews_df))
reviews_df = reviews_df[:data_size]

In [None]:
reviews_df.shape

(28421, 2)

# Preprocess two columns

In [None]:
reviews_df = reviews_df.dropna(subset=['Text', 'Summary']).reset_index(drop=True)

max_length = 128  # Define the maximum sequence length
reviews_df['Text'] = reviews_df['Text'].apply(lambda x: preprocess_text(x, max_length, tokenizer))
reviews_df['Summary'] = reviews_df['Summary'].apply(lambda x: preprocess_text(x, max_length, tokenizer))

In [None]:
reviews_df['Text'] = reviews_df['Text'].apply(lambda vector: [token if token is not None else 0 for token in vector])
reviews_df['Summary'] = reviews_df['Summary'].apply(lambda vector: [token if token is not None else 0 for token in vector])

In [None]:
reviews_df.head()

Unnamed: 0,Text,Summary
0,"[40, 17846, 50256, 50256, 9703, 19425, 11167, ...","[10248, 35013, 32942, 24602, 0, 0, 0, 0, 0, 0,..."
1,"[11167, 50256, 18242, 50256, 50256, 50256, 502...","[50256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"[50256, 14145, 14792, 50256, 2971, 50256, 5025...","[50256, 16706, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,"[5460, 21078, 50256, 50256, 40, 50256, 40, 197...","[50256, 50256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
4,"[18223, 50256, 18223, 20888, 50256, 4421, 5025...","[18223, 50256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


# Train-Test Split

In [None]:
reviews_df = reviews_df.dropna()

In [None]:
epochs = 10
learning_rate = 5e-5
batch_size = 8

# Split the dataset into train and test sets
train_size = int(0.75 * len(reviews_df))
train_df = reviews_df[:train_size]
test_df = reviews_df[train_size:]

# Building Custom Dataset

In [None]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [None]:
class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.df.iloc[idx]['Text'])
        target_ids = torch.tensor(self.df.iloc[idx]['Summary'])

        return {'input_ids': input_ids, 'labels': target_ids}

In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Initialize Datasets & Dataloaders

In [None]:
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Fine-Tuning

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
for epoch in range(epochs):
    model.train()
    total_loss = 0

    # Training
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss for Epoch {epoch+1}: {avg_train_loss}')


    # Validation
    model.eval()
    total_val_loss = 0
    for batch in tqdm(test_dataloader, desc=f"Validation for Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, labels=labels)
            val_loss = outputs.loss

        total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(test_dataloader)
    print(f'Average validation loss for Epoch {epoch+1}: {avg_val_loss}')

    # Adjust learning rate
    scheduler.step()

Epoch 1/10: 100%|██████████| 2665/2665 [13:35<00:00,  3.27it/s]


Average training loss for Epoch 1: 0.09126111030662484


Validation for Epoch 1/10: 100%|██████████| 889/889 [01:18<00:00, 11.37it/s]


Average validation loss for Epoch 1: 0.08331164554666615


Epoch 2/10: 100%|██████████| 2665/2665 [13:38<00:00,  3.25it/s]


Average training loss for Epoch 2: 0.08105294716388937


Validation for Epoch 2/10: 100%|██████████| 889/889 [01:18<00:00, 11.38it/s]


Average validation loss for Epoch 2: 0.08256913413774511


Epoch 3/10: 100%|██████████| 2665/2665 [13:39<00:00,  3.25it/s]


Average training loss for Epoch 3: 0.08051862559136262


Validation for Epoch 3/10: 100%|██████████| 889/889 [01:18<00:00, 11.38it/s]


Average validation loss for Epoch 3: 0.08250562700205163


Epoch 4/10: 100%|██████████| 2665/2665 [13:38<00:00,  3.26it/s]


Average training loss for Epoch 4: 0.08045850456893221


Validation for Epoch 4/10: 100%|██████████| 889/889 [01:18<00:00, 11.37it/s]


Average validation loss for Epoch 4: 0.08248791420209999


Epoch 5/10: 100%|██████████| 2665/2665 [13:38<00:00,  3.26it/s]


Average training loss for Epoch 5: 0.08044497864862545


Validation for Epoch 5/10: 100%|██████████| 889/889 [01:17<00:00, 11.41it/s]


Average validation loss for Epoch 5: 0.082487632213548


Epoch 6/10: 100%|██████████| 2665/2665 [13:39<00:00,  3.25it/s]


Average training loss for Epoch 6: 0.08041611769731116


Validation for Epoch 6/10: 100%|██████████| 889/889 [01:18<00:00, 11.37it/s]


Average validation loss for Epoch 6: 0.08248762704256013


Epoch 7/10: 100%|██████████| 2665/2665 [13:38<00:00,  3.25it/s]


Average training loss for Epoch 7: 0.08042951565843921


Validation for Epoch 7/10: 100%|██████████| 889/889 [01:18<00:00, 11.37it/s]


Average validation loss for Epoch 7: 0.08248762718293946


Epoch 8/10: 100%|██████████| 2665/2665 [13:39<00:00,  3.25it/s]


Average training loss for Epoch 8: 0.0804716501273145


Validation for Epoch 8/10: 100%|██████████| 889/889 [01:18<00:00, 11.37it/s]


Average validation loss for Epoch 8: 0.08248762729817623


Epoch 9/10:  22%|██▏       | 589/2665 [03:01<10:37,  3.26it/s]

# Save the Model

In [None]:
torch.save(model.state_dict(), 'fine_tuned_model.pth')

# Load the Fine-Tuned Model

In [None]:
model_path = '/content/drive/MyDrive/fine_tuned_model.pth'
fine_tuned_model = GPT2LMHeadModel.from_pretrained('gpt2')
fine_tuned_model.load_state_dict(torch.load(model_path))

# Set the model to evaluation mode
fine_tuned_model.eval()

# Optionally, move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
fine_tuned_model.to(device)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=4511f9548903c9e1ab420becd81a98c514b3453631e8b72961045ffcf2101e3e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.model_max_length = 512  # Set max_length to its default value, e.g., 512
tokenizer.model_max_new_tokens = 512

# Average ROUGE Scores for Test set

In [None]:
from rouge_score import rouge_scorer

# Create a RougeScorer object
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

# Lists to store ROUGE scores
rouge1_f1_scores, rouge1_precision_scores, rouge1_recall_scores = [], [], []
rouge2_f1_scores, rouge2_precision_scores, rouge2_recall_scores = [], [], []
rougeL_f1_scores, rougeL_precision_scores, rougeL_recall_scores = [], [], []

model.eval()

# Evaluate model on test dataset
for batch in tqdm(test_dataloader, desc="Computing ROUGE scores"):
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)

    # Generate summaries using the model
    with torch.no_grad():
        outputs = fine_tuned_model.generate(input_ids.to(device), max_length=512, num_beams=4, early_stopping=True)

    # Convert tensor to list of strings
    predicted_summaries = tokenizer.batch_decode(outputs.cpu(), skip_special_tokens=True)
    actual_summaries = tokenizer.batch_decode(labels.cpu(), skip_special_tokens=True)

    # Calculate ROUGE scores for each example in the batch
    for predicted_summary, actual_summary in zip(predicted_summaries, actual_summaries):
        scores = scorer.score(predicted_summary.lower(), actual_summary.lower())

        # Extract f1, precision, and recall for each ROUGE score
        rouge1_f1_scores.append(scores['rouge1'].fmeasure)
        rouge1_precision_scores.append(scores['rouge1'].precision)
        rouge1_recall_scores.append(scores['rouge1'].recall)

        rouge2_f1_scores.append(scores['rouge2'].fmeasure)
        rouge2_precision_scores.append(scores['rouge2'].precision)
        rouge2_recall_scores.append(scores['rouge2'].recall)

        rougeL_f1_scores.append(scores['rougeL'].fmeasure)
        rougeL_precision_scores.append(scores['rougeL'].precision)
        rougeL_recall_scores.append(scores['rougeL'].recall)


# Compute average scores
rouge1_f1_avg = sum(rouge1_f1_scores) / len(rouge1_f1_scores)
rouge1_precision_avg = sum(rouge1_precision_scores) / len(rouge1_precision_scores)
rouge1_recall_avg = sum(rouge1_recall_scores) / len(rouge1_recall_scores)

rouge2_f1_avg = sum(rouge2_f1_scores) / len(rouge2_f1_scores)
rouge2_precision_avg = sum(rouge2_precision_scores) / len(rouge2_precision_scores)
rouge2_recall_avg = sum(rouge2_recall_scores) / len(rouge2_recall_scores)

rougeL_f1_avg = sum(rougeL_f1_scores) / len(rougeL_f1_scores)
rougeL_precision_avg = sum(rougeL_precision_scores) / len(rougeL_precision_scores)
rougeL_recall_avg = sum(rougeL_recall_scores) / len(rougeL_recall_scores)

# Display average scores
print("ROUGE-1:")
print(f"  F1: {rouge1_f1_avg:.4f}, Precision: {rouge1_precision_avg:.4f}, Recall: {rouge1_recall_avg:.4f}")

print("ROUGE-2:")
print(f"  F1: {rouge2_f1_avg:.4f}, Precision: {rouge2_precision_avg:.4f}, Recall: {rouge2_recall_avg:.4f}")

print("ROUGE-L:")
print(f"  F1: {rougeL_f1_avg:.4f}, Precision: {rougeL_precision_avg:.4f}, Recall: {rougeL_recall_avg:.4f}")


Computing ROUGE scores:   0%|          | 0/889 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Computing ROUGE scores:   0%|          | 1/889 [00:12<2:58:11, 12.04s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Computing ROUGE scores:   0%|          | 2/889 [00:23<2:53:41, 11.75s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Computing ROUGE scores:   0%|        

ROUGE-1:
  F1: 0.0003, Precision: 0.0003, Recall: 0.0003
ROUGE-2:
  F1: 0.0000, Precision: 0.0000, Recall: 0.0000
ROUGE-L:
  F1: 0.0003, Precision: 0.0003, Recall: 0.0003





# ROUGE Scores for one Test Example

In [None]:
from rouge import Rouge

# Initialize ROUGE
rouge = Rouge()

# Get test text from the user
test_text = "The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability."

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
input_ids = tokenizer.encode(test_text, return_tensors='pt').to(device)

# Generate summary for the input text
with torch.no_grad():
    output = fine_tuned_model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
generated_summary = tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Get reference summary from the user
reference_summary = "Good for beginners but has tuning stability issues."

# Calculate ROUGE scores
scores = rouge.get_scores(generated_summary, reference_summary)

# Extract and print only ROUGE-1, ROUGE-2, and ROUGE-L scores
print("ROUGE-1 Scores:")
print(scores[0]['rouge-1'])
print("ROUGE-2 Scores:")
print(scores[0]['rouge-2'])
print("ROUGE-L Scores:")
print(scores[0]['rouge-l'])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ROUGE-1 Scores:
{'r': 0.75, 'p': 0.17142857142857143, 'f': 0.2790697644131963}
ROUGE-2 Scores:
{'r': 0.2857142857142857, 'p': 0.05555555555555555, 'f': 0.09302325308815583}
ROUGE-L Scores:
{'r': 0.625, 'p': 0.14285714285714285, 'f': 0.2325581365062196}
