In [43]:
# Installing essential libraries
!pip install transformers datasets



In [2]:
# Importing necessary libraries for data manipulation, NLP model handling, and training.

import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import torch

In [3]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load the datasets
reviews_df = pd.read_csv('/content/reviews_supplements.csv')
products_df = pd.read_csv('/content/product_asin.csv')

In [6]:
# Data Cleaning: Drop any missing or irrelevant data
reviews_df.dropna(subset=['asin', 'text'], inplace=True)

In [7]:
# Merge the datasets using the 'asin' column
merged_df = pd.merge(reviews_df, products_df, left_on='asin', right_on='parent_asin', how='left')

In [8]:
merged_df.columns

Index(['rating', 'title_x', 'text', 'asin', 'parent_asin_x', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'date', 'time', 'X',
       'title_y', 'parent_asin_y', 'categories', 'cat1', 'cat2', 'cat3',
       'cat4', 'cat5', 'cat6'],
      dtype='object')

In [9]:
# Filter columns to keep only necessary information
merged_df = merged_df[['title_x', 'text', 'rating', 'asin', 'categories', 'cat1', 'cat2', 'cat3',
       'cat4', 'cat5', 'cat6']]


In [10]:
# Rename columns for clarity
merged_df.columns = ['review_title', 'review_text', 'rating', 'asin', 'categories', 'cat1', 'cat2', 'cat3',
       'cat4', 'cat5', 'cat6']

In [12]:
# Loading the GPT-2 tokenizer and set the padding token to the end-of-sequence token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token =tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [13]:
# Defining a function to tokenize input examples, truncate, and pad them to a specified maximum length.

def tokenize_function(examples):
    tokens =tokenizer(
        examples['review_text'],
        truncation= True,
        padding ='max_length',
        max_length =128,
        return_tensors='pt'
    )

    tokens['labels']= tokens['input_ids']
    return tokens


In [14]:
# Converting a DataFrame to a Dataset, tokenize it, and set the format for PyTorch training.
dataset =Dataset.from_pandas(merged_df[['review_text']])
tokenized_dataset =dataset.map(tokenize_function , batched=True)
tokenized_dataset.set_format(type='torch',columns= ['input_ids', 'attention_mask','labels'])
print(tokenized_dataset)

Map:   0%|          | 0/16666 [00:00<?, ? examples/s]

Dataset({
    features: ['review_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 16666
})


In [15]:
# Spliting the tokenized dataset into training and validation sets, reserving 20% for testing.
train_test_split = tokenized_dataset.train_test_split(test_size= 0.2)
train_dataset= train_test_split['train']
val_dataset =train_test_split['test']
print(f"Training size:{len(train_dataset)}, Validation size: {len(val_dataset)}")

Training size:13332, Validation size: 3334


In [16]:
# Creating smaller training and validation datasets by shuffling and selecting 10% of each dataset.
import random
train_size =int(len(train_dataset) *0.1)
small_train_dataset= train_dataset.shuffle(seed= 42).select(range(train_size))
val_size =int(len(val_dataset) *0.1)
small_val_dataset= val_dataset.shuffle(seed= 42).select(range(val_size))
print(f"Reduced Training size: {len(small_train_dataset)},Reduced Validation size: {len(small_val_dataset)}")

Reduced Training size: 1333,Reduced Validation size: 333


In [17]:
# Seting up training arguments for the model, including output directory, batch sizes, and logging settings.
training_args = TrainingArguments(
    output_dir="/content/train/reviews",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy="no",
    save_steps=500,
    logging_dir="/content/train/logs",
    logging_steps=100,
    max_steps=1000,
)

In [18]:
model = GPT2LMHeadModel.from_pretrained("gpt2")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [19]:
# Initializing the Trainer object with the model, training arguments, and datasets for training and evaluation.
trainer = Trainer(
    model =model,
    args =training_args,
    train_dataset =small_train_dataset,
    eval_dataset= small_val_dataset,
)

max_steps is given, it will override any value given in num_train_epochs


In [20]:
# Starting the training process using the Trainer object configured with the specified model and datasets.
trainer.train()

Step,Training Loss
100,1.3921
200,0.9913
300,1.0182
400,0.9913
500,0.8983
600,0.9774
700,1.1784
800,1.0329
900,0.9043
1000,1.141


TrainOutput(global_step=1000, training_loss=1.052512939453125, metrics={'train_runtime': 155.4288, 'train_samples_per_second': 6.434, 'train_steps_per_second': 6.434, 'total_flos': 65323008000000.0, 'train_loss': 1.052512939453125, 'epoch': 0.7501875468867217})

In [21]:
# Saving the trained model and tokenizer to the specified directory for future use.
model.save_pretrained("/content/train/model-add")
tokenizer.save_pretrained("/content/train/model-add")
print("Model and tokenizer saved successfully.")

Model and tokenizer saved successfully.


In [22]:
# Defining a function to generate a review using the trained model and tokenizer based on a given prompt.

def generate_review(prompt, model, tokenizer, max_length=100):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs= model.generate(
        inputs,
        max_length= max_length,
        do_sample= True,
        top_k= 50,
        top_p= 0.95,
        num_return_sequences=1
    )
    return tokenizer.decode(outputs[0], skip_special_tokens= True)

In [23]:
# Defining a function to generate a review using the trained model and tokenizer based on a given prompt, with GPU support if available.
def generate_review(prompt, model, tokenizer, max_length=100):
    # Move the input tensors to the correct device (GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    attention_mask = torch.ones(inputs.shape, device=device)  # Create attention mask

    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,  # Use attention mask to handle padding
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id  # Set pad_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [42]:
# Generating synthetic reviews based on prompts from the training dataset, along with their corresponding categories and ratings.
synthetic_reviews = []
original_prompts = []
categories = []
cat1_list = []
cat2_list = []
cat3_list = []
cat4_list = []
cat5_list = []
cat6_list = []
ratings = []

# Generate 50 reviews with corresponding categories and cat1-cat6
for n in range(50):
    i=random.randint(1,len(small_train_dataset))-2
    prompt = small_train_dataset['review_text'][i][:50]
    generated_review = generate_review(prompt, model, tokenizer)

    synthetic_reviews.append(generated_review)
    original_prompts.append(prompt)
    categories.append(merged_df['categories'][i])  # Fetch corresponding category from merged_df
    cat1_list.append(merged_df['cat1'][i])
    cat2_list.append(merged_df['cat2'][i])
    cat3_list.append(merged_df['cat3'][i])
    cat4_list.append(merged_df['cat4'][i])
    cat5_list.append(merged_df['cat5'][i])
    cat6_list.append(merged_df['cat6'][i])
    ratings.append(random.randint(1, 5))

# Create DataFrame
synthetic_df = pd.DataFrame({
    "synthetic_review": synthetic_reviews,
    "original_prompt": original_prompts,
    "category": categories,
    "cat1": cat1_list,
    "cat2": cat2_list,
    "cat3": cat3_list,
    "cat4": cat4_list,
    "cat5": cat5_list,
    "cat6": cat6_list,
    "rating": ratings
})

# Save the synthetic reviews to a CSV file
synthetic_df.to_csv("synthetic_reviews2.csv", index=False)
print("Generated synthetic reviews saved to synthetic_reviews.csv!")

Generated synthetic reviews saved to synthetic_reviews.csv!
