# Notebook: Automated Customer Review Analysis using Machine Learning

### Imports

In [16]:
import pandas as pd
import numpy as np
from pathlib import Path


### Load data

In [17]:
df1 = pd.read_csv("Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv", engine='python', on_bad_lines='skip')
df2 = pd.read_csv("1429_1.csv", engine='python', on_bad_lines='skip')
df3 = pd.read_csv("clean_reviews.csv", engine='python', on_bad_lines='skip')

#### Standardize columns

In [18]:
def standardize(df):
    df = df.rename(columns={
        "reviews.text": "review_text",
        "reviews.rating": "rating",
        "reviews.title": "title",
        "name": "product"
    })
    return df

df1 = standardize(df1)
df2 = standardize(df2)
df3 = standardize(df3)


### Remove n/a values

In [19]:
df1 = df1.dropna(subset=["review_text"])
df2 = df2.dropna(subset=["review_text"])
df3 = df3.dropna(subset=["review_text"])


### Keep Only usefull columns

In [20]:
df1 = df1[["review_text", "rating", "product"]]
df2 = df2[["review_text", "rating", "product"]]
df3 = df3[["review_text", "rating", "product"]]


### Unite Datasets

In [21]:
full_df = pd.concat([df1, df2, df3], ignore_index=True)


### Remove n/a values

In [22]:
full_df = full_df.dropna(subset=["rating"])


### Create sentiment label

In [23]:
def rating_to_sentiment(r):
    if r >= 4:
        return "positive"
    elif r == 3:
        return "neutral"
    else:
        return "negative"

full_df["sentiment"] = full_df["rating"].apply(rating_to_sentiment)


In [24]:
full_df.head()


Unnamed: 0,review_text,rating,product,sentiment
0,I order 3 of them and one of the item is bad q...,3.0,AmazonBasics AAA Performance Alkaline Batterie...,neutral
1,Bulk is always the less expensive way to go fo...,4.0,AmazonBasics AAA Performance Alkaline Batterie...,positive
2,Well they are not Duracell but for the price i...,5.0,AmazonBasics AAA Performance Alkaline Batterie...,positive
3,Seem to work as well as name brand batteries a...,5.0,AmazonBasics AAA Performance Alkaline Batterie...,positive
4,These batteries are very long lasting the pric...,5.0,AmazonBasics AAA Performance Alkaline Batterie...,positive


### Save clean dataset

In [25]:
full_df.to_csv("clean_reviews.csv", index=False)


In [26]:
from datasets import Dataset
dataset = Dataset.from_pandas(full_df)


In [27]:
def tokenize_function(example):
    return tokenizer(example["review_text"], truncation=True)


In [28]:
from transformers import AutoTokenizer

# Define model_name and tokenizer before use
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenized = dataset.map(tokenize_function)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Map:   0%|          | 0/688832 [00:00<?, ? examples/s]

## 1 Classifier: Sentiment Classification Model

### We now train a classification model to predict sentiment from product reviwes using supervised learning.

### Step 1 TF_IDF

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(full_df["review_text"])
y = full_df["sentiment"]


### Step 2 Train-Test Split

In [30]:
vectorizer.fit_transform(full_df["review_text"])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26978058 stored elements and shape (688832, 5000)>

In [31]:
from sklearn.model_selection import train_test_split

X_train_text, X_test_text, y_train, y_test = train_test_split(
    full_df["review_text"],
    full_df["sentiment"],
    test_size=0.2,
    random_state=42
)


In [32]:
vectorizer.fit(X_train_text)
X_train = vectorizer.transform(X_train_text)
X_test = vectorizer.transform(X_test_text)


### Train Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


### Evaluate Model


In [34]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.67      0.49      0.57      8046
     neutral       0.49      0.20      0.28      9585
    positive       0.92      0.98      0.95    120136

    accuracy                           0.90    137767
   macro avg       0.69      0.56      0.60    137767
weighted avg       0.88      0.90      0.88    137767



# 2 Clustering
 ## We now apply clustering to discover common themes in product reviews.Unlike classification, clustering groups reviews based on simlarity.


#### Step 1 TF_IDF

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

cluster_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english"
)

X_cluster = cluster_vectorizer.fit_transform(full_df["review_text"])


## We group reviews into clusters based on similarity (K-Means Clustering)

#### Step 2 KMeans

In [36]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=42)

clusters = kmeans.fit_predict(X_cluster)

full_df["cluster"] = clusters


#### Cluster Distribution

In [37]:
full_df["cluster"].value_counts()


Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
2,311887
0,227736
3,82052
1,39207
4,27950


### Step 3 Top Words

In [38]:
import numpy as np

terms = cluster_vectorizer.get_feature_names_out()

for i in range(5):
    center = kmeans.cluster_centers_[i]
    top_words = [terms[ind] for ind in center.argsort()[-10:]]
    print(f"Cluster {i}: ", top_words)


Cluster 0:  ['reading', 'series', 'really', 'great', 'like', 'books', 'story', 'good', 'read', 'book']
Cluster 1:  ['kindle', 'series', 'use', 'tablet', 'easy', 'great', 'books', 'kids', 'book', 'love']
Cluster 2:  ['batteries', 'gift', 'price', 'loves', 'easy', 'bought', 'use', 'tablet', 'great', 'good']
Cluster 3:  ['books', 'really', 'characters', '34', 'just', 'like', 'read', 'story', 'book', 'br']
Cluster 4:  ['works', 'gift', 'tablet', 'value', 'batteries', 'read', 'product', 'price', 'book', 'great']


# 3 Review Summarization Using Generative AI

#### Step 1 Model Setup

In [39]:
pip install transformers accelerate bitsandbytes peft sentencepiece


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl (60.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.2


#### Step 2 Load TinyLlama

In [40]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name
)





model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

#### Step 3 Prepare training format

In [41]:
def build_prompt(row):
    return f"""
[INST]
You are a professional tech reviewer.

Product Category: {row['cluster']}
Customer Sentiment: {row['sentiment']}

Write a short blog-style product review summary.
Mention strengths, weaknesses and ideal buyer.
[/INST]
"""


#### Step 4 Format Training

In [42]:
gen_df = full_df[["review_text", "cluster", "sentiment"]].copy()

gen_df = gen_df.rename(columns={
    "review_text": "review"
})
#Copy of the original datasets to work on a separate version

In [43]:
gen_df["text"] = gen_df.apply(
    lambda row: build_prompt(row) + row["review"],
    axis=1
)


#### Step 5 TOKENIZE


In [44]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)


In [45]:
from datasets import Dataset

dataset = Dataset.from_pandas(gen_df[["text"]])
gen_df = gen_df.sample(2000, random_state=42)

def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized = dataset.map(tokenize)


Map:   0%|          | 0/688832 [00:00<?, ? examples/s]

#### Step 6 LoRA FINE-TUNING

In [46]:
!pip install peft




#### Step 7 Apply LoRA

In [47]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=2,
    lora_alpha=4,
    target_modules=["q_proj"],
    lora_dropout=0.01,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)


#### Step 8 TRAIN

In [49]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch

training_args = TrainingArguments(
    output_dir="./mistral_reviews",
    per_device_train_batch_size=8,
    num_train_epochs=0.2,
    logging_steps=50,
    fp16=True,
    max_steps=500
)


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator
)

trainer.train()


Step,Training Loss
50,1.547852
100,1.557379
150,1.47454
200,1.516143
250,1.546038
300,1.536486
350,1.489878
400,1.496431
450,1.59456
500,1.535012


TrainOutput(global_step=500, training_loss=1.529431884765625, metrics={'train_runtime': 733.3751, 'train_samples_per_second': 5.454, 'train_steps_per_second': 0.682, 'total_flos': 1.2714302767104e+16, 'train_loss': 1.529431884765625, 'epoch': 0.00580693115302425})

#### Step 9 N-SHOT PROMPT

In [50]:
def nshot_prompt(cluster):

    examples = gen_df.sample(3)

    shots = ""
    for _, row in examples.iterrows():
        shots += f"""
Category: {row['cluster']}
Sentiment: {row['sentiment']}
Review: {row['review']}
"""

    return f"""
[INST]
You are a professional product reviewer.

Examples:
{shots}

Now write a blog-style recommendation summary for Category {cluster}.
Include:
- best product type
- who should buy it
- common complaints
[/INST]
"""


## Step 10 Now, we generate our fine-tuned model!

In [51]:
def generate_review(cluster):

    prompt = nshot_prompt(cluster)

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=200
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


#### TEST

In [52]:

print(generate_review(1))



[INST]
You are a professional product reviewer.

Examples:

Category: 2
Sentiment: positive
Review: I like that it has parental control and a curfew on it, what I don't like is the charger it seemed to have a short in it or doesn't charge fast it's a slow charge but she loves it and it does way more then I expected. She is able to download books she can't use the internet because of the child mode it's a great tablet for children and adults

Category: 3
Sentiment: positive
Review: This is definitely the kind of story I find hard to finish, but harder to put down. Suspense galore, with obvious mystery, intrigue, and, yes, even horror. It's the kind of story that feeds the imagination, even though you may not want it to.<br /><br />I know the author, and really wanted to give it 5 stars. I had to subtract one, though, for 2 reasons. First, I'm not a big fan of sex scenes, even when applied as well as Chad has done here. The other reason is that the story needed some extra time in editin