# Notebook: Automated Customer Review Analysis using Machine Learning

### Imports

In [6]:
import pandas as pd
import numpy as np
from pathlib import Path


### Load data

In [8]:
df1 = pd.read_csv("Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv")
df2 = pd.read_csv("1429_1.csv")
df3 = pd.read_csv("clean_reviews.csv")


  df2 = pd.read_csv("1429_1.csv")


#### Standardize columns

In [11]:
def standardize(df):
    df = df.rename(columns={
        "reviews.text": "review_text",
        "reviews.rating": "rating",
        "reviews.title": "title",
        "name": "product"
    })
    return df

df1 = standardize(df1)
df2 = standardize(df2)
df3 = standardize(df3)


### Remove n/a values

In [12]:
df1 = df1.dropna(subset=["review_text"])
df2 = df2.dropna(subset=["review_text"])
df3 = df3.dropna(subset=["review_text"])


### Keep Only usefull columns

In [13]:
df1 = df1[["review_text", "rating", "product"]]
df2 = df2[["review_text", "rating", "product"]]
df3 = df3[["review_text", "rating", "product"]]


### Unite Datasets

In [14]:
full_df = pd.concat([df1, df2, df3], ignore_index=True)


### Remove n/a values

In [16]:
full_df = full_df.dropna(subset=["rating"])


### Create sentiment label

In [17]:
def rating_to_sentiment(r):
    if r >= 4:
        return "positive"
    elif r == 3:
        return "neutral"
    else:
        return "negative"

full_df["sentiment"] = full_df["rating"].apply(rating_to_sentiment)


In [19]:
full_df.head()


Unnamed: 0,review_text,rating,product,sentiment
0,I order 3 of them and one of the item is bad q...,3.0,AmazonBasics AAA Performance Alkaline Batterie...,neutral
1,Bulk is always the less expensive way to go fo...,4.0,AmazonBasics AAA Performance Alkaline Batterie...,positive
2,Well they are not Duracell but for the price i...,5.0,AmazonBasics AAA Performance Alkaline Batterie...,positive
3,Seem to work as well as name brand batteries a...,5.0,AmazonBasics AAA Performance Alkaline Batterie...,positive
4,These batteries are very long lasting the pric...,5.0,AmazonBasics AAA Performance Alkaline Batterie...,positive


### Save clean dataset

In [20]:
full_df.to_csv("clean_reviews.csv", index=False)


## 1 Classifier: Sentiment Classification Model 

### We now train a classification model to predict sentiment from product reviwes using supervised learning.

### Step 1 TF_IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(full_df["review_text"])
y = full_df["sentiment"]


### Step 2 Train-Test Split

In [37]:
vectorizer.fit_transform(full_df["review_text"])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23041703 stored elements and shape (500000, 5000)>

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [39]:
X_train_text, X_test_text = train_test_split(
    full_df["review_text"],
    test_size=0.2,
    random_state=42
)


In [40]:
vectorizer.fit(X_train_text)
X_train = vectorizer.transform(X_train_text)
X_test = vectorizer.transform(X_test_text)


### Train Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


### Evaluate Model


In [42]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.67      0.50      0.57      6610
     neutral       0.48      0.23      0.31      7971
    positive       0.92      0.98      0.95     85419

    accuracy                           0.89    100000
   macro avg       0.69      0.57      0.61    100000
weighted avg       0.87      0.89      0.87    100000



# 2 Clustering
 ## We now apply clustering to discover common themes in product reviews.Unlike classification, clustering groups reviews based on simlarity.


#### Step 1 TF_IDF

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

cluster_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english"
)

X_cluster = cluster_vectorizer.fit_transform(full_df["review_text"])


## We group reviews into clusters based on similarity (K-Means Clustering)

#### Step 2 KMeans

In [44]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=42)

clusters = kmeans.fit_predict(X_cluster)

full_df["cluster"] = clusters


#### Cluster Distribution

In [45]:
full_df["cluster"].value_counts()


cluster
0    374050
4     83276
2     16154
3     15969
1     10551
Name: count, dtype: int64

### Step 3 Top Words

In [46]:
import numpy as np

terms = cluster_vectorizer.get_feature_names_out()

for i in range(5):
    center = kmeans.cluster_centers_[i]
    top_words = [terms[ind] for ind in center.argsort()[-10:]]
    print(f"Cluster {i}: ", top_words)


Cluster 0:  ['really', 'just', 'loved', 'like', 'good', 'books', 'story', 'great', 'read', 'book']
Cluster 1:  ['series', 'information', 'quality', 'condition', 'reading', 'story', 'product', 'read', 'book', 'good']
Cluster 2:  ['kids', 'condition', 'price', 'series', 'gift', 'story', 'product', 'read', 'book', 'great']
Cluster 3:  ['author', 'coloring', 'story', 'read', 'great', 'series', 'kids', 'books', 'book', 'love']
Cluster 4:  ['books', 'really', 'characters', '34', 'just', 'like', 'read', 'story', 'book', 'br']


# 3 Review Summarization Using Generative AI

#### Step 1 Model Setup

In [47]:
pip install transformers accelerate bitsandbytes peft sentencepiece


Note: you may need to restart the kernel to use updated packages.


#### Step 2 Load TinyLlama

In [49]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name
)





Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

#### Step 3 Prepare training format

In [51]:
def build_prompt(row):
    return f"""
[INST]
You are a professional tech reviewer.

Product Category: {row['cluster']}
Customer Sentiment: {row['sentiment']}

Write a short blog-style product review summary.
Mention strengths, weaknesses and ideal buyer.
[/INST]
"""


#### Step 4 Format Training

In [62]:
gen_df = full_df[["review_text", "cluster", "sentiment"]].copy()

gen_df = gen_df.rename(columns={
    "review_text": "review"
})
#Copy of the original datasets to work on a separate version

In [64]:
gen_df["text"] = gen_df.apply(
    lambda row: build_prompt(row) + row["review"],
    axis=1
)


#### Step 5 TOKENIZE


In [65]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)


In [66]:
from datasets import Dataset

dataset = Dataset.from_pandas(gen_df[["text"]])

def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized = dataset.map(tokenize)


Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

#### Step 6 LoRA FINE-TUNING

In [67]:
!pip install peft




#### Step 7 Apply LoRA

In [68]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)


#### Step 8 TRAIN

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./mistral_reviews",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_steps=20,
    use_cpu=True,
    fp16=False,
    bf16=False
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator
)

trainer.train()


Step,Training Loss
20,3.247205
40,2.971854
60,2.601298
80,2.055666


#### Step 9 N-SHOT PROMPT

In [None]:
def nshot_prompt(cluster):

    examples = gen_df.sample(3)

    shots = ""
    for _, row in examples.iterrows():
        shots += f"""
Category: {row['cluster']}
Sentiment: {row['sentiment']}
Review: {row['review']}
"""

    return f"""
[INST]
You are a professional product reviewer.

Examples:
{shots}

Now write a blog-style recommendation summary for Category {cluster}.
Include:
- best product type
- who should buy it
- common complaints
[/INST]
"""


## Step 10 Now, we generate our fine-tuned model!

In [None]:
def generate_review(cluster):

    prompt = nshot_prompt(cluster)

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=200
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


#### TEST

In [None]:

print(generate_review(1))
