## 1. Data Loading and Initial Exploration

In [1]:
import pandas as pd

file_path = r"C:\Users\Hi\Downloads\PYTHON\FINAL PROJECT\aspect-based-sentiment-analysis\Dataset.csv"

df = pd.read_csv(
    file_path,
    encoding="latin1",
    low_memory=False
)


In [2]:
print(df.head())
print(df.columns)
print(df.info())
print(df.isnull().sum())


                                        Product_name      Price Rate  \
0  Crompton 75 L Desert Air Cooler??(White, Tea...  ??10,499    5   
1  Crompton 75 L Desert Air Cooler??(White, Tea...  ??10,499    4   
2  Crompton 75 L Desert Air Cooler??(White, Tea...  ??10,499    5   
3  Crompton 75 L Desert Air Cooler??(White, Tea...  ??10,499    5   
4  Crompton 75 L Desert Air Cooler??(White, Tea...  ??10,499    4   

                                              Review  \
0                                     Simply awesome   
1  Worth the money . Desert Cooler live up to the...   
2                                  Worth every penny   
3                                          Fabulous!   
4                                       Nice product   

                                             Summary  
0  it's really worth every single penny. it works...  
1  I bought Crompton Ozone 75 Desert Air Cooler i...  
2  GREAT packaging by seller. As this was the mos...  
3  Deliver

## 2. Data Cleaning and Preparation


In [3]:
df.columns = df.columns.str.strip()  #Remove space in column names

df["Rate"] = pd.to_numeric(df["Rate"], errors="coerce") #Convert Rate to Numeric 
df = df.dropna(subset=["Rate"]) #Reomve row with missing rating

df["text"] = df["Review"].fillna("") + " " + df["Summary"].fillna("") #Combine Review + Summary 


df = df[["Product_name", "text", "Rate"]].copy() 

print(df.head())


                                        Product_name  \
0  Crompton 75 L Desert Air Cooler??(White, Tea...   
1  Crompton 75 L Desert Air Cooler??(White, Tea...   
2  Crompton 75 L Desert Air Cooler??(White, Tea...   
3  Crompton 75 L Desert Air Cooler??(White, Tea...   
4  Crompton 75 L Desert Air Cooler??(White, Tea...   

                                                text  Rate  
0  Simply awesome it's really worth every single ...   5.0  
1  Worth the money . Desert Cooler live up to the...   4.0  
2  Worth every penny GREAT packaging by seller. A...   5.0  
3  Fabulous! Delivery was delayed by two days exc...   5.0  
4  Nice product A Good cooler by Crompton. The he...   4.0  


In [4]:
df["Rate"].value_counts().sort_index()


Rate
1.0     40077
2.0     12980
3.0     32098
4.0     74252
5.0    203832
Name: count, dtype: int64

## 3. Sentiment Label Creation


In [5]:
def map_sentiment(r):
    if r <= 2:
        return "Negative"
    elif r == 3:
        return "Neutral"
    else:
        return "Positive"

df["Sentiment"] = df["Rate"].apply(map_sentiment)


In [6]:
df["Sentiment"].value_counts()


Sentiment
Positive    278084
Negative     53057
Neutral      32098
Name: count, dtype: int64

## 4. Text Preprocessing (NLP)



In [7]:
import re
import nltk


In [8]:
##Download NLP Resources

In [9]:
nltk.download("stopwords")
nltk.download("wordnet")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
#Setup Tools
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


In [11]:
#Create Cleaning Function
def clean_text(text):
    text = text.lower()                          # lowercase
    text = re.sub(r"[^a-zA-Z]", " ", text)      # remove symbols & numbers
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)



In [12]:
df["clean_text"] = df["text"].apply(clean_text)


In [13]:
df[["text","clean_text"]].head()


Unnamed: 0,text,clean_text
0,Simply awesome it's really worth every single ...,simply awesome really worth every single penny...
1,Worth the money . Desert Cooler live up to the...,worth money desert cooler live name bought cro...
2,Worth every penny GREAT packaging by seller. A...,worth every penny great packaging seller impor...
3,Fabulous! Delivery was delayed by two days exc...,fabulous delivery delayed two day except every...
4,Nice product A Good cooler by Crompton. The he...,nice product good cooler crompton height coole...


## 5. Feature Extraction using TF-IDF


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [15]:
#Create TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)


In [16]:
#Transform Text → Numbers
X = tfidf.fit_transform(df["clean_text"])


In [17]:
#Prepare Labels
y = df["Sentiment"]


In [18]:
print(X.shape)


(363239, 5000)


In [19]:
#train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [20]:
print(X_train.shape)
print(X_test.shape)


(290591, 5000)
(72648, 5000)


## 7. Model 1 - Logistic Regression (Baseline)


In [21]:
from sklearn.linear_model import LogisticRegression


In [22]:
#Initialize Model
lr_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000
)


In [23]:
#Train Model
lr_model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [24]:
#Make Predictions
y_pred = lr_model.predict(X_test)


In [25]:
#Evaluate Model
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9482573505120582
              precision    recall  f1-score   support

    Negative       0.95      0.96      0.96     10611
     Neutral       0.67      0.93      0.78      6420
    Positive       0.99      0.95      0.97     55617

    accuracy                           0.95     72648
   macro avg       0.87      0.95      0.90     72648
weighted avg       0.96      0.95      0.95     72648



In [26]:
## 8. Model 2 - LSTM (Deep Learning)


In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [28]:
#Convert Sentiment Labels to Numbers
label_map = {"Negative":0, "Neutral":1, "Positive":2}
df["label"] = df["Sentiment"].map(label_map)


In [29]:
df[["Sentiment","label"]].head()


Unnamed: 0,Sentiment,label
0,Positive,2
1,Positive,2
2,Positive,2
3,Positive,2
4,Positive,2


In [30]:
#Tokenization (Text → Integers)
max_words = 5000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df["clean_text"])


In [31]:
#Convert Text to Sequences
sequences = tokenizer.texts_to_sequences(df["clean_text"])


In [32]:
#Padding
max_len = 100

X_seq = pad_sequences(sequences, maxlen=max_len)


In [33]:
#Prepare Labels
y_seq = to_categorical(df["label"], num_classes=3)


In [34]:
#Train/Test Split (for LSTM)
from sklearn.model_selection import train_test_split

X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(
    X_seq, y_seq,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)


In [35]:
#Check Shapes
print(X_train_seq.shape)
print(X_test_seq.shape)


(290591, 100)
(72648, 100)


## 9. LSTM Model Architecture


In [36]:
#Import Required Layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


In [37]:
#from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()

# Embedding Layer (define input_length here safely)
model.add(Embedding(
    input_dim=5000,
    output_dim=128,
    input_shape=(100,)
))

# LSTM Layer
model.add(LSTM(64))

# Dropout
model.add(Dropout(0.5))

# Output Layer
model.add(Dense(3, activation="softmax"))



  super().__init__(**kwargs)


In [38]:
#Compile Model
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)


In [39]:
model.summary()


## 10. Training the LSTM Model

In [40]:
# Train the LSTM model

history = model.fit(
    X_train_seq,
    y_train_seq,
    epochs=2,
    batch_size=256,
    validation_split=0.1
)

Epoch 1/2
[1m1022/1022[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 152ms/step - accuracy: 0.9597 - loss: 0.1203 - val_accuracy: 0.9779 - val_loss: 0.0580
Epoch 2/2
[1m1022/1022[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 148ms/step - accuracy: 0.9797 - loss: 0.0616 - val_accuracy: 0.9805 - val_loss: 0.0569


## 11. Evaluating LSTM Model Performance

In [41]:
# Evaluate on test data

loss, accuracy = model.evaluate(X_test_seq, y_test_seq)

print("Test Accuracy:", accuracy)

[1m2271/2271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 14ms/step - accuracy: 0.9799 - loss: 0.0622
Test Accuracy: 0.9799031019210815


## 12. Model 3 - Transformer (DistilBERT)

In [42]:
from transformers import pipeline

In [43]:
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)




Device set to use cpu


## 13. Testing DistilBERT on Flipkart Reviews

In [44]:
sample_texts = df["text"].iloc[:10].tolist()

results = sentiment_pipeline(sample_texts)

for text, result in zip(sample_texts, results):
    print("Review:", text[:100])
    print("Prediction:", result)
    print("-" * 50)

Review: Simply awesome it's really worth every single penny. it works like one ton AC provided that your roo
Prediction: {'label': 'POSITIVE', 'score': 0.9995860457420349}
--------------------------------------------------
Review: Worth the money . Desert Cooler live up to the Name . I bought Crompton Ozone 75 Desert Air Cooler i
Prediction: {'label': 'POSITIVE', 'score': 0.9997459053993225}
--------------------------------------------------
Review: Worth every penny GREAT packaging by seller. As this was the most important point while transportati
Prediction: {'label': 'POSITIVE', 'score': 0.9995025396347046}
--------------------------------------------------
Review: Fabulous! Delivery was delayed by two days except this everything is quite fine.safely packed.air fl
Prediction: {'label': 'POSITIVE', 'score': 0.9982490539550781}
--------------------------------------------------
Review: Nice product A Good cooler by Crompton. The height of the cooler is about 3ft 10 inch.  color is nic

## 14. Evaluating DistilBERT on Binary Sentiment (Subset)

Since the pretrained DistilBERT (SST-2) model is a binary classifier,
we evaluate it on Positive and Negative reviews only.

Neutral reviews are retained for 3-class models (Logistic Regression and LSTM),
but excluded here to ensure fair evaluation.

In [45]:
#Create Binary Dataset
# Keep only Positive and Negative
binary_df = df[df["Sentiment"] != "Neutral"].copy()

binary_df["binary_label"] = binary_df["Sentiment"].map({
    "Negative": "NEGATIVE",
    "Positive": "POSITIVE"
})

binary_df["binary_label"].value_counts()

binary_label
POSITIVE    278084
NEGATIVE     53057
Name: count, dtype: int64

## 15. Creating Balanced Evaluation Subset

To ensure fair evaluation, we create a balanced subset 
with equal Positive and Negative samples.

In [46]:
# Sample equal number of Positive and Negative reviews (10,000 each)

pos_sample = binary_df[binary_df["binary_label"] == "POSITIVE"].sample(10000, random_state=42)
neg_sample = binary_df[binary_df["binary_label"] == "NEGATIVE"].sample(10000, random_state=42)

balanced_df = pd.concat([pos_sample, neg_sample]).sample(frac=1, random_state=42)

balanced_df["binary_label"].value_counts()

binary_label
NEGATIVE    10000
POSITIVE    10000
Name: count, dtype: int64

## 16. Running DistilBERT Inference on Balanced Dataset

We perform batch inference using the pretrained DistilBERT model
on a balanced subset of 20,000 reviews.

In [47]:
# Convert texts to list
texts = balanced_df["text"].tolist()

# Run batched predictions (batch_size helps speed)
predictions = sentiment_pipeline(
    texts,
    batch_size=32,
    truncation=True
)

# Extract predicted labels
pred_labels = [pred["label"] for pred in predictions]

balanced_df["predicted_label"] = pred_labels

In [48]:
#Calculate Accuracy
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(
    balanced_df["binary_label"],
    balanced_df["predicted_label"]
)

print("DistilBERT Accuracy:", accuracy)

print(classification_report(
    balanced_df["binary_label"],
    balanced_df["predicted_label"]
))

DistilBERT Accuracy: 0.9478
              precision    recall  f1-score   support

    NEGATIVE       0.95      0.95      0.95     10000
    POSITIVE       0.95      0.94      0.95     10000

    accuracy                           0.95     20000
   macro avg       0.95      0.95      0.95     20000
weighted avg       0.95      0.95      0.95     20000



## 17. Model Comparison Summary

We compare the performance of:
- Logistic Regression (TF-IDF)
- LSTM (Deep Learning)
- DistilBERT (Pretrained Transformer - Binary Evaluation)

In [49]:
import pandas as pd

comparison = pd.DataFrame({
    "Model": [
        "Logistic Regression (TF-IDF)",
        "LSTM (Deep Learning)",
        "DistilBERT (Pretrained - Binary)"
    ],
    "Accuracy": [
        0.9482,      # Logistic Regression
        0.9805,      # LSTM
        0.9478       # DistilBERT (binary)
    ]
})

comparison

Unnamed: 0,Model,Accuracy
0,Logistic Regression (TF-IDF),0.9482
1,LSTM (Deep Learning),0.9805
2,DistilBERT (Pretrained - Binary),0.9478


## 18. Final Conclusion

This project implemented and compared three sentiment analysis approaches:

1. Logistic Regression (TF-IDF features)
2. LSTM (Deep Learning model)
3. Pretrained Transformer (DistilBERT)

The LSTM model achieved the highest performance (98% accuracy),
demonstrating the importance of sequential modeling and domain-specific training.

While DistilBERT demonstrated strong transfer learning capability,
fine-tuning on the Flipkart dataset would likely further improve performance.

Overall, this project demonstrates:
- Traditional Machine Learning
- Deep Learning for NLP
- Transformer-based Transfer Learning
- Model Evaluation and Comparison

## 19. Fine-Tuning DistilBERT for 3-Class Sentiment Classification

In this step, we fine-tune the base DistilBERT model on the Flipkart dataset 
to perform 3-class sentiment classification (Negative, Neutral, Positive).

To ensure computational feasibility on CPU, we train on a controlled subset (~50,000 samples).

In [50]:
# Take 50k samples (stratified to preserve class distribution)

subset_df = df.sample(50000, random_state=42)

subset_df["label"] = subset_df["Sentiment"].map({
    "Negative": 0,
    "Neutral": 1,
    "Positive": 2
})

subset_df["label"].value_counts()

label
2    38247
0     7391
1     4362
Name: count, dtype: int64

In [51]:
# Create balanced subset for fine-tuning

neg_df = df[df["Sentiment"] == "Negative"].sample(4000, random_state=42)
neu_df = df[df["Sentiment"] == "Neutral"].sample(4000, random_state=42)
pos_df = df[df["Sentiment"] == "Positive"].sample(4000, random_state=42)

subset_df = pd.concat([neg_df, neu_df, pos_df]).sample(frac=1, random_state=42)

subset_df["label"] = subset_df["Sentiment"].map({
    "Negative": 0,
    "Neutral": 1,
    "Positive": 2
})

subset_df["label"].value_counts()

label
0    4000
1    4000
2    4000
Name: count, dtype: int64

## 20. Preparing Data for Transformer Fine-Tuning

We split the balanced dataset into training and testing sets,
then tokenize the text using the DistilBERT tokenizer.

In [52]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    subset_df,
    test_size=0.2,
    stratify=subset_df["label"],
    random_state=42
)

print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 9600
Test size: 2400


## 21. Tokenizing Text using DistilBERT Tokenizer

We use the DistilBERT tokenizer to convert raw text into token IDs 
that can be processed by the Transformer model.

In [53]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [54]:
def tokenize_function(examples):
    return tokenizer(
        examples,
        padding="max_length",
        truncation=True,
        max_length=128
    )

In [55]:
train_encodings = tokenize_function(train_df["text"].tolist())
test_encodings = tokenize_function(test_df["text"].tolist())

In [56]:
print(train_encodings["input_ids"][0][:20])

[101, 26380, 3835, 2374, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## 22. Creating PyTorch Dataset for Fine-Tuning

We convert tokenized encodings and labels into a PyTorch dataset
compatible with HuggingFace Trainer API.

In [57]:
import torch

class FlipkartDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [58]:
train_dataset = FlipkartDataset(
    train_encodings,
    train_df["label"].tolist()
)

test_dataset = FlipkartDataset(
    test_encodings,
    test_df["label"].tolist()
)

len(train_dataset), len(test_dataset)

(9600, 2400)

## 23. Loading DistilBERT Model for 3-Class Classification

We load the base DistilBERT model and configure it 
for 3-class sentiment classification.

In [59]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 24. Defining Training Configuration

In [60]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [61]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    report_to="none"
)

In [63]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [64]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.1222,0.067021
2,0.0405,0.069112


TrainOutput(global_step=1200, training_loss=0.07601396719614664, metrics={'train_runtime': 7434.2626, 'train_samples_per_second': 2.583, 'train_steps_per_second': 0.161, 'total_flos': 635854852915200.0, 'train_loss': 0.07601396719614664, 'epoch': 2.0})

In [65]:
trainer.evaluate()

{'eval_loss': 0.06911160796880722,
 'eval_runtime': 217.9516,
 'eval_samples_per_second': 11.012,
 'eval_steps_per_second': 0.688,
 'epoch': 2.0}

In [66]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

predictions = trainer.predict(test_dataset)

preds = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

accuracy = accuracy_score(true_labels, preds)

print("Fine-Tuned DistilBERT Accuracy:", accuracy)
print(classification_report(true_labels, preds))

Fine-Tuned DistilBERT Accuracy: 0.98875
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       800
           1       0.99      0.98      0.99       800
           2       0.99      0.99      0.99       800

    accuracy                           0.99      2400
   macro avg       0.99      0.99      0.99      2400
weighted avg       0.99      0.99      0.99      2400



In [67]:
# Save model and tokenizer
model.save_pretrained("./fine_tuned_distilbert_flipkart")
tokenizer.save_pretrained("./fine_tuned_distilbert_flipkart")

('./fine_tuned_distilbert_flipkart\\tokenizer_config.json',
 './fine_tuned_distilbert_flipkart\\special_tokens_map.json',
 './fine_tuned_distilbert_flipkart\\vocab.txt',
 './fine_tuned_distilbert_flipkart\\added_tokens.json',
 './fine_tuned_distilbert_flipkart\\tokenizer.json')

In [68]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_path = "./fine_tuned_distilbert_flipkart"

saved_tokenizer = AutoTokenizer.from_pretrained(model_path)
saved_model = AutoModelForSequenceClassification.from_pretrained(model_path)

print("Model loaded successfully.")

Model loaded successfully.
