In [3]:
import pandas as pd
import numpy as np


In [4]:
df1 = pd.read_csv("reviews_badminton/data.csv")
df2 = pd.read_csv("reviews_tawa/data.csv")
df3 = pd.read_csv("reviews_tea/data.csv")


In [5]:
df = pd.concat([df1, df2, df3], ignore_index=True)


In [7]:
df.shape

(20219, 24)

In [9]:
df.columns

Index(['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes',
       'Down Votes', 'Month', 'Review text', 'Ratings', 'Reviewer_Name',
       'Reviewer_Rating', 'Review_Title', 'Review_Text', 'Place_of_Review',
       'Date_of_Review', 'Up_Votes', 'Down_Votes', 'reviewer_name',
       'reviewer_rating', 'review_title', 'review_text', 'place_of_review',
       'Date_of_review', 'up_votes', 'Down_votes'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings,Reviewer_Name,Reviewer_Rating,...,Up_Votes,Down_Votes,reviewer_name,reviewer_rating,review_title,review_text,place_of_review,Date_of_review,up_votes,Down_votes
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4.0,,,...,,,,,,,,,,
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1.0,,,...,,,,,,,,,,
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1.0,,,...,,,,,,,,,,
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3.0,,,...,,,,,,,,,,
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1.0,,,...,,,,,,,,,,


In [10]:
# Create unified review_text column
df['final_review_text'] = (
    df['Review text']
    .combine_first(df['Review_Text'])
    .combine_first(df['review_text'])
)

# Create unified rating column
df['final_rating'] = (
    df['Ratings']
    .combine_first(df['Reviewer_Rating'])
    .combine_first(df['reviewer_rating'])
)


In [11]:
df = df[['final_review_text', 'final_rating']]


In [12]:
df.shape


(20219, 2)

In [17]:
df.isnull().sum()

final_review_text    0
final_rating         0
dtype: int64

In [14]:
df.head()

Unnamed: 0,final_review_text,final_rating
0,"Nice product, good quality, but price is now r...",4.0
1,They didn't supplied Yonex Mavis 350. Outside ...,1.0
2,Worst product. Damaged shuttlecocks packed in ...,1.0
3,"Quite O. K. , but nowadays the quality of the...",3.0
4,Over pricedJust â?¹620 ..from retailer.I didn'...,1.0


In [16]:
df = df.dropna(subset=['final_review_text', 'final_rating'])


In [18]:
df.shape


(19965, 2)

In [19]:
df.isnull().sum()

final_review_text    0
final_rating         0
dtype: int64

In [20]:
def get_sentiment(rating):
    if rating >= 4:
        return 1
    elif rating <= 2:
        return 0
    else:
        return np.nan

df['sentiment'] = df['final_rating'].apply(get_sentiment)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['final_rating'].apply(get_sentiment)


In [21]:
df = df.dropna(subset=['sentiment'])


In [22]:
df['sentiment'].value_counts()


sentiment
1.0    17073
0.0     1989
Name: count, dtype: int64

In [23]:
import re
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package stopwords to C:\Users\Sanket
[nltk_data]     Jadhav\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\Sanket
[nltk_data]     Jadhav\AppData\Roaming\nltk_data...


In [24]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [25]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)   # remove numbers & punctuation
    words = text.split()
    words = [lemmatizer.lemmatize(word) 
             for word in words if word not in stop_words]
    return ' '.join(words)


In [26]:
df['clean_review'] = df['final_review_text'].apply(clean_text)


In [27]:
df[['final_review_text', 'clean_review']].head()


Unnamed: 0,final_review_text,clean_review
0,"Nice product, good quality, but price is now r...",nice product good quality price rising bad sig...
1,They didn't supplied Yonex Mavis 350. Outside ...,didnt supplied yonex mavis outside cover yonex...
2,Worst product. Damaged shuttlecocks packed in ...,worst product damaged shuttlecock packed new b...
4,Over pricedJust â?¹620 ..from retailer.I didn'...,pricedjust retaileri didnt understand wat adva...
5,Good quality product. Delivered on time.READ MORE,good quality product delivered timeread


In [29]:
from sklearn.model_selection import train_test_split

X = df['clean_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [31]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [32]:
from sklearn.metrics import classification_report, f1_score

y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.94      0.77      0.84       398
         1.0       0.97      0.99      0.98      3415

    accuracy                           0.97      3813
   macro avg       0.95      0.88      0.91      3813
weighted avg       0.97      0.97      0.97      3813

F1 Score: 0.9834830483917705


#### Other models were explored, but Logistic Regression with TF-IDF gave best performance with lowest complexity.

In [33]:
neg_reviews = df[df['sentiment'] == 0]


In [34]:
from collections import Counter

all_words = ' '.join(neg_reviews['clean_review']).split()
Counter(all_words).most_common(20)


[('tata', 1834),
 ('tea', 1834),
 ('goodread', 1005),
 ('v', 918),
 ('gold', 917),
 ('premiumtata', 917),
 ('premium', 917),
 ('shuttle', 300),
 ('quality', 254),
 ('product', 214),
 ('bad', 178),
 ('qualityread', 122),
 ('good', 117),
 ('worst', 102),
 ('poor', 89),
 ('productread', 79),
 ('one', 72),
 ('dont', 71),
 ('buy', 63),
 ('day', 59)]

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(2,2), stop_words='english')
X_neg = cv.fit_transform(neg_reviews['clean_review'])

bigram_freq = zip(cv.get_feature_names_out(), X_neg.sum(axis=0).A1)
sorted(bigram_freq, key=lambda x: x[1], reverse=True)[:10]


[('gold tata', np.int64(917)),
 ('premium goodread', np.int64(917)),
 ('premiumtata tea', np.int64(917)),
 ('tata gold', np.int64(917)),
 ('tata tea', np.int64(917)),
 ('tea premium', np.int64(917)),
 ('tea premiumtata', np.int64(917)),
 ('poor quality', np.int64(37)),
 ('bad quality', np.int64(36)),
 ('dont buy', np.int64(34))]

In [36]:
import pickle

pickle.dump(model, open("sentiment_model.pkl", "wb"))
pickle.dump(tfidf, open("tfidf_vectorizer.pkl", "wb"))


In [38]:
df.to_csv("Flipkart_Review.csv")