In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
df = pd.read_csv("badminton_data.csv")
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [5]:
df.columns

Index(['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes',
       'Down Votes', 'Month', 'Review text', 'Ratings'],
      dtype='object')

In [6]:
df = df.dropna(subset=['Review text', 'Ratings'])
df.isnull().sum()

Unnamed: 0,0
Reviewer Name,2
Review Title,2
Place of Review,42
Up Votes,2
Down Votes,2
Month,457
Review text,0
Ratings,0


In [7]:
df = df[df['Ratings'] != 3]

df['Sentiment'] = df['Ratings'].apply(
    lambda x: 'positive' if x >= 4 else 'negative'
)

df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
positive,6823
negative,1072


In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

In [9]:
df['clean_review'] = df['Review text'].apply(clean_text)
df[['Review text', 'clean_review']].head()

Unnamed: 0,Review text,clean_review
0,"Nice product, good quality, but price is now r...",nice product good quality price rising bad sig...
1,They didn't supplied Yonex Mavis 350. Outside ...,supplied yonex mavis outside cover yonex ad in...
2,Worst product. Damaged shuttlecocks packed in ...,worst product damaged shuttlecock packed new b...
4,Over pricedJust â?¹620 ..from retailer.I didn'...,pricedjust retailer understand wat advantage b...
5,Good quality product. Delivered on time.READ MORE,good quality product delivered time read


In [10]:
X = df['clean_review']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [11]:
bow = CountVectorizer(max_features=5000)
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

In [12]:
lr_bow = LogisticRegression(max_iter=1000)
lr_bow.fit(X_train_bow, y_train)
pred_lr_bow = lr_bow.predict(X_test_bow)

nb_bow = MultinomialNB()
nb_bow.fit(X_train_bow, y_train)
pred_nb_bow = nb_bow.predict(X_test_bow)

print("BoW + LR F1:", f1_score(y_test, pred_lr_bow, pos_label='positive'))
print("BoW + NB F1:", f1_score(y_test, pred_nb_bow, pos_label='positive'))

BoW + LR F1: 0.9588550983899821
BoW + NB F1: 0.9573629523468291


In [13]:
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [14]:
lr_tfidf = LogisticRegression(max_iter=1000)
lr_tfidf.fit(X_train_tfidf, y_train)
pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)

svm_tfidf = LinearSVC()
svm_tfidf.fit(X_train_tfidf, y_train)
pred_svm_tfidf = svm_tfidf.predict(X_test_tfidf)

print("TF-IDF + LR F1:", f1_score(y_test, pred_lr_tfidf, pos_label='positive'))
print("TF-IDF + SVM F1:", f1_score(y_test, pred_svm_tfidf, pos_label='positive'))

TF-IDF + LR F1: 0.954239091876552
TF-IDF + SVM F1: 0.957492795389049


In [16]:
!pip install gensim



In [18]:
from gensim.models import Word2Vec

In [19]:
tokenized = [review.split() for review in df['clean_review']]

w2v = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)

In [20]:
def w2v_vector(words, model):
    vectors = [model.wv[w] for w in words if w in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_w2v = np.array([w2v_vector(r.split(), w2v) for r in X])

In [21]:
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v, y, test_size=0.2, random_state=42, stratify=y
)

lr_w2v = LogisticRegression(max_iter=1000)
lr_w2v.fit(X_train_w2v, y_train_w2v)
pred_lr_w2v = lr_w2v.predict(X_test_w2v)

mlp_w2v = MLPClassifier(hidden_layer_sizes=(128,64), max_iter=20)
mlp_w2v.fit(X_train_w2v, y_train_w2v)
pred_mlp_w2v = mlp_w2v.predict(X_test_w2v)

print("W2V + LR F1:", f1_score(y_test_w2v, pred_lr_w2v, pos_label='positive'))
print("W2V + DL F1:", f1_score(y_test_w2v, pred_mlp_w2v, pos_label='positive'))

W2V + LR F1: 0.9273097826086957
W2V + DL F1: 0.9273097826086957




In [22]:
!pip install transformers



In [23]:
import torch
from transformers import BertTokenizer, BertModel

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
bert.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [25]:
def bert_embed(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=64)
    with torch.no_grad():
        outputs = bert(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

X_bert = np.array([bert_embed(t) for t in X[:500]])
y_bert = y[:500]

In [26]:
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(
    X_bert, y_bert, test_size=0.2, random_state=42, stratify=y_bert
)

mlp_bert = MLPClassifier(hidden_layer_sizes=(256,), max_iter=20)
mlp_bert.fit(X_train_bert, y_train_bert)
pred_mlp_bert = mlp_bert.predict(X_test_bert)

print("BERT + DL F1:", f1_score(y_test_bert, pred_mlp_bert, pos_label='positive'))

BERT + DL F1: 0.9420289855072463




In [27]:
comparison = pd.DataFrame({
    "Embedding": ["BoW","BoW","TF-IDF","TF-IDF","W2V","W2V","BERT"],
    "Model": ["LR","NB","LR","SVM","LR","DL","DL"],
    "F1 Score": [
        f1_score(y_test, pred_lr_bow, pos_label='positive'),
        f1_score(y_test, pred_nb_bow, pos_label='positive'),
        f1_score(y_test, pred_lr_tfidf, pos_label='positive'),
        f1_score(y_test, pred_svm_tfidf, pos_label='positive'),
        f1_score(y_test_w2v, pred_lr_w2v, pos_label='positive'),
        f1_score(y_test_w2v, pred_mlp_w2v, pos_label='positive'),
        f1_score(y_test_bert, pred_mlp_bert, pos_label='positive')
    ]
})

comparison

Unnamed: 0,Embedding,Model,F1 Score
0,BoW,LR,0.958855
1,BoW,NB,0.957363
2,TF-IDF,LR,0.954239
3,TF-IDF,SVM,0.957493
4,W2V,LR,0.92731
5,W2V,DL,0.92731
6,BERT,DL,0.942029


In [28]:
comparison.sort_values(by="F1 Score", ascending=False)

Unnamed: 0,Embedding,Model,F1 Score
0,BoW,LR,0.958855
3,TF-IDF,SVM,0.957493
1,BoW,NB,0.957363
2,TF-IDF,LR,0.954239
6,BERT,DL,0.942029
4,W2V,LR,0.92731
5,W2V,DL,0.92731


In [29]:
import pickle

with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(lr_tfidf, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

print("Pickle files saved successfully")

Pickle files saved successfully
