In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
df =pd.read_csv('data/reviews_data_dump/reviews_badminton/data.csv')

In [3]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [4]:
df[["Review text", 'Ratings']]

Unnamed: 0,Review text,Ratings
0,"Nice product, good quality, but price is now r...",4
1,They didn't supplied Yonex Mavis 350. Outside ...,1
2,Worst product. Damaged shuttlecocks packed in ...,1
3,"Quite O. K. , but nowadays the quality of the...",3
4,Over pricedJust â?¹620 ..from retailer.I didn'...,1
...,...,...
8513,,5
8514,,2
8515,,4
8516,,1


In [5]:
df[["Review text", "Ratings"]].dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["Review text", "Ratings"]].dropna(inplace=True)


In [6]:
df[["Review text"]].isnull().sum()

Review text    8
dtype: int64

In [7]:
df[['Review text']].dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Review text']].dropna(inplace=True)


In [8]:
df

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1
...,...,...,...,...,...,...,...,...
8513,,,,,,,,5
8514,,,,,,,,2
8515,,,,,,,,4
8516,,,,,,,,1


In [9]:
df['Review text'][0]

'Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE'

In [10]:
df["Ratings"] = df["Ratings"].apply(lambda x: "positive" if x >= 4 else "Negative")

In [11]:
df['Ratings'].value_counts()

Ratings
positive    6826
Negative    1692
Name: count, dtype: int64

### Identify target variable

In [12]:
X=df[["Review text"]]
y =df["Ratings"]

### split

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0, stratify=y)

### data preprocessing

In [14]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [15]:
nltk.download('stopwords')
# Downloading wordnet before applying Lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
stemmer = PorterStemmer()

lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess(raw_text, flag):
    # Convert input to string (if it's not already)
    raw_text = str(raw_text)

    # Removing emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642" 
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    raw_text = emoji_pattern.sub(r'', raw_text)

    # Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", raw_text)

    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = sentence.split()

    # remove stop words
    clean_tokens = [t for t in tokens if not t in stopwords.words("english")]

    # Stemming/Lemmatization
    if flag == 'stem':
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]

    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])


In [18]:
from tqdm import tqdm, tqdm_notebook

In [19]:
tqdm.pandas()

In [20]:
X_train

Unnamed: 0,Review text
3962,Poor qualityREAD MORE
635,Best :)READ MORE
8381,SuperREAD MORE
1191,SuperREAD MORE
4240,GoodREAD MORE
...,...
4429,very nice product 👌👌 at a better priceREAD MORE
223,worst product not to buy low qualityREAD MORE
1664,SuperREAD MORE
3923,Super dealREAD MORE


In [21]:
temp_df = X_train["Review text"].progress_apply(lambda x: preprocess(x, "lemma"))
temp_df.head()

100%|█████████████████████████████████████████████████████████████████████████████| 6388/6388 [00:14<00:00, 441.01it/s]


Unnamed: 0,0,1
3962,poor qualityread,2
635,best read,2
8381,superread,1
1191,superread,1
4240,goodread,1


### change column name`

In [22]:
temp_df.columns = ["clean_text_lemma", "Text_length_lemma"]
temp_df.head()

Unnamed: 0,clean_text_lemma,Text_length_lemma
3962,poor qualityread,2
635,best read,2
8381,superread,1
1191,superread,1
4240,goodread,1


In [23]:
X_train_clean = pd.concat([X_train,temp_df], axis=1)
X_train_clean

Unnamed: 0,Review text,clean_text_lemma,Text_length_lemma
3962,Poor qualityREAD MORE,poor qualityread,2
635,Best :)READ MORE,best read,2
8381,SuperREAD MORE,superread,1
1191,SuperREAD MORE,superread,1
4240,GoodREAD MORE,goodread,1
...,...,...,...
4429,very nice product 👌👌 at a better priceREAD MORE,nice product better priceread,4
223,worst product not to buy low qualityREAD MORE,worst product buy low qualityread,5
1664,SuperREAD MORE,superread,1
3923,Super dealREAD MORE,super dealread,2


In [24]:
temp_df = X_test['Review text'].progress_apply(lambda x: preprocess(x, 'lemma'))

temp_df.head()

100%|█████████████████████████████████████████████████████████████████████████████| 2130/2130 [00:05<00:00, 415.96it/s]


Unnamed: 0,0,1
5730,goodread,1
2460,good qualityread,2
4964,goodread,1
6734,excellentread,1
8013,good productread,2


In [25]:
temp_df.columns = ["clean_text_lemma", "Text_length_lemma"]
temp_df.head()

Unnamed: 0,clean_text_lemma,Text_length_lemma
5730,goodread,1
2460,good qualityread,2
4964,goodread,1
6734,excellentread,1
8013,good productread,2


In [26]:
X_test_clean = pd.concat([X_test, temp_df], axis=1)
X_test_clean

Unnamed: 0,Review text,clean_text_lemma,Text_length_lemma
5730,goodREAD MORE,goodread,1
2460,Very good qualityREAD MORE,good qualityread,2
4964,goodREAD MORE,goodread,1
6734,excellentREAD MORE,excellentread,1
8013,Good productREAD MORE,good productread,2
...,...,...,...
3458,Nice 👌 worth the moneyREAD MORE,nice worth moneyread,3
2265,PowerREAD MORE,powerread,1
5040,nice product in cheap costREAD MORE,nice product cheap costread,4
3539,SuperbREAD MORE,superbread,1


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
from sklearn.naive_bayes import MultinomialNB

In [28]:
# Define pipeline components
tfidf_vectorizer = TfidfVectorizer()
multinomial_nb = MultinomialNB()
random_forest = RandomForestClassifier()
logistic_regression = LogisticRegression(max_iter=1000)

# Create pipelines
pipeline_nb = Pipeline([('tfidf', tfidf_vectorizer), ('clf', multinomial_nb)])
pipeline_rf = Pipeline([('tfidf', tfidf_vectorizer), ('clf', random_forest)])
pipeline_lr = Pipeline([('tfidf', tfidf_vectorizer), ('clf', logistic_regression)])

# Define hyperparameters for tuning
param_grid_nb = {
    'tfidf__max_features': [5000, 10000, None],
    'clf__alpha': [0.5, 1.0, 2.0]
}

param_grid_rf = {
    'tfidf__max_features': [5000, 10000, None],
    'clf__n_estimators': [50, 100, 200]
}

param_grid_lr = {
    'tfidf__max_features': [5000, 10000, None],
    'clf__C': [0.1, 1, 10]
}


In [29]:
# Perform GridSearchCV for each pipeline
grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=5, verbose=1)
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, verbose=1)
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, verbose=1)

# Fit the models
grid_search_nb.fit(X_train_clean["clean_text_lemma"], y_train)
grid_search_rf.fit(X_train_clean["clean_text_lemma"], y_train)
grid_search_lr.fit(X_train_clean["clean_text_lemma"], y_train)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [30]:
# Print best parameters for each model
print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)
print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best parameters for Naive Bayes:", grid_search_nb.best_params_)


Best parameters for Logistic Regression: {'clf__C': 10, 'tfidf__max_features': 5000}
Best parameters for Random Forest: {'clf__n_estimators': 200, 'tfidf__max_features': None}
Best parameters for Naive Bayes: {'clf__alpha': 0.5, 'tfidf__max_features': 5000}


In [31]:
# Evaluate models
nb_predictions = grid_search_nb.predict(X_test_clean['clean_text_lemma'])
print("Naive Bayes Classification Report:")
print(classification_report(y_test, nb_predictions))

lr_predictions = grid_search_lr.predict(X_test_clean['clean_text_lemma'])
print("Logistic Regression Classification Report:")
print(classification_report(y_test, lr_predictions))

rf_predictions = grid_search_rf.predict(X_test_clean['clean_text_lemma'])
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

    Negative       0.82      0.44      0.57       423
    positive       0.88      0.98      0.92      1707

    accuracy                           0.87      2130
   macro avg       0.85      0.71      0.75      2130
weighted avg       0.86      0.87      0.85      2130

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    Negative       0.77      0.50      0.61       423
    positive       0.89      0.96      0.92      1707

    accuracy                           0.87      2130
   macro avg       0.83      0.73      0.77      2130
weighted avg       0.86      0.87      0.86      2130

Random Forest Classification Report:
              precision    recall  f1-score   support

    Negative       0.76      0.44      0.56       423
    positive       0.88      0.96      0.92      1707

    accuracy                           0.86      2130
   macro avg 

In [32]:
# Save the best models
joblib.dump(grid_search_lr.best_estimator_, 'lr_model.pkl')
joblib.dump(grid_search_rf.best_estimator_, 'rf_model.pkl')
joblib.dump(grid_search_nb.best_estimator_, 'nb_model.pkl')


['nb_model.pkl']