In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('data.csv')

In [3]:
df

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1
...,...,...,...,...,...,...,...,...
8513,,,,,,,,5
8514,,,,,,,,2
8515,,,,,,,,4
8516,,,,,,,,1


In [4]:
df.isnull().sum()

Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df.dropna(subset=['Review text'], inplace=True)

In [7]:
df.isna().sum()

Reviewer Name        2
Review Title         2
Place of Review     42
Up Votes             2
Down Votes           2
Month              457
Review text          0
Ratings              0
dtype: int64

In [8]:
df=df[df['Ratings']!=3]

In [9]:
sentiment={
    1:0,
    2:0,
    4:1,
    5:1
}

In [10]:
df.loc[:,'Sentiment'] = df['Ratings'].map(sentiment)

In [11]:
df.loc[:,'Review text'] = df['Review text'].str.replace('READ MORE', '')

In [12]:
df

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings,Sentiment
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4,1
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1,0
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1,0
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1,0
5,Baji Sankar,Mind-blowing purchase,"Certified Buyer, Hyderabad",173.0,45.0,Oct 2018,Good quality product. Delivered on time.,5,1
...,...,...,...,...,...,...,...,...,...
8504,naresh g,For Mavis350,,2.0,1.0,Aug 2016,Received product intact and sealed,5,1
8506,Abani Behera,Don't waste your money,,0.0,2.0,Sep 2016,up to the mark but same is available in market...,4,1
8507,vishnu varma,Really Nice,"Certified Buyer, Agartala",0.0,1.0,Sep 2016,Nice delivery speed,5,1
8508,,,,,,,No complaints about the item . Its the best on...,5,1


In [13]:
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [14]:
def preprocess(text, flag, n=2):
    # remove special characters
    sentence = re.sub(r'[^a-zA-Z]', ' ', text)
    # convert sentence into lower case
    sentence = sentence.lower()
    # tokenize sentence into words
    tokens = sentence.split()
    
    # Remove 'not' from default English stopwords
    custom_stopwords = set(stopwords.words("english")) - {'not'}
    
    # remove stop words
    clean_tokens = [token for token in tokens if token not in custom_stopwords]
    
    # stemming/lemmatization
    if flag == 'stem':
        clean_tokens = [stemmer.stem(token) for token in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(token) for token in clean_tokens]
    
    # Generate n-grams
    ngrams_list = list(ngrams(clean_tokens, n))
    ngrams_text = [' '.join(gram) for gram in ngrams_list]
    
    return pd.Series(' '.join(ngrams_text))

In [15]:
df['Review text'] = df['Review text'].apply(lambda x: preprocess(x, flag='lemmatize', n=2))

In [16]:
vector = TfidfVectorizer()
vector.fit_transform(df['Review text']).toarray()
text = "".join(df['Review text'].values.tolist())

In [17]:
X=df['Review text']
y=df['Sentiment']

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=2)

In [19]:
import mlflow
mlflow.set_experiment("sentiment_analysis")

2024/03/27 12:04:34 INFO mlflow.tracking.fluent: Experiment with name 'sentiment_analysis' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/Acer/Desktop/reviews_data_dump/reviews_badminton/mlruns/722609149144674482', creation_time=1711521274808, experiment_id='722609149144674482', last_update_time=1711521274808, lifecycle_stage='active', name='sentiment_analysis', tags={}>

In [20]:


# Define the pipeline
"""rf_pipe_tfidf = Pipeline([
    ('Vectorization', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])

# Define the parameter grid for GridSearchCV
rf_param_grid_tfidf = {
    'Vectorization': [TfidfVectorizer()],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV
rf_clf_tfidf = GridSearchCV(
    estimator=rf_pipe_tfidf, 
    param_grid=rf_param_grid_tfidf, 
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    verbose=1
)

# Enable auto-logging with MLflow
mlflow.sklearn.autolog()

# Start an MLflow run
with mlflow.start_run() as run:
    # Fit the GridSearchCV object
    rf_clf_tfidf.fit(X_train, y_train)
"""

"rf_pipe_tfidf = Pipeline([\n    ('Vectorization', TfidfVectorizer()),\n    ('classifier', RandomForestClassifier())\n])\n\n# Define the parameter grid for GridSearchCV\nrf_param_grid_tfidf = {\n    'Vectorization': [TfidfVectorizer()],\n    'classifier__n_estimators': [50, 100, 200],\n    'classifier__max_depth': [None, 10, 20],\n    'classifier__min_samples_split': [2, 5, 10]\n}\n\n# Initialize GridSearchCV\nrf_clf_tfidf = GridSearchCV(\n    estimator=rf_pipe_tfidf, \n    param_grid=rf_param_grid_tfidf, \n    scoring='accuracy',\n    cv=5,\n    return_train_score=True,\n    verbose=1\n)\n\n# Enable auto-logging with MLflow\nmlflow.sklearn.autolog()\n\n# Start an MLflow run\nwith mlflow.start_run() as run:\n    # Fit the GridSearchCV object\n    rf_clf_tfidf.fit(X_train, y_train)\n"

In [21]:
"""from sklearn.tree import DecisionTreeClassifier

# Define the pipeline
dt_pipe_tfidf = Pipeline([
    ('Vectorization', TfidfVectorizer()),
    ('classifier', DecisionTreeClassifier())
])

# Define the parameter grid for GridSearchCV
dt_param_grid_tfidf = {
    'Vectorization': [TfidfVectorizer()],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV
dt_clf_tfidf = GridSearchCV(
    estimator=dt_pipe_tfidf, 
    param_grid=dt_param_grid_tfidf, 
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    verbose=1
)

# Enable auto-logging with MLflow
mlflow.sklearn.autolog()

# Start an MLflow run
with mlflow.start_run() as run:
    # Fit the GridSearchCV object
    dt_clf_tfidf.fit(X_train, y_train)
"""

"from sklearn.tree import DecisionTreeClassifier\n\n# Define the pipeline\ndt_pipe_tfidf = Pipeline([\n    ('Vectorization', TfidfVectorizer()),\n    ('classifier', DecisionTreeClassifier())\n])\n\n# Define the parameter grid for GridSearchCV\ndt_param_grid_tfidf = {\n    'Vectorization': [TfidfVectorizer()],\n    'classifier__max_depth': [None, 10, 20],\n    'classifier__min_samples_split': [2, 5, 10]\n}\n\n# Initialize GridSearchCV\ndt_clf_tfidf = GridSearchCV(\n    estimator=dt_pipe_tfidf, \n    param_grid=dt_param_grid_tfidf, \n    scoring='accuracy',\n    cv=5,\n    return_train_score=True,\n    verbose=1\n)\n\n# Enable auto-logging with MLflow\nmlflow.sklearn.autolog()\n\n# Start an MLflow run\nwith mlflow.start_run() as run:\n    # Fit the GridSearchCV object\n    dt_clf_tfidf.fit(X_train, y_train)\n"

In [22]:
pipelines = {
    'RandomForest': Pipeline([
        ('Vectorization', TfidfVectorizer()),
        ('classifier', RandomForestClassifier())
    ]),
    'DecisionTrees': Pipeline([
        ('Vectorization', TfidfVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ]),
    'LogisticRegression': Pipeline([
        ('Vectorization', TfidfVectorizer()),
        ('classifier', LogisticRegression())
    ]),
    'KNeighbors': Pipeline([
        ('Vectorization', TfidfVectorizer()),
        ('classifier', KNeighborsClassifier())
    ]),
    'NaiveBayes': Pipeline([
        ('Vectorization', TfidfVectorizer()),
        ('classifier', MultinomialNB())
    ]),
    'SVC': Pipeline([
        ('Vectorization', TfidfVectorizer()),
        ('classifier', SVC())
    ]),
    'XGBoost': Pipeline([
        ('Vectorization', TfidfVectorizer()),
        ('classifier', XGBClassifier())
    ])
    
}

param_grids = {
    'RandomForest': {
        'Vectorization': [TfidfVectorizer()],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'DecisionTrees': {
        'Vectorization': [TfidfVectorizer()],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'LogisticRegression': {
        'Vectorization': [TfidfVectorizer()],
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l1', 'l2']
    },
    'KNeighbors': {
        'Vectorization': [TfidfVectorizer()],
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__p': [1, 2, 3]
    },
    'NaiveBayes': {
        'Vectorization': [TfidfVectorizer()],
    },
    'SVC': {
        'Vectorization': [TfidfVectorizer()],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    'XGBoost': {
        'Vectorization': [TfidfVectorizer()],
        'classifier__learning_rate': [0.01, 0.1, 0.3],
        'classifier__max_depth': [3, 5, 7],
        'classifier__n_estimators': [50, 100, 200]
    }
     
}


In [23]:
best_models = {}

# Run the Pipeline
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(X_train, y_train)
        
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(X_test, y_test))
    
    best_models[algo] = grid_search.best_estimator_
    print()

********** RandomForest **********


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Fitting 5 folds for each of 27 candidates, totalling 135 fits
CPU times: total: 2min 35s
Wall time: 2min 47s
Train Score:  0.9376786121564603




Test Score:  0.9402228976697061

********** DecisionTrees **********
Fitting 5 folds for each of 9 candidates, totalling 45 fits




CPU times: total: 6.97 s
Wall time: 13.1 s
Train Score:  0.9344698654350554
Test Score:  0.9371833839918946

********** LogisticRegression **********
Fitting 5 folds for each of 6 candidates, totalling 30 fits




CPU times: total: 2.23 s
Wall time: 7.92 s
Train Score:  0.9310914870566769
Test Score:  0.9346504559270516

********** KNeighbors **********
Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: total: 2min 24s
Wall time: 33.2 s
Train Score:  0.9010306192268217




Test Score:  0.9103343465045592

********** NaiveBayes **********
Fitting 5 folds for each of 1 candidates, totalling 5 fits




CPU times: total: 797 ms
Wall time: 6.62 s
Train Score:  0.8800878093283157
Test Score:  0.894630192502533

********** SVC **********
Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: total: 18.7 s
Wall time: 24.6 s
Train Score:  0.9334564944691527
Test Score:  0.939209726443769

********** XGBoost **********




Fitting 5 folds for each of 27 candidates, totalling 135 fits
CPU times: total: 5min 15s
Wall time: 53.7 s
Train Score:  0.9383544303797468
Test Score:  0.9422492401215805

