In [1]:
#Liberaris import
import pandas as pd
import numpy as np

In [2]:
reviews = pd.read_csv('Musical_instruments_reviews.csv')

In [3]:
reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [4]:
reviews.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [5]:
#Data Preprocessing
reviews = reviews.drop(['reviewerID', 'asin', 'reviewerName', 'helpful', 'unixReviewTime', 'reviewTime'], axis=1)


In [6]:
#Removing missing values
reviews = reviews.dropna()


In [7]:
#Ratings in sentiments 
reviews['sentiment'] = np.where(reviews['overall'] > 3, 'Positive', 'Negative')


In [8]:
#Cleaning the review text
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

reviews['cleaned_review'] = reviews['reviewText'].apply(lambda x: clean_text(x))


In [9]:
# Spliting data into training and testing sets - 20% test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviews['cleaned_review'],
                                                    reviews['sentiment'],
                                                    test_size=0.2,
                                                    random_state=42)


In [10]:
#Extracting features using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)


In [11]:
#Naive Bayes Model Training
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_cv, y_train)


In [12]:
#Model evaluation - accuracy score & Confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = nb.predict(X_test_cv)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))


Accuracy Score:  0.8873720136518771
Confusion Matrix: 
[[  12  228]
 [   3 1808]]


In [13]:
#Experimenting with Models
from sklearn.svm import SVC


In [14]:
# Training an SVM model
svm = SVC(kernel='linear', C=1)
svm.fit(X_train_cv, y_train)


In [15]:
# Evaluating the model on test data
y_pred = svm.predict(X_test_cv)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))


Accuracy Score:  0.8576304241833252
Confusion Matrix: 
[[  84  156]
 [ 136 1675]]


In [16]:
#Naive Bayes Model Gives the Better accuracy then SVM Model
 #training some other models

In [17]:
#Training logistic regression model
from sklearn.linear_model import LogisticRegression


In [18]:
# Instantiating the model
lr = LogisticRegression()
# Training the model on the Dataset
lr.fit(X_train_cv, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Predicting the sentiment for the test data
y_pred = lr.predict(X_test_cv)


In [20]:
# Checking the performance of the model
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))


Accuracy Score:  0.8766455387615797
Confusion Matrix: 
[[  63  177]
 [  76 1735]]


In [21]:
#Training SVC Model
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(X_train_cv, y_train)

y_pred = svm.predict(X_test_cv)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))


Accuracy Score:  0.8576304241833252
Confusion Matrix: 
[[  84  156]
 [ 136 1675]]


In [22]:
#Training RandomForest Model
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_cv, y_train)

y_pred = rf.predict(X_test_cv)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))


Accuracy Score:  0.8859093125304729
Confusion Matrix: 
[[   9  231]
 [   3 1808]]


In [23]:
#Training Gradient Boosting Model
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train_cv, y_train)

y_pred = gb.predict(X_test_cv)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))



Accuracy Score:  0.8859093125304729
Confusion Matrix: 
[[  12  228]
 [   6 1805]]


In [31]:
#Fine Tunning NB Model for More Accurate results

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

In [25]:
# load dataset
digits = load_digits()


In [26]:
# spliting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, random_state=42)

In [27]:
# defining the Naive Bayes model
nb = MultinomialNB()

In [28]:
# defining the hyperparameters to be searched
param_grid = {'alpha': [0.1, 1, 5, 10, 50, 100]}


In [29]:
# performing grid search using cross-validation
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)


In [30]:
# printing best hyperparameters and accuracy score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Accuracy Score: ", grid_search.best_score_)



Best Hyperparameters:  {'alpha': 50}
Accuracy Score:  0.9005176924136032
