In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# project files under that path can be read and written by this notebook.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
#Importing important Libraries
import numpy as np
import pandas as pd


#Decompress the file
import gzip

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS

#Date and time handling
from datetime import datetime
#Text preprocessing
import spacy
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.toktok import ToktokTokenizer

#Modeling

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


#Warnings
import warnings
# NOTE(review): with no category or module filter, this silences every warning
# raised after this point (deprecations included) — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Load the pre-processed review dataset used for modelling
reviews_csv = '/content/drive/MyDrive/Project/df_r3.csv'
df = pd.read_csv(reviews_csv, index_col=False)


In [None]:
# List the column names available in the loaded dataset
df.columns

In [None]:
# Keep only the columns needed for the exploratory analysis in Tableau
tableau_columns = [
    'Rating', 'reviewerID', 'style', 'review_sentiment', 'DateTime',
    'category', 'also_buy', 'brand', 'feature', 'also_view',
]
df_tableau = df[tableau_columns]

In [None]:
# Check the last five rows (tail() returns 5 rows by default)
df_tableau.tail()

In [None]:
# Save the Tableau extract as CSV.
# index=False keeps the pandas row index out of the file; otherwise it is
# written as an extra unnamed first column, which appears as a spurious field
# when the CSV is loaded elsewhere (e.g. 'Unnamed: 0' when read back with pandas).
df_tableau.to_csv('/content/drive/MyDrive/Project/TableauEDA.csv', index=False)


In [None]:
# Preview the first five rows of the Tableau extract
df_tableau.head()

In [None]:
# Summarise the full dataset: column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Remove the columns that are not used for modelling
unused_for_model = [
    'Rating', 'reviewerName', 'DateTime', 'description', 'category',
    'title', 'also_view', 'also_buy', 'brand', 'similar_item',
    'verified', 'style', 'vote', 'feature',
]
df = df.drop(columns=unused_for_model)

In [None]:
# Preview the first five rows after dropping the unused columns
df.head()

In [None]:
# Keep an independent snapshot of the frame as it is at this point,
# before the label encoding and further column drops below modify `df`
df_copy=df.copy()

In [None]:
# Inspect the raw label values of rating_class as a NumPy array
df['rating_class'].values

In [None]:
# Encode the text label as an integer: 'bad' -> 0, 'good' -> 1.
# Any value that is not a key of the mapping becomes NaN, so running this cell
# a second time on the already-encoded column would turn every label into NaN.
label_codes = {'bad': 0, 'good': 1}
df['rating_class'] = df['rating_class'].map(label_codes)

In [None]:
# Remove the remaining columns that are not needed for modelling
not_needed = ['Unnamed: 0', 'reviewerID', 'asin', 'review_sentiment', 'rank', 'review_text']
df = df.drop(columns=not_needed)

In [None]:
# Count the missing values in the clean_text column
df['clean_text'].isnull().sum()

In [None]:

# Discard every row whose clean_text is missing; other columns are not checked
df = df.dropna(subset = ['clean_text'])

In [None]:
# price is not used as a model input, so remove it
df = df.drop(columns=['price'])

# Re-count the missing values in clean_text (expected to be 0 after the dropna)
df['clean_text'].isna().sum()

In [None]:

# Preview the frame after the column drops and null removal
df.head()

In [None]:


# Rename the encoded target column (1 = 'good', 0 = 'bad') to "Positivity".
# The result is reassigned rather than applied with inplace=True, so the cell
# does not mutate the existing frame object in place.
df = df.rename(columns={"rating_class": "Positivity"})

In [None]:


# Preview the frame to confirm the target column is now named Positivity
df.head()

In [None]:

# Count plot of the two Positivity classes
sns.catplot(
    data=df,
    x="Positivity",
    kind="count",
    height=5,
    aspect=1.4,
    palette="BrBG",
)
plt.show()


The bar chart above compares the number of positive and negative reviews in the phone dataset.

In [None]:
# Split the data into train and test sets (33% held out, fixed seed)
x = df['clean_text']
y = df['Positivity']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# TF-IDF features feeding a multinomial Naive Bayes classifier
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(X_train, y_train)

predictions = text_clf.predict(X_test)

# Compute the confusion matrix once: it is printed here and kept in `cm`
# so the heatmap cell can reuse it.
cm = confusion_matrix(y_test, predictions)
print(cm)
print(classification_report(y_test, predictions))


This looks nice. We got an accuracy of ~86% on the test set.

In [None]:
fig, ax = plt.subplots()
# annot=True writes the count in each cell; fmt='g' avoids scientific notation
sns.heatmap(cm, annot=True, fmt='g', ax=ax)

# Labels, title and ticks. confusion_matrix orders rows and columns by sorted
# label value, so index 0 is the negative class and index 1 the positive class.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['Negative', 'Positive'])
ax.yaxis.set_ticklabels(['Negative', 'Positive']);

**True Positive (TP)**: The model predicts the positive class, and the actual class is positive.

**True Negative (TN)**: The model predicts the negative class, and the actual class is negative.

**False Positive (FP)**: The model predicts the positive class, but the actual class is negative.

**False Negative (FN)**: The model predicts the negative class, but the actual class is positive.

**Precision**: The proportion of positive predictions that were actually correct.

$$\text{Precision} = \frac{TP}{TP + FP}$$

**Recall/Sensitivity**: The proportion of actual positives that were identified correctly.

$$\text{Recall} = \frac{TP}{TP + FN}$$

**Accuracy**: The number of correct predictions as a proportion of all predictions made. All else being equal, the model with the higher accuracy is preferred, although accuracy alone can be misleading when the classes are imbalanced.

$$\text{Accuracy} = \frac{TP + TN}{TP + FP + FN + TN}$$

**F1 Score**: The harmonic mean of precision and recall, i.e. a single score that gives the two equal weight.

$$F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$$

