In [3]:
import numpy as np
import pandas as pd 
import sys

In [4]:
df1 = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df = df1[['Review Text', 'Rating', 'Class Name', 'Age']]
df1.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


## Counting the Occurrences of Certain words

In [5]:
# Importing CountVectorizer() 
from sklearn.feature_extraction.text import CountVectorizer


# fill the NA values by space 
df['Review Text'] = df['Review Text'].fillna('')

# Count Vectorizer() convert a collection of text
# to a matrix of token count 
vectorizer = CountVectorizer() 

# Assigning a hosrter name for the analysze tokenizes the string 
analyzer = vectorizer.build_analyzer()

def wordcounts(s):
    c = {}
    # tokenize the string and continue, if it is not empty 
    if analyzer(s):
        d = {}
        # Find counts of vocabularies and transform to arrays 
        w = vectorizer.fit_transform([s]).toarray()
        # Vocabulary and index (index of w)
        vc = vectorizer.vocabulary_
        # item's transform the dictionary's (word, index) tuple pairs 
        for k, v in vc.items(): 
            d[v] = k  # d --> index: work
        for index, i in enumerate(w[0]):
            c[d[index]] = i  # c -> word: count 
    return c

# add new column to the dataframe 
df['Word Counts'] = df['Review Text'].apply(wordcounts)
df.head()    
    

Unnamed: 0,Review Text,Rating,Class Name,Age,Word Counts
0,Absolutely wonderful - silky and sexy and comf...,4,Intimates,33,"{'absolutely': 1, 'and': 2, 'comfortable': 1, ..."
1,Love this dress! it's sooo pretty. i happene...,5,Dresses,34,"{'am': 1, 'and': 2, 'bc': 2, 'be': 1, 'below':..."
2,I had such high hopes for this dress and reall...,3,Dresses,60,"{'and': 3, 'be': 1, 'bottom': 1, 'but': 2, 'ch..."
3,"I love, love, love this jumpsuit. it's fun, fl...",5,Pants,50,"{'and': 1, 'but': 1, 'compliments': 1, 'every'..."
4,This shirt is very flattering to all due to th...,5,Blouses,47,"{'adjustable': 1, 'all': 1, 'and': 1, 'any': 1..."


## Splitting the data into Train and Test 

In [6]:
from sklearn.model_selection import train_test_split
# Rating of 4 or higher -> positive, while the ones with 
# Rating of 2 or lower -> negative 
# Rating of 3 -> neutral 

df = df[df['Rating'] != 3]
df['Sentiment'] = df['Rating'] >= 4
df.head() 

# split data 
train_data, test_data = train_test_split(df, train_size=0.8, random_state = 0)

# Select the columns and prepare data for the models 
X_train = vectorizer.fit_transform(train_data['Review Text'])
y_train = train_data['Sentiment']

X_test = vectorizer.transform(test_data['Review Text'])
y_test = test_data['Sentiment']

## Logistic Regression 

In [7]:
import datetime as dt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

start = dt.datetime.now() 
lr = LogisticRegression() 
lr.fit(X_train, y_train)
print('Elapsed Time: ', str(dt.datetime.now() - start))

Elapsed Time:  0:00:00.657765


## Naive Bayes

In [8]:
from sklearn.naive_bayes import MultinomialNB
start = dt.datetime.now() 
nb = MultinomialNB() 
nb.fit(X_train, y_train)
print('Elapsed Time: ', str(dt.datetime.now() - start))

Elapsed Time:  0:00:00.005823


## Support Vector Machine (SVM)

In [9]:
from sklearn.svm import SVC
svm = SVC() 
svm.fit(X_train, y_train)
print('Elapsed Time: ', str(dt.datetime.now() - start))

Elapsed Time:  0:00:33.806275


## Neural Network

In [10]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier() 
nn.fit(X_train, y_train)
print('Elapsed Time: ', str(dt.datetime.now() - start))

Elapsed Time:  0:02:18.949956


## Evaluating Models

In [11]:
# Adding Results to the Dataframe

df2 = test_data.copy() 
df2['Logistic Regression'] = lr.predict(X_test)
df2['Naive Bayes'] = nb.predict(X_test)
df2['SVM'] = svm.predict(X_test)
df2['Neural Network'] = nn.predict(X_test)
df2.head() 

Unnamed: 0,Review Text,Rating,Class Name,Age,Word Counts,Sentiment,Logistic Regression,Naive Bayes,SVM,Neural Network
261,This top is very cute. got it in the lighter c...,5,Blouses,50,"{'also': 1, 'and': 1, 'blue': 1, 'color': 2, '...",True,True,True,True,True
6466,I love the color of this top. it is a dark oli...,4,Knits,44,"{'and': 3, 'at': 1, 'blazer': 1, 'casual': 1, ...",True,True,True,True,True
9853,I like the color and design. it looked super c...,4,Dresses,32,"{'and': 3, 'be': 1, 'before': 1, 'brought': 1,...",True,False,False,False,False
20688,"I love this tunic, the detail the weight, the ...",4,Sweaters,54,"{'better': 1, 'big': 1, 'but': 1, 'detail': 1,...",True,True,True,True,True
10497,I ordered this dress online. the colors are pr...,2,Dresses,34,"{'and': 2, 'are': 2, 'baby': 1, 'bump': 1, 'ch...",False,True,True,True,True


## Checking Accuracy of the model 

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score_lr = accuracy_score(y_test, df2['Logistic Regression'])
accuracy_score_nb = accuracy_score(y_test, df2['Naive Bayes'])
accuracy_score_svm = accuracy_score(y_test, df2['SVM'])
accuracy_score_nn = accuracy_score(y_test, df2['Neural Network'])


print('Logistic Regression : ', accuracy_score_lr)
print('Naive Bayes : ', accuracy_score_nb)
print('SVC : ', accuracy_score_svm)
print('Neural Network : ', accuracy_score_nn)


Logistic Regression :  0.9396070822216832
Naive Bayes :  0.9417899587678875
SVC :  0.9289352413291293
Neural Network :  0.9323308270676691
