In [2]:
# General
import numpy as np
import pandas as pd
import nltk
import random
import os
from os import path
from PIL import Image

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS

# Set Plot Theme
sns.set_palette([
    "#30a2da",
    "#fc4f30",
    "#e5ae38",
    "#6d904f",
    "#8b8b8b",
])
# Alternate # plt.style.use('fivethirtyeight')

# Pre-Processing
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from sklearn.utils import resample
from sklearn.utils import shuffle

# Modeling
import statsmodels.api as sm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk.util import ngrams
from collections import Counter
from gensim.models import word2vec

# Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read and Peak at Data
df = pd.read_csv(r'C:\Users\User\Documents\Data Science\Datasets\womens clothing sample\Womens Clothing E-Commerce Reviews.csv')
df.drop(df.columns[0],inplace=True, axis=1)

In [4]:
#df=df.sample(500)

In [5]:
# Delete missing observations for following variables
for x in ["Division Name","Department Name","Class Name","Review Text"]:
    df = df[df[x].notnull()]

# Extracting Missing Count and Unique Count by Column
unique_count = []
for x in df.columns:
    unique_count.append([x,len(df[x].unique()),df[x].isnull().sum()])

# Missing Values
print("Missing Values: {}".format(df.isnull().sum().sum()))

# Data Dimensions
print("Dataframe Dimension: {} Rows, {} Columns".format(*df.shape))

# Create New Variables: 
# Word Length
df["Word Count"] = df['Review Text'].str.split().apply(len)
# Character Length
df["Character Count"] = df['Review Text'].apply(len)
# Boolean for Positive and Negative Reviews
df["Label"] = 0
df.loc[df.Rating >= 3,["Label"]] = 1

Missing Values: 2966
Dataframe Dimension: 22628 Rows, 10 Columns


In [6]:
pd.set_option('max_colwidth', 500)
df[["Title","Review Text", "Rating"]].sample(7)

Unnamed: 0,Title,Review Text,Rating
14350,Love it!,I love this top. i also have it in blue! i wish it came in more colors/ prints! i would definitely buy it!,5
18418,,"Luckily i found this in the store though it is on backorder online, so if you really want it - go to/call the store. it's effortless, fun, beautiful, and nicely detailed. love this top. i'm 5'5"" and usually wear size 6, and i bought a small. really worth the price b/c you can just throw it on and look fabulous. layers well. so happy i got this.",5
17576,Fun swingy black dress,This is a pretty basic black dress with a swingy silhouette. the slip that comes with it is much tighter than the dress. the transparent sleeves and open-work details help it look a bit airier even though the shape is loose. the neckline has a flattering v-opening. i was able to try on a petite size at my local store (i suppose it was a returned item). the petite was still a bit long for me; it did not hit me above the knee but rather at or slightly below the knee. the dress was cute and com...,4
21211,Great jumper :),Love these! i call them my work overalls because thats pretty much what they are. the pictures of course don't show it but the sides are cut all the way to the waist band at the beginning of the pant. they were very flattering and quite comfy. these are just something easy to throw on and go to work in. i ordered these in a medium and they fit me perfectly. i do wish the top area was slightly wider to accommodate larger busts.,4
17592,Cute and classy,Very nice polka-dot pattern. i so think it is versatile and can go with a lot of different colors. i am worn it was different colors underneath and both a skirt and jeans. it is a timeless classic!!,5
12284,,"Fit is oversized but intentional. flattering on. love the neckline. modesty might call for a tank top underneath if bending over. fabric is soft, stretchy and has a good quality heft to it.",5
8888,,"I think i wanted a skirt like this in the 1980s. with its aged denim and retro style, it looks like i've had it all that time. i couldn't zip my usual 4 but the 6 is comfortable if a bit snug, sitting right at my natural waist. it's flattering and shows off my waist with tops tucked in, something i never do with my other skirts, which all sit lower. i like how it flares a little oddly due to the horizontal seam. it's interesting and different. i will wear it in summer with sandals and it wil...",5


In [7]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
#ps = LancasterStemmer()
ps = PorterStemmer()

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

def preprocessing(data):
    txt = data.str.lower().str.cat(sep=' ') #1
    words = tokenizer.tokenize(txt) #2
    words = [w for w in words if not w in stop_words] #3
    #words = [ps.stem(w) for w in words] #4
    return words

In [8]:
# Pre-Processing
SIA = SentimentIntensityAnalyzer()
df["Review Text"]= df["Review Text"].astype(str)

# Applying Model, Variable Creation
df['Polarity Score']=df["Review Text"].apply(lambda x:SIA.polarity_scores(x)['compound'])
df['Neutral Score']=df["Review Text"].apply(lambda x:SIA.polarity_scores(x)['neu'])
df['Negative Score']=df["Review Text"].apply(lambda x:SIA.polarity_scores(x)['neg'])
df['Positive Score']=df["Review Text"].apply(lambda x:SIA.polarity_scores(x)['pos'])

# Threshold

th=0.3


# Converting 0 to 1 Decimal Score to a Categorical Variable
df['Sentiment']=''
df.loc[df['Polarity Score']>th,'Sentiment']='Positive'
df.loc[df['Polarity Score'].between(-th, +th),'Sentiment']='Neutral'
#df.loc[df['Polarity Score']==0.1,'Sentiment']='Neutral'
df.loc[df['Polarity Score']<-th,'Sentiment']='Negative'

In [9]:
df_pos=df[df["Sentiment"] == 'Positive']
df_neu=df[df["Sentiment"] == 'Neutral']
df_neg=df[df["Sentiment"] == 'Negative']

In [10]:
print(df_pos.shape)
print(df_neu.shape)
print(df_neg.shape)

(20259, 18)
(1443, 18)
(926, 18)


In [11]:
df_neu_upsampled = resample(df_neu, 
                                 replace=True,     # sample with replacement
                                 n_samples= df_pos.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
df_neg_upsampled = resample(df_neg, 
                                 replace=True,     # sample with replacement
                                 n_samples= df_pos.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
df = pd.concat([df_pos, df_neu_upsampled,df_neg_upsampled])

In [12]:
df_pos=df[df["Sentiment"] == 'Positive']
df_neu=df[df["Sentiment"] == 'Neutral']
df_neg=df[df["Sentiment"] == 'Negative']

In [13]:
print(df_pos.shape)
print(df_neu.shape)
print(df_neg.shape)

(20259, 18)
(20259, 18)
(20259, 18)


In [14]:
df_pos=df[df["Sentiment"] == 'Positive']
df_neu=df[df["Sentiment"] == 'Neutral']
df_neg=df[df["Sentiment"] == 'Negative']

In [15]:
print(df_pos.shape)
print(df_neu.shape)
print(df_neg.shape)

(20259, 18)
(20259, 18)
(20259, 18)


In [16]:
df['tokenized'] = df["Review Text"].astype(str).str.lower() # Turn into lower case text
df['tokenized'] = df.apply(lambda row: tokenizer.tokenize(row['tokenized']), axis=1) # Apply tokenize to each row
df['tokenized'] = df['tokenized'].apply(lambda x: [w for w in x if not w in stop_words]) # Remove stopwords from each row
df['tokenized'] = df['tokenized'].apply(lambda x: [ps.stem(w) for w in x]) # Apply stemming to each row
all_words = nltk.FreqDist(preprocessing(df['Review Text'])) # Calculate word occurrence from whole block of text

vocab_count = 200
word_features= list(all_words.keys())[:vocab_count] # 2000 most recurring unique words
print("Number of words columns (One Hot Encoding): {}".format(len(all_words)))

Number of words columns (One Hot Encoding): 14034


In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
import scikitplot as skplt

In [18]:
vect = TfidfVectorizer()
vect.fit(df["Review Text"])
X = vect.transform(df["Review Text"])

In [19]:
y = df["Sentiment"].copy()

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=23, stratify=y)

In [20]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
lr=model.fit(X_train, y_train)
print("Train Set Accuracy: {}".format(metrics.accuracy_score(model.predict(X_train), y_train)))
#print("Train Set ROC: {}\n".format(metrics.roc_auc_score(model.predict(X_train), y_train)))

print("Validation Set Accuracy: {}".format(metrics.accuracy_score(model.predict(X_valid), y_valid)))
#print("Validation Set ROC: {}".format(metrics.roc_auc_score(model.predict(X_valid), y_valid)))

Train Set Accuracy: 0.9510705250817548
Validation Set Accuracy: 0.9324613359657782


In [32]:
text= vect.transform(["nice dress"])
pred = lr.predict(text)
print("The predicted Sentiment is",pred)

The predicted Sentiment is ['Positive']


The predicted Sentiment is ['Neutral']


array(['Neutral'], dtype=object)

In [17]:
print(metrics.classification_report(model.predict(X_valid), y_valid))

              precision    recall  f1-score   support

          No       0.56      0.78      0.65       596
         Yes       0.96      0.91      0.94      3930

    accuracy                           0.89      4526
   macro avg       0.76      0.84      0.79      4526
weighted avg       0.91      0.89      0.90      4526

