In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the women's clothing e-commerce reviews CSV from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv")

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Number of distinct values in each column.
df.nunique()

# Checking for missing values

In [None]:
# Percentage of missing values per column.
df.isna().sum().div(len(df)).mul(100)

We won't use the Title feature because a large share of its values are missing.

In [None]:
# Summary statistics of the numeric columns, transposed so each column becomes a row.
df.describe().T

In [None]:
# Drop the identifier/unused columns.
# Reassigning instead of using inplace=True, together with errors='ignore', keeps this cell
# safe to re-run: the original raised KeyError once the columns had already been removed.
df = df.drop(columns=['Clothing ID', 'Title', 'Unnamed: 0'], errors='ignore')

We are dropping these features because they carry very little significance for the sentiment analysis of the reviews.

In [None]:
# Show the rows whose review text is missing.
df[df['Review Text'].isnull()]

Removing the unwanted null values.

In [None]:
# Keep only rows that have review text.
# .copy() makes the result an independent frame, so the column assignments in later cells
# ('Review Text', 'polarity', 'review_len') do not trigger SettingWithCopyWarning.
df = df.loc[df['Review Text'].notna()].copy()

In [None]:
# (rows, columns) remaining after removing reviews without text.
df.shape

# Data Analysis and Visualization

In [None]:
# Preview the data after the column drop and null-review removal above.
df.head()

In [None]:
# Number of reviews per reviewer age.
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=df, x='Age', ax=ax)
plt.show()

In [None]:
# Number of reviews per rating value.
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=df, x='Rating', ax=ax)
plt.show()

In [None]:
# Number of reviews per clothing class; labels are rotated so the class names stay readable.
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=df, x='Class Name', ax=ax)
plt.xticks(rotation=45)
plt.show()

Dresses, Knits and Blouses are bought the most by women. 

In [None]:
# Mean positive-feedback count for each reviewer age.
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(data=df, x='Age', y='Positive Feedback Count', palette='viridis', ax=ax)
ax.set_title('Age vs Positive Feedback', fontsize=20)
plt.xticks(rotation=90)
plt.show()

There's no notable relation between Age and Positive Feedback, excluding some outliers.



In [None]:
# Distribution of the positive-feedback count.
# sns.distplot is deprecated (and removed in recent seaborn releases); histplot with a KDE
# overlay on a density scale is its documented replacement. bins=50 mirrors distplot's
# cap of 50 bins for this heavily skewed column.
plt.figure(figsize=(15, 6))
sns.histplot(df['Positive Feedback Count'], kde=True, stat='density', bins=50)
plt.show()

In [None]:
# Mean rating for each reviewer age.
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(data=df, x='Age', y='Rating', palette='viridis', ax=ax)
ax.set_title('Age vs Rating', fontsize=20)
plt.xticks(rotation=90)
plt.show()

There's no notable relation between Age and Rating, excluding some outliers. This is the same as for Age vs Positive Feedback.

In [None]:
# Rating distribution split by the recommendation flag.
plt.figure(figsize = (15,10))
sns.boxplot(x="Recommended IND", y="Rating", hue = "Recommended IND", data = df)
# The axis labels previously read "3-G" / "RAM" (left over from another notebook);
# they now describe the plotted columns.
plt.xlabel("Recommended IND", fontsize = 20)
plt.ylabel("Rating", fontsize = 20)

plt.show()

Here 1 means recommended. 0 means not recommended.

Women recommend a product if they rate it to be more than or equal to 3.

# Cleaning the text for visualization of polarity

Removing all the punctuations from the review text.

In [None]:
import string
string.punctuation
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without ASCII punctuation; all other characters (including
        whitespace) are kept in their original order.
    """
    # A translation table that maps each punctuation character to "delete" does the
    # filtering in a single C-level pass instead of a Python-level loop per character.
    return text.translate(str.maketrans('', '', string.punctuation))
# Strip punctuation from every review, then preview the result.
df['Review Text'] = df['Review Text'].map(remove_punctuation)
df.head()

In [None]:
!pip install TextBlob
from textblob import *

df['polarity'] = df['Review Text'].map(lambda text: TextBlob(text).sentiment.polarity)
df['polarity']

In [None]:
import plotly.express as px
# Interactive histogram of review polarity, coloured by rating; opacity < 1 keeps
# the overlapping rating groups visible.
px.histogram(df, x = 'polarity',color="Rating", opacity = 0.5)

In [None]:
# Polarity distribution per department, split by the recommendation flag.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, x='polarity', y='Department Name', hue='Recommended IND', ax=ax)
ax.set_xlabel("Polarity of the review", fontsize=20)
ax.set_ylabel("Department Name", fontsize=20)
plt.show()

This plot looks right because **the polarities of the "not recommended" reviews are lower than those of the "recommended" reviews** in every department.

# Reviews with positive polarity

In [None]:
# Print three randomly chosen reviews whose polarity is exactly 1.
example = df.loc[df['polarity'] == 1, 'Review Text'].sample(3)
for review_text in example:
    print(review_text)

# Reviews with neutral polarity

In [None]:
# Print three randomly chosen reviews whose polarity is exactly 0.5.
example = df.loc[df['polarity'] == 0.5, 'Review Text'].sample(3)
for review_text in example:
    print(review_text)

# Reviews with negative polarity

In [None]:
# Print three randomly chosen reviews with negative polarity.
example = df.loc[df['polarity'] < 0, 'Review Text'].sample(3)
for review_text in example:
    print(review_text)

# Polarity Pie-Chart

In [None]:
# Share (in percent) of reviews in each polarity bucket:
#   negative: polarity < 0
#   neutral : 0 <= polarity <= 0.5
#   positive: polarity > 0.5
# The neutral bucket previously used polarity > 0, so reviews with polarity exactly 0
# fell into no bucket and the three shares did not add up to 100%.
total_reviews = len(df)
negative = (df.polarity < 0).sum() / total_reviews * 100
positive = (df.polarity > 0.5).sum() / total_reviews * 100
neutral = df.polarity.between(0, 0.5).sum() / total_reviews * 100  # bounds are inclusive
plt.figure(figsize =(10, 7))
plt.pie([positive,negative,neutral], labels = ['Positive','Negative','Neutral'], autopct = '%1.1f%%')
plt.show()

# Create N-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def top_n_ngram(corpus,n = None,ngram = 1):
    """Return the most frequent n-grams in ``corpus`` with English stop words removed.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count n-grams in.
    n : int or None, default None
        How many of the most frequent n-grams to return; ``None`` returns all of them.
    ngram : int, default 1
        Size of the n-grams (1 = unigrams, 2 = bigrams, ...).

    Returns
    -------
    list of (str, count) tuples
        N-grams with their total count over the whole corpus, sorted by count in
        descending order.
    """
    vec = CountVectorizer(stop_words = 'english',ngram_range=(ngram,ngram))
    # fit_transform learns the vocabulary and builds the document-term matrix in one
    # pass, instead of tokenising the whole corpus twice with fit() then transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums = total occurrences of each n-gram across all documents.
    sum_words = bag_of_words.sum(axis =0)
    words_freq = [(word,sum_words[0,idx]) for word,idx in vec.vocabulary_.items()]
    words_freq.sort(key = lambda pair: pair[1],reverse = True)
    return words_freq[:n]

# Visualizing Top 10 Unigrams

In [None]:
# Ten most frequent unigrams and their counts, shown as a bar chart.
common_words = top_n_ngram(df['Review Text'], 10, 1)
data = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
plt.figure(figsize=(10, 5))
unigram_counts = data.groupby('ReviewText')['count'].sum().sort_values(ascending=False)
unigram_counts.plot(kind='bar', title='Top 10 unigrams in review after removing stop words')

# Visualizing Top 20 Bigrams

In [None]:
# Twenty most frequent bigrams and their counts, shown as a bar chart.
common_words = top_n_ngram(df['Review Text'], 20,2)
data = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
plt.figure(figsize =(10,5))
# The title previously read "Top 10 unigrams", copied from the unigram cell.
data.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
kind='bar', title='Top 20 bigrams in review after removing stop words')

# Visualizing Top 20 POS Tags

In [None]:
# Join the actual review texts into one document before tagging.
# str(df['Review Text']) is only the Series' truncated printed representation (a few
# rows plus index numbers and the dtype footer), so the tags were previously computed
# on that snippet rather than on the reviews. Tagging the full text takes noticeably longer.
blob = TextBlob(' '.join(df['Review Text'].astype(str)))
pos = pd.DataFrame(blob.tags,columns =['word','pos'])
# Keep the 20 most frequent part-of-speech tags.
pos1 = pos.pos.value_counts()[:20]
plt.figure(figsize = (10,5))
pos1.plot(kind='bar',title ='Top 20 Part-of-speech taggings')


# Correlation of features using Heatmaps

Adding Review Length as a feature

In [None]:
# Character length of each review (measured on the punctuation-stripped text).
df['review_len'] = df['Review Text'].astype(str).str.len()

In [None]:
# Separate the recommendation flag (target) from the remaining columns (features).
X = df.drop('Recommended IND', axis=1)
y = df['Recommended IND']

In [None]:
# Correlation is only defined for numeric columns. X still contains text columns
# (e.g. 'Review Text'), and DataFrame.corr() raises on those in pandas >= 2.0,
# so restrict the frame to numeric dtypes first.
sns.heatmap(X.select_dtypes(include='number').corr(), annot = True )

There's no strong correlation between any of the features.

# Statistical Description

In [None]:
# Binary sentiment label: 1 when polarity >= 0, otherwise 0.
# A vectorised comparison replaces the element-by-element Python loop and always yields
# exactly one label per row.
X['sentiment'] = (X['polarity'].astype(float) >= 0.0).astype(int)

# Descriptive statistics of every numeric column, per sentiment class.
X.groupby(X['sentiment']).describe().T

# Creating Bag Of Words Model

In [None]:
# Sanity check: features and target must have the same number of rows.
print("Shape of X: " , X.shape)
print("Shape of y: " , y.shape)

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [None]:
# Re-number the rows 0..n-1 so the positional loop index below is a valid label.
X.index = np.arange(len(X))
from tqdm import tqdm

# Loop-invariant setup, hoisted out of the loop: previously the stemmer and the stopword
# list were rebuilt for every review, and set(all_stopwords) for every single word.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep "not": it carries the negation that matters for sentiment
stopword_set = set(all_stopwords)

corpus = []
for i in tqdm(range(len(X))):
    # Keep letters only, lower-case, tokenise on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', X['Review Text'][i])
    review = review.lower().split()
    # Drop stop words and reduce each remaining word to its stem.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
# Show a few cleaned reviews only; displaying the whole list floods the notebook output.
corpus[:5]

In [None]:
# Disabled word-cloud experiment, kept for reference only.
# NOTE(review): `review` presumably still holds just the last cleaned review produced by the
# corpus loop, so this would draw a cloud for one review, not the whole corpus — confirm
# the intended input before re-enabling.
# from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
# wc= WordCloud(background_color="white", random_state=1,stopwords=STOPWORDS, max_words = 2000, width =1000, height = 1500)
# wc.generate(review)
# plt.figure(figsize=[10,10])
# plt.imshow(wc,interpolation="bilinear")
# plt.axis('off')
# plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Bag-of-words features: counts of the 3000 most frequent unigrams.
cv  = CV(max_features = 3000,ngram_range=(1,1))
# Keep the matrix sparse: train_test_split and the naive Bayes classifiers accept sparse
# input, and a dense reviews x 3000 array costs hundreds of MB for the same result.
X_cv = cv.fit_transform(corpus)
# np.asarray works whether y is still a Series or already an array, so this cell can be
# re-run (y = y.values raised AttributeError on the second run).
y = np.asarray(y)
X_train, X_test, y_train, y_test = train_test_split(X_cv, y, test_size = 0.20, random_state = 0)
classifier = BernoulliNB()
classifier.fit(X_train, y_train)

# Evaluate on the held-out 20%.
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy of the classifier: ",acc)
print("Confusion matrix is :\n",metrics.confusion_matrix(y_test,y_pred))
print("Classification report: \n" ,metrics.classification_report(y_test,y_pred))


In [None]:
# Test-set accuracy of the bag-of-words Bernoulli naive Bayes classifier.
acc

An accuracy score of 87.28% is pretty good.

# TF-IDF Technique

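Term Frequency–Inverse Document Frequency (TF-IDF) measures how distinctive a word is: a word's weight grows with how often it occurs in a review and shrinks with the number of reviews that contain it. It converts sentences to vectors (after tokenization and stemming/lemmatization).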

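The Bag of Words technique only counts how often each word occurs and treats every word as equally informative. This is where TF-IDF comes into play: it down-weights words that appear in most reviews and so highlights the words that characterise a particular review. Note that, like Bag of Words, TF-IDF still does not capture the semantic meaning of a word.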


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer as TV
# TF-IDF features over the 3000 most frequent unigrams.
tv  = TV(ngram_range =(1,1),max_features = 3000)
# Kept sparse: train_test_split and MultinomialNB accept sparse matrices, so converting
# to a dense reviews x 3000 float array only wastes memory.
X_tv = tv.fit_transform(corpus)

In [None]:
# Same 80/20 split and random_state as the bag-of-words model, now on the TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(X_tv, y, test_size = 0.20, random_state = 0)
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the training part.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [None]:
# Predict on the held-out part and compute the accuracy of the TF-IDF model.
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)

In [None]:
# Test-set accuracy of the TF-IDF multinomial naive Bayes classifier.
acc

An accuracy of 83.55% from the TF-IDF technique, which is less than that of Bag of Words Technique.

# Deep Learning Model

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Build the word index from the cleaned corpus; num_words = 3000 limits the vocabulary
# used when texts are later converted to sequences.
tokenizer = Tokenizer(num_words = 3000)
tokenizer.fit_on_texts(corpus)

In [None]:
# Convert every cleaned review to a sequence of word indices.
sequences = tokenizer.texts_to_sequences(corpus)
# Pad at the end ('post') so all sequences share one length.
padded = pad_sequences(sequences, padding='post')

In [None]:
# Print the first 11 entries of the tokenizer's word -> index mapping.
word_index = tokenizer.word_index
for token, token_id in list(word_index.items())[:11]:
    print(token, token_id)

These are the Top 11 most frequent words.

In [None]:
# Size of each learned word-embedding vector.
embedding_dim = 64
# Embedding -> average over the sequence -> small dense layer -> single sigmoid output.
model = tf.keras.Sequential([
    # 3000 input indices, matching Tokenizer(num_words = 3000) used to build the sequences.
    tf.keras.layers.Embedding(3000, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    # One sigmoid unit for the binary recommended / not recommended target.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

In [None]:
# Number of passes over the training data.
num_epochs = 10

# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked during training.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# Train on all padded sequences for num_epochs epochs.
# NOTE(review): no validation data or validation_split is passed, so the accuracy reported
# while training is training accuracy — confirm whether a hold-out evaluation is intended.
model.fit(padded,y,epochs= num_epochs)

In [None]:
sample_string = "I Will tell my friends for sure"
# Clean the sample exactly like the training corpus (letters only, lower-case, stop words
# removed, stemmed); otherwise its words do not match the vocabulary the model was trained on.
sample_words = re.sub('[^a-zA-Z]', ' ', sample_string).lower().split()
sample_stopwords = set(all_stopwords)
sample_clean = ' '.join(ps.stem(word) for word in sample_words if word not in sample_stopwords)
# texts_to_sequences expects a list of texts. Passing the bare string made it iterate
# character by character, producing one (mostly empty) sequence per character.
sample = tokenizer.texts_to_sequences([sample_clean])
# Stored as a column (tokens x 1) because the following cells transpose it before predicting.
padded_sample = pad_sequences(sample, padding='post').T

In [None]:
# Transposed sample: the array that is passed to model.predict in the next cell.
padded_sample.T

In [None]:
# Sigmoid output of the model for the transposed sample.
model.predict(padded_sample.T)

The model predicts a probability of 99.45% that this review will result in a recommendation.