## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import cufflinks as cf
from plotly.offline import iplot
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Initialise plotly's notebook mode and switch cufflinks to offline mode;
# the `.iplot(...)` calls later in the notebook go through cufflinks.
py.offline.init_notebook_mode(connected=True)
cf.go_offline()

## Loading the dataset

In [None]:
# Load the reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv')

In [None]:
# Preview the first three rows of the raw data
df.head(3)

In [None]:
# (rows, columns) of the raw frame, before any columns are dropped
df.shape

<b> The dataset has 23,486 records and 11 columns (8 remain after the unwanted columns are dropped below). </b>

In [None]:
# Remove the columns that are not referenced again in this notebook

unused_columns = ['Unnamed: 0', 'Clothing ID', 'Title']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Column dtypes and non-null counts after dropping the unused columns
df.info()

<b> The dataset has 4 integer and 4 object columns. </b>

In [None]:
# Count the missing values in each column

df.isnull().sum()

In [None]:
# Drop, in place, every row that has a null in any of the remaining columns

df.dropna(inplace=True)

In [None]:
# Re-count missing values per column; after dropna every count should be 0

df.isnull().sum()

In [None]:
# Show rows that exactly repeat an earlier row

is_repeat = df.duplicated()
duplicate = df.loc[is_repeat]
duplicate

<b> There are 3 duplicate records. </b>

In [None]:
# Remove repeated rows in place (drop_duplicates keeps the first occurrence by default)

df.drop_duplicates(inplace=True)

In [None]:
# Confirm that no repeated rows remain (the result should be empty)

still_repeated = df.duplicated()
duplicate = df.loc[still_repeated]
duplicate

<b> Hence, all duplicate records are removed. </b>

In [None]:
# Summary statistics for the numeric columns
df.describe()

In [None]:
# Summary for the object (text/categorical) columns
df.describe(include='object')

In [None]:
# Give the columns short, space-free names used by the rest of the notebook

df.rename(
    columns={
        'Review Text': 'Review',
        'Recommended IND': 'Recommended',
        'Positive Feedback Count': 'PositiveFeedback',
        'Division Name': 'Division',
        'Department Name': 'Department',
        'Class Name': 'Class',
    },
    inplace=True,
)

## Text Preprocessing

In [None]:
# Expanding contractions

# Dictionary of English Contractions
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Kept so the function can tell whether it was called with the built-in mapping
_DEFAULT_CONTRACTIONS = contractions_dict


def _compile_contractions_re(mapping):
    """Compile a single alternation regex that matches any key of `mapping`."""
    # Regex alternation takes the first alternative that matches, so longer
    # keys must come first: otherwise "can't" wins over "can't've" and leaves
    # a dangling "'ve" behind. Keys are escaped so they always match literally.
    longest_first = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in longest_first))


# Regular expression for finding contractions
contractions_re = _compile_contractions_re(contractions_dict)

# Function for expanding contractions
def expand_contractions(text, contractions_dict=contractions_dict):
    """Replace every contraction found in `text` with its expansion.

    Parameters
    ----------
    text : str
        The text to process. Matching is case-sensitive and is not anchored
        to word boundaries.
    contractions_dict : dict of str -> str, optional
        Mapping from contraction to replacement. Defaults to the module-level
        dictionary; a custom mapping gets its own regex so every match is
        guaranteed to have an entry.

    Returns
    -------
    str
        `text` with each matched key replaced by its mapped value.
    """
    if not contractions_dict:
        # An empty mapping would compile to a pattern matching the empty string
        return text

    if contractions_dict is _DEFAULT_CONTRACTIONS:
        pattern = contractions_re
    else:
        pattern = _compile_contractions_re(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, text)

# Expand the contractions in every review text
df['Review'] = df['Review'].apply(expand_contractions)

In [None]:
# Per-review features: TextBlob sentiment polarity, character count, word count
df['polarity'] = df['Review'].map(lambda text: TextBlob(text).sentiment.polarity)
df['ReviewLen'] = df['Review'].map(len)
df['WordCount'] = df['Review'].map(lambda text: len(text.split()))

In [None]:
# Histogram of the per-review TextBlob polarity scores (drawn via cufflinks' .iplot)

df['polarity'].iplot(kind='hist', xTitle='Polarity', yTitle='Count', title='Distribution of Polarity')

In [None]:
# Number of reviews per Division

px.histogram(df, x='Division', title='Count of Division')

In [None]:
# Number of reviews per Department

px.histogram(df, x='Department', title='Count of Department')

In [None]:
# Number of reviews per Class

px.histogram(df, x='Class', title='Count of Class')

<b> Most clothes belong to the General division and the Tops department, and are Dresses or Knits. </b>

In [None]:
# Stacked histogram of reviewer Age, with each bar split by Rating

px.histogram(df, x='Age', color='Rating', barmode='stack', title='Distribution of Rating and Age')

<b> Reviewers in their 30s account for the largest number of ratings. </b>

In [None]:
# Histogram (50 bins) of ReviewLen, the number of characters in each review

df['ReviewLen'].iplot(kind='hist', bins=50, xTitle='Review Length', yTitle='Count', title='Distribution of Review Length')

<b> Most reviews are 200-300 characters long. </b>

In [None]:
# Histogram (50 bins) of WordCount, the number of whitespace-separated words in each review

df['WordCount'].iplot(kind='hist', bins=50, xTitle='Word count', yTitle='Count', title='Distribution of Word count')

<b> Most reviews have 30-40 words. </b>

In [None]:
# Total of the PositiveFeedback column for each Recommended value. This is a
# sum of the feedback counts attached to reviews, not a count of reviews.
df_pos = df.groupby('Recommended')['PositiveFeedback'].sum()
df_pos

<b> Reviews that recommend the product received 45,438 positive-feedback votes in total, whereas reviews that do not recommend it received 14,114. </b>

In [None]:
# Mean Rating for each Recommended value
df_rat = df.groupby('Recommended')['Rating'].mean()
df_rat

<b> The mean rating for products which were recommended is 4.59 and for products which weren't recommended is 2.30. </b>

In [None]:
# Total of the PositiveFeedback column for each Rating value
df_ratp = df.groupby('Rating')['PositiveFeedback'].sum()
df_ratp

<b> Maximum positive feedback is for products with rating 4 followed by products with rating 5. </b>

## N-Gram analysis before removing stopwords

### 1. Unigram

In [None]:
def get_top_n_words(x, n):
    """Return the `n` most frequent tokens in `x` as (token, count) pairs.

    `x` is a collection of text documents passed to CountVectorizer with its
    default settings. Counts are summed over all documents and the pairs are
    ordered from most to least frequent.
    """
    vectorizer = CountVectorizer()
    totals = vectorizer.fit_transform(x).sum(axis=0)
    ranked = sorted(
        ((token, totals[0, col]) for token, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [None]:
# Top 20 unigrams of the reviews, shown as a bar chart

words = get_top_n_words(df['Review'], 20)

df_uni = pd.DataFrame(words, columns=['Unigram', 'Frequency']).set_index('Unigram')
df_uni.iplot(kind='bar', xTitle='Unigram', yTitle='Count', title='Top 20 Unigram Words')

### 2. Bigram

In [None]:
def get_top_nwords(x, n, i):
    """Return the `n` most frequent `i`-grams in `x` as (ngram, count) pairs.

    `x` is a collection of text documents. Only n-grams of exactly `i` words
    are counted (ngram_range=(i, i)); counts are summed over all documents
    and the pairs are ordered from most to least frequent.
    """
    vectorizer = CountVectorizer(ngram_range=(i, i))
    totals = vectorizer.fit_transform(x).sum(axis=0)

    freq_pairs = []
    for ngram, col in vectorizer.vocabulary_.items():
        freq_pairs.append((ngram, totals[0, col]))

    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]

In [None]:
# Top 20 bigrams of the reviews, shown as a bar chart

words = get_top_nwords(df['Review'], 20, 2)

df_bi = pd.DataFrame(words, columns=['Bigram', 'Frequency']).set_index('Bigram')
df_bi.iplot(kind='bar', xTitle='Bigram', yTitle='Count', title='Top 20 Bigram Words')

### 3. Trigram

In [None]:
# Top 20 trigrams of the reviews, shown as a bar chart
words = get_top_nwords(df['Review'], 20, 3)

df_tri = pd.DataFrame(words, columns=['Trigram', 'Frequency']).set_index('Trigram')
df_tri.iplot(kind='bar', xTitle='Trigram', yTitle='Count', title='Top 20 Trigram Words')

In [None]:
# Cleaning Review column

def clean_review(text):
    """Normalise one review string.

    Applies, in order: lowercasing, removal of digits and of any word that
    contains a digit, removal of every character that is neither a word
    character nor whitespace, and collapsing of runs of spaces into one.
    """
    text = text.lower()
    # Raw strings: '\w' and '\d' are invalid escapes in a normal string literal
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return re.sub(' +', ' ', text)


# One pass over the column instead of one pass per cleaning step
df['Review'] = df['Review'].apply(clean_review)

In [None]:
# Applying lemmatization and removing stopwords

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Load the stopword list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension rebuilt the list for
# every token of every review and then scanned it linearly.
english_stopwords = frozenset(stopwords.words('english'))

def lemmatize_text(text):
    """Drop English stopwords from `text` and lemmatize the remaining tokens.

    The text is split on whitespace; tokens found in NLTK's English stopword
    list are discarded, the rest are passed through WordNetLemmatizer, and the
    result is joined back into a single space-separated string.
    """
    return ' '.join(
        lemmatizer.lemmatize(w)
        for w in w_tokenizer.tokenize(text)
        if w not in english_stopwords
    )

df['Review'] = df.Review.apply(lemmatize_text)

## N-Gram analysis after removing stopwords

### 1. Unigram

In [None]:
# Top 20 unigrams of the cleaned reviews, shown as a bar chart
words = get_top_n_words(df['Review'], 20)

df_uni = pd.DataFrame(words, columns=['Unigram', 'Frequency']).set_index('Unigram')
df_uni.iplot(kind='bar', xTitle='Unigram', yTitle='Count', title='Top 20 Unigram Words')

### 2. Bigram

In [None]:
# Top 20 bigrams of the cleaned reviews, shown as a bar chart

words = get_top_nwords(df['Review'], 20, 2)

df_bi = pd.DataFrame(words, columns=['Bigram', 'Frequency']).set_index('Bigram')
df_bi.iplot(kind='bar', xTitle='Bigram', yTitle='Count', title='Top 20 Bigram Words')

### 3. Trigram

In [None]:
# Top 20 trigrams of the cleaned reviews, shown as a bar chart
words = get_top_nwords(df['Review'], 20, 3)

df_tri = pd.DataFrame(words, columns=['Trigram', 'Frequency']).set_index('Trigram')
df_tri.iplot(kind='bar', xTitle='Trigram', yTitle='Count', title='Top 20 Trigram Words')

In [None]:
# Word cloud built from the reviews of recommended products

# Rows where Recommended == 1
df_true = df.loc[df['Recommended'] == 1]

# All of those reviews concatenated into one space-separated string
text_true = " ".join(df_true['Review'])

text_cloud = WordCloud(collocations=False, background_color='black').generate(text_true)
plt.axis("off")
plt.imshow(text_cloud, interpolation='bilinear')

In [None]:
# Wordcloud of Review in not Recommended product

# Cleaned dataframe of Not Recommended
df_false = df[df.Recommended == 0]

# The text must come from df_false; joining df_true's reviews here would just
# redraw the Recommended word cloud.
text_false = " ".join(txt for txt in df_false['Review'])

text_cloud = WordCloud(collocations=False, background_color='black').generate(text_false)
plt.axis("off")
plt.imshow(text_cloud, interpolation='bilinear')

In [None]:
# Pie chart of polarity buckets (share of reviews, in percent)
#   negative: polarity < 0
#   neutral:  0 <= polarity <= 0.5
#   positive: polarity > 0.5

polarity = df['polarity']
negative = (polarity < 0).mean() * 100
positive = (polarity > 0.5).mean() * 100
# Includes polarity == 0, so that the three buckets cover every review
neutral = polarity.between(0, 0.5).mean() * 100

plt.figure(figsize =(10, 7)) 
plt.pie([positive,negative,neutral], labels = ['Positive','Negative','Neutral'], autopct='%1.1f%%') 
plt.title('Share of Reviews by Polarity')
plt.show()

In [None]:
# Reviews with the maximum positive polarity (polarity == 1)

pos = df.loc[df.polarity == 1,['Review']]
# Never ask for more rows than exist; a fixed seed keeps the printed examples
# the same on every run.
pos = pos.sample(min(3, len(pos)), random_state=42).values
for i in pos:
    print(i)

In [None]:
# Reviews with negative polarity (polarity < 0)

neg = df.loc[df.polarity < 0,['Review']]
# Never ask for more rows than exist; a fixed seed keeps the printed examples
# the same on every run.
neg = neg.sample(min(3, len(neg)), random_state=42).values
for i in neg:
    print(i)

In [None]:
# Reviews with neutral polarity (polarity == 0)

neu = df.loc[df.polarity == 0,['Review']]
# Never ask for more rows than exist; a fixed seed keeps the printed examples
# the same on every run.
neu = neu.sample(min(3, len(neu)), random_state=42).values
for i in neu:
    print(i)

In [None]:
# Overlaid polarity histograms, one per Recommended value

x1 = df.loc[df['Recommended'] == 1, 'polarity']
x0 = df.loc[df['Recommended'] == 0, 'polarity']

trace0 = go.Histogram(x=x1, name='Recommended', opacity=0.8)
trace1 = go.Histogram(x=x0, name='Not Recommended', opacity=0.6)

data = [trace0, trace1]
layout = go.Layout(
    barmode='overlay',
    title='Distribution of Sentiment Polarity of Reviews Based On The Recommendation ',
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Overlaid polarity histograms, one per Rating value (1-5)

# Polarity scores of the reviews at each rating
r1 = df.loc[df['Rating'] == 1, 'polarity']
r2 = df.loc[df['Rating'] == 2, 'polarity']
r3 = df.loc[df['Rating'] == 3, 'polarity']
r4 = df.loc[df['Rating'] == 4, 'polarity']
r5 = df.loc[df['Rating'] == 5, 'polarity']

# One trace per rating; opacity rises with the rating
rat1 = go.Histogram(x=r1, name='1', opacity=0.5)
rat2 = go.Histogram(x=r2, name='2', opacity=0.6)
rat3 = go.Histogram(x=r3, name='3', opacity=0.7)
rat4 = go.Histogram(x=r4, name='4', opacity=0.8)
rat5 = go.Histogram(x=r5, name='5', opacity=0.9)

data = [rat1, rat2, rat3, rat4, rat5]
layout = go.Layout(
    barmode='overlay',
    title='Distribution of Sentiment Polarity of Reviews Based On The Rating',
)
fig = go.Figure(data=data, layout=layout)
fig.show()