In [None]:
# !pip install plotly
# !pip install cufflinks
# !pip install textblob

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import plotly as py
import cufflinks as cf

In [None]:
from plotly.offline import iplot

In [None]:
# Put plotly and cufflinks into offline mode before any `.iplot(...)` call below.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly docs.
py.offline.init_notebook_mode(connected=True)
cf.go_offline()

## Data Import 

In [None]:
# Load the reviews CSV; its first column becomes the row index.
Reviews_df = pd.read_csv(
    '../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv',
    index_col=0,
)
Reviews_df.head()

In [None]:
# 'Title' and 'Clothing ID' are not referenced by the analysis shown in this notebook.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns that
# were already removed.
Reviews_df = Reviews_df.drop(columns=['Title', 'Clothing ID'], errors='ignore')

In [None]:
Reviews_df.head()

In [None]:
Reviews_df.isnull().sum()

In [None]:
# Discard rows that lack a review text or a division name, then show the result.
Reviews_df.dropna(
    subset=['Review Text', 'Division Name'],
    inplace=True,
)
Reviews_df

In [None]:
Reviews_df.isnull().sum()

In [None]:
# Preview only the first 1,000 characters of the concatenated reviews (same
# slice the later print cell uses); echoing the full string would embed every
# review in this cell's output.
' '.join(Reviews_df['Review Text'].tolist())[:1000]

## Text Cleaning

In [None]:
# Lookup table: contraction (lower-case) -> expanded form.
# contractions_to_expression walks this dict in insertion order and applies
# each entry with str.replace, so the order of the keys matters.
# NOTE(review): the list stops at "wasn't"; later contractions such as
# "we're", "won't" or "you're" have no entry here.
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
# Chat-style abbreviations: the keys include the surrounding spaces, so they
# only match when the token has a space on both sides.
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [None]:
def contractions_to_expression(x, mapping=None):
    """Expand contractions in ``x`` using a lookup table.

    Matching is plain, case-sensitive substring replacement (``str.replace``).

    Parameters
    ----------
    x : object
        Text to clean. Anything that is not a ``str`` is returned unchanged.
    mapping : dict, optional
        ``{contraction: expansion}`` table. When omitted, the notebook-level
        ``contractions`` dict is used.

    Returns
    -------
    object
        ``x`` with backslashes removed and every key of ``mapping`` replaced
        by its value, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    if mapping is None:
        mapping = contractions
    x = x.replace('\\', '')
    # Replace the longest keys first. In dict order "can't" is rewritten before
    # "can't've" is tried, turning "can't've" into "cannot've" so the longer
    # entry can never match (same for "he'd" vs "he'd've", etc.).
    for key in sorted(mapping, key=len, reverse=True):
        x = x.replace(key, mapping[key])
    return x

In [None]:
%%time
Reviews_df['Review Text'] = Reviews_df['Review Text'].apply(lambda x: contractions_to_expression(x))

In [None]:
Reviews_df.head()

In [None]:
print(' '.join(Reviews_df['Review Text'].tolist())[:1000])

## Feature Engineering 

In [None]:
from textblob import TextBlob

In [None]:
# TextBlob sentiment polarity of each review.
Reviews_df['polarity'] = Reviews_df['Review Text'].apply(
    lambda review: TextBlob(review).sentiment.polarity
)

In [None]:
# Number of characters in each review.
Reviews_df['review_len'] = Reviews_df['Review Text'].apply(len)

In [None]:
# Number of whitespace-separated tokens in each review.
Reviews_df['word_count'] = Reviews_df['Review Text'].apply(
    lambda review: len(review.split())
)

In [None]:
def average_word_length(x):
    """Return the mean number of characters per whitespace-separated word.

    Parameters
    ----------
    x : str
        Text to measure.

    Returns
    -------
    float
        Total characters across all words divided by the number of words,
        or ``0.0`` when ``x`` contains no words (empty or whitespace-only).
    """
    words = x.split()
    # Guard against ZeroDivisionError on empty / whitespace-only text.
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
# Mean word length of each review.
Reviews_df['avg_word_len'] = Reviews_df['Review Text'].apply(average_word_length)

In [None]:
Reviews_df.head(5)

## Distribution of Sentiment Polarity 

In [None]:
# Histogram of the polarity column, 25 bins.
Reviews_df['polarity'].iplot(
    kind='hist',
    bins=25,
    colors='blue',
    xTitle='Polarity',
    yTitle='Count',
    title='Sentiment_Polarity_Distribution',
)

## Distribution of Review Ratings and Reviewer Ages

In [None]:
# Histogram of the Rating column, 15 bins.
Reviews_df['Rating'].iplot(
    kind='hist',
    bins=15,
    colors='red',
    xTitle='Rating',
    yTitle='Count',
    title='Review_Rating_Distribution',
)

In [None]:
# Histogram of the Age column, 40 bins.
Reviews_df['Age'].iplot(
    kind='hist',
    bins=40,
    colors='orange',
    linecolor='blue',
    xTitle='Age',
    yTitle='Count',
    title='Reviewers_Age_Dist',
)

## Distribution of Review Text Length and Word Length

In [None]:
# Histogram of review length in characters.
Reviews_df['review_len'].iplot(
    kind='hist',
    xTitle='Review Len',
    yTitle='Count',
    title='Review_Text_Len_Dist',
)

In [None]:
# Histogram of words per review.
Reviews_df['word_count'].iplot(
    kind='hist',
    xTitle='Word Count',
    yTitle='Count',
    title='Word_Count_Distribution',
)

In [None]:
# Histogram of the average word length per review.
Reviews_df['avg_word_len'].iplot(
    kind='hist',
    xTitle='Avg Word Len',
    yTitle='Count',
    title='Review_Text_Avg_Word_Len_Dist',
)

In [None]:
# NOTE(review): same word_count histogram, with identical arguments, as the
# cell two above — kept as-is, but one of the two is redundant.
Reviews_df['word_count'].iplot(
    kind='hist',
    xTitle='Word Count',
    yTitle='Count',
    title='Word_Count_Distribution',
)

## Distribution of Department, Division, and Class 

In [None]:
Reviews_df['Department Name'].value_counts()

In [None]:
Reviews_df.groupby('Department Name').count()

In [None]:
# Bar chart of the number of reviews per department.
Reviews_df['Department Name'].value_counts().iplot(
    kind='bar',
    xTitle='Department',
    yTitle='Count',
    title="Bar Chart of Department's Name",
)

In [None]:
# Bar chart of the number of reviews per division.
Reviews_df['Division Name'].value_counts().iplot(
    kind='bar',
    xTitle='Division',
    yTitle='Count',
    title="Bar Chart of Division's Name",
)


In [None]:
# Bar chart of the number of reviews per class.
Reviews_df['Class Name'].value_counts().iplot(
    kind='bar',
    xTitle='Class',
    yTitle='Count',
    title="Bar Chart of Class's Name",
)


## Distribution of Unigram, Bigram and Trigram 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

### Unigram 

In [None]:
x = ['this is the list list this this this']

In [None]:
# Worked example on the toy corpus: count every token, then rank by frequency.
vec = CountVectorizer()
vec.fit(x)
bow = vec.transform(x)
sum_words = bow.sum(axis=0)  # per-term totals, indexed as [0, column]
words_freq = sorted(
    [(term, sum_words[0, col]) for term, col in vec.vocabulary_.items()],
    key=lambda pair: pair[1],
    reverse=True,
)
words_freq[:2]

In [None]:
def get_top_n_words(x, n, ngram_range=(1, 1)):
    """Return the ``n`` most frequent n-grams in a collection of documents.

    Parameters
    ----------
    x : iterable of str
        Documents to count over (e.g. a list or a pandas Series of text).
    n : int
        Number of top entries to return.
    ngram_range : tuple of (int, int), optional
        Passed straight to ``CountVectorizer``. The default ``(1, 1)`` counts
        single words, so existing two-argument calls behave as before;
        ``(2, 2)`` / ``(3, 3)`` give bigrams / trigrams without having to
        redefine this function.

    Returns
    -------
    list of (str, count)
        ``(term, total occurrences)`` pairs sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=ngram_range).fit(x)
    bow = vec.transform(x)
    # Column sums of the document-term matrix: one total per vocabulary entry.
    sum_words = bow.sum(axis=0)
    words_freq = [(term, sum_words[0, idx]) for term, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
get_top_n_words(x, 3)

In [None]:
words = get_top_n_words(Reviews_df['Review Text'], 20)

In [None]:
words

In [None]:
# Index the (unigram, frequency) pairs by term and draw them as a bar chart.
Reviews = pd.DataFrame(words, columns=['Unigram', 'Frequency']).set_index('Unigram')
Reviews.iplot(
    kind='bar',
    xTitle='Unigram',
    yTitle='Count',
    title=' Top 20 unigram words',
)

### Bigram 

In [None]:
def get_top_n_words(x, n):
    """Return the ``n`` most frequent bigrams in ``x``.

    ``x`` is an iterable of text documents; the result is a list of
    ``(bigram, count)`` pairs ordered from most to least frequent.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(x)
    doc_term = vectorizer.transform(x)
    totals = doc_term.sum(axis=0)  # per-bigram totals, indexed as [0, column]
    ranked = []
    for term, column in vectorizer.vocabulary_.items():
        ranked.append((term, totals[0, column]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
get_top_n_words(x, 3)

In [None]:
words = get_top_n_words(Reviews_df['Review Text'], 20)

In [None]:
words

In [None]:
# Index the (bigram, frequency) pairs by term and draw them as a bar chart.
Reviews = pd.DataFrame(words, columns=['Bigram', 'Frequency']).set_index('Bigram')
Reviews.iplot(
    kind='bar',
    xTitle='Bigram',
    yTitle='Count',
    title=' Top 20 Bigram words',
)

### Trigram 

In [None]:
def get_top_n_words(x, n):
    """Return the ``n`` most frequent trigrams in ``x``.

    ``x`` is an iterable of text documents; the result is a list of
    ``(trigram, count)`` pairs ordered from most to least frequent.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3))
    vectorizer.fit(x)
    doc_term = vectorizer.transform(x)
    totals = doc_term.sum(axis=0)  # per-trigram totals, indexed as [0, column]
    ranked = []
    for term, column in vectorizer.vocabulary_.items():
        ranked.append((term, totals[0, column]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
get_top_n_words(x, 3)

In [None]:
words = get_top_n_words(Reviews_df['Review Text'], 20)

In [None]:
words

In [None]:
# Index the (trigram, frequency) pairs by term and draw them as a bar chart.
Reviews = pd.DataFrame(words, columns=['Trigram', 'Frequency']).set_index('Trigram')
Reviews.iplot(
    kind='bar',
    xTitle='Trigram',
    yTitle='Count',
    title=' Top 20 Trigram words',
)

## Bivariate Analysis 

In [None]:
sns.pairplot(Reviews_df)

In [None]:
# Polarity of individual reviews, grouped by division.
sns.catplot(
    data=Reviews_df,
    x='Division Name',
    y='polarity',
)

In [None]:
# Box plot of polarity per division.
sns.catplot(
    data=Reviews_df,
    x='Division Name',
    y='polarity',
    kind='box',
)

In [None]:
# Polarity of individual reviews, grouped by department.
sns.catplot(
    data=Reviews_df,
    x='Department Name',
    y='polarity',
)

In [None]:
# Box plot of polarity per department.
sns.catplot(
    data=Reviews_df,
    x='Department Name',
    y='polarity',
    kind='box',
)

In [None]:
# Box plot of review length per division.
sns.catplot(
    data=Reviews_df,
    x='Division Name',
    y='review_len',
    kind='box',
)

In [None]:
# Box plot of review length per department.
sns.catplot(
    data=Reviews_df,
    x='Department Name',
    y='review_len',
    kind='box',
)

## Distribution of Sentiment Polarity of Reviews Based on the Recommendation 

In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Polarity scores split by the 'Recommended IND' flag (x1: flag == 1, x0: flag == 0).
x1 = Reviews_df.loc[Reviews_df['Recommended IND'] == 1, 'polarity']
x0 = Reviews_df.loc[Reviews_df['Recommended IND'] == 0, 'polarity']

In [None]:
type(x1)

In [None]:
# One semi-transparent histogram trace per recommendation group.
trace0 = go.Histogram(
    x=x0,
    name='Not Recommended',
    opacity=0.7,
)
trace1 = go.Histogram(
    x=x1,
    name='Recommended',
    opacity=0.7,
)

In [None]:
# Overlay both traces in one figure and render it.
data = [trace0, trace1]
layout = go.Layout(
    barmode='overlay',
    title='Distribution of Sentiment Polarity of Reviews Based on the Recommendation',
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

## Distribution of Ratings Based on the Recommendation 

In [None]:
# Ratings split by the 'Recommended IND' flag (x1: flag == 1, x0: flag == 0).
x1 = Reviews_df.loc[Reviews_df['Recommended IND'] == 1, 'Rating']
x0 = Reviews_df.loc[Reviews_df['Recommended IND'] == 0, 'Rating']

In [None]:
type(x1)

In [None]:
# One semi-transparent histogram trace per recommendation group.
trace0 = go.Histogram(
    x=x0,
    name='Not Recommended',
    opacity=0.7,
)
trace1 = go.Histogram(
    x=x1,
    name='Recommended',
    opacity=0.7,
)

In [None]:
# Overlay both traces in one figure and render it.
data = [trace0, trace1]
layout = go.Layout(
    barmode='overlay',
    title='Distribution of Reviews Rating Based on the Recommendation',
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Joint KDE of polarity against review length.
sns.jointplot(
    data=Reviews_df,
    x='polarity',
    y='review_len',
    kind='kde',
)

In [None]:
# Joint KDE of polarity against reviewer age.
sns.jointplot(
    data=Reviews_df,
    x='polarity',
    y='Age',
    kind='kde',
)