# Women's E-Commerce Clothing Reviews 

Clothing ID: Integer Categorical variable that refers to the specific piece being reviewed.

Age: Positive Integer variable for the reviewer's age.

Title: String variable for the title of the review.

Review Text: String variable for the review body.

Rating: Positive Ordinal Integer variable for the product score granted by the customer, from 1 (worst) to 5 (best).

Recommended IND: Binary variable stating whether the customer recommends the product, where 1 is recommended and 0 is not recommended.

Positive Feedback Count: Positive Integer documenting the number of other customers who found this review positive.

Division Name: Categorical variable for the product's high-level division.

Department Name: Categorical variable for the product's department.

Class Name: Categorical variable for the product's class.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import cufflinks as cf

from plotly.offline import iplot
%matplotlib inline


In [None]:
# Set up Plotly for rendering inside the notebook; connected=True loads
# plotly.js from a CDN instead of embedding it in the notebook.
py.offline.init_notebook_mode(connected=True)
# Put cufflinks in offline mode for the .iplot() calls used below.
cf.go_offline()

# Data Import

In [None]:
# Load the reviews; the CSV's first column becomes the DataFrame index.
df = pd.read_csv('../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv', index_col=0)
df.head()

In [None]:
# Drop 'Title' and 'Clothing ID' in place; the cells visible below do not
# reference either column.
df.drop(labels=['Title', 'Clothing ID'], axis=1, inplace=True)
df.head()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Remove rows missing the review body or the division, then re-check the
# per-column null counts.
df.dropna(subset=['Review Text', 'Division Name'], inplace=True)
df.isnull().sum()

In [None]:
# Preview: first 1000 characters of all reviews joined with single spaces.
' '.join(df['Review Text'].tolist())[:1000]

# Text Cleaning

In [None]:
# Lookup table mapping lower-case contractions (and a few chat-style
# shorthands) to their expanded forms.
#
# Ordering matters for consumers that apply the entries one after another by
# plain substring replacement in dict order: every compound form
# (e.g. "can't've") is listed BEFORE the shorter form it starts with
# ("can't"), so the whole compound is expanded instead of leaving a dangling
# "'ve". Keep that invariant when adding entries.
#
# The w- and y- families ("won't", "wouldn't", "we're", "you're", ...) were
# missing from the original table and are included here so that they are
# expanded like the rest.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't've": "cannot have",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't've": "could not have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't've": "had not have",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd've": "he would have",
    "he'd": "he would",
    "he'll've": "he will have",
    "he'll": "he will",
    "he's": "he is",
    "how'd'y": "how do you",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how does",
    "i'd've": "i would have",
    "i'd": "i would",
    "i'll've": "i will have",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd've": "it would have",
    "it'd": "it would",
    "it'll've": "it will have",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't've": "might not have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't've": "must not have",
    "mustn't": "must not",
    "needn't've": "need not have",
    "needn't": "need not",
    "o'clock": "of the clock",
    "oughtn't've": "ought not have",
    "oughtn't": "ought not",
    "shan't've": "shall not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd've": "she would have",
    "she'd": "she would",
    "she'll've": "she will have",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't've": "should not have",
    "shouldn't": "should not",
    "so've": "so have",
    "so's": "so is",
    "that'd've": "that would have",
    "that'd": "that would",
    "that's": "that is",
    "there'd've": "there would have",
    "there'd": "there would",
    "there's": "there is",
    "they'd've": "they would have",
    "they'd": "they would",
    "they'll've": "they will have",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd've": "we would have",
    "we'd": "we would",
    "we'll've": "we will have",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll've": "what will have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll've": "who will have",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't've": "will not have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't've": "would not have",
    "wouldn't": "would not",
    "y'all'd've": "you all would have",
    "y'all'd": "you all would",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "y'all": "you all",
    "you'd've": "you would have",
    "you'd": "you would",
    "you'll've": "you will have",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Chat-style shorthands; the surrounding spaces keep them from matching
    # inside ordinary words.
    " u ": " you ",
    " ur ": " your ",
    " n ": " and ",
}

In [None]:
def cont_to_exp(x, mapping=None):
    """Expand contractions and shorthands in a piece of text.

    Matching is plain, case-sensitive substring replacement.

    Args:
        x: Value to clean. Anything that is not a string is returned
            unchanged.
        mapping: Optional dict of ``{text_to_find: replacement}``. When
            omitted, the module-level ``contractions`` table is used.

    Returns:
        The text with backslashes removed and every key of ``mapping``
        replaced by its value, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    if mapping is None:
        mapping = contractions
    # Backslashes are stripped before any replacement happens.
    x = x.replace('\\', '')
    # Apply the longest keys first. Otherwise a short key that is a prefix of
    # a longer one ("can't" vs "can't've") rewrites the text first and the
    # longer key can never match, leaving output such as "cannot've".
    # sorted() is stable, so equal-length keys keep the table's own order.
    for key in sorted(mapping, key=len, reverse=True):
        x = x.replace(key, mapping[key])
    return x

In [None]:
# Sample sentence for a quick manual check of cont_to_exp: it contains a
# contraction ("don't") plus apostrophe/quote characters that are not
# contractions (5'8").
x = "i don't know what date is today, I am 5'8\"" 

In [None]:
print(cont_to_exp(x))

In [None]:
%%time
# Expand contractions in every review; the function is passed directly, so no
# wrapping lambda is needed.
df['Review Text'] = df['Review Text'].apply(cont_to_exp)

In [None]:
df.head()

In [None]:
# Same 1000-character preview of the joined reviews as before the cleaning
# step, now taken from the rewritten 'Review Text' column.
print(' '.join(df['Review Text'].tolist())[:1000])

## Feature Engineering 

In [None]:
from textblob import TextBlob

In [None]:
df.head()

In [None]:
# Sentiment polarity of each review as computed by TextBlob (a float score;
# TextBlob documents its range as [-1.0, 1.0]).
df['polarity'] = df['Review Text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
# Length of each review in characters.
df['review_len'] = df['Review Text'].apply(len)

In [None]:
# Number of whitespace-separated tokens in each review.
df['word_count'] = df['Review Text'].str.split().str.len()

In [None]:
def get_avg_word_len(x):
    """Return the mean length of the whitespace-separated words in ``x``.

    Args:
        x: Text to measure.

    Returns:
        The average number of characters per word as a float, or ``0.0``
        when ``x`` contains no words (empty or whitespace-only text), which
        previously raised ``ZeroDivisionError``.
    """
    words = x.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
# Mean word length per review; the helper is passed directly to apply().
df['avg_word_len'] = df['Review Text'].apply(get_avg_word_len)

In [None]:
df.head()

## Distribution of Sentiment Polarity 

In [None]:
df.head()

In [None]:
# Histogram of the per-review polarity scores, using 50 bins.
df['polarity'].iplot(kind = 'hist', colors = 'red', bins = 50,
                    xTitle = 'Polarity', yTitle = 'Count', title  = 'Sentiment Polarity Distribution')

## Distribution of Reviews Rating and Reviewers Age

In [None]:
# Histogram of the review ratings (no explicit bin count is given).
df['Rating'].iplot(kind='hist', xTitle='Rating', yTitle='Count',
                  title='Review Rating Distribution')

In [None]:
# Histogram of reviewer ages, using 40 bins.
df['Age'].iplot(kind='hist', bins=40, xTitle='Age', yTitle='Count',
                  title='Reviewers Age Dist', colors='red', linecolor='black')

## Distribution of Review Text Length and Word Length

In [None]:
# Histogram of review length in characters.
df['review_len'].iplot(kind='hist', xTitle='Review Len', yTitle='Count',
                      title='Review Text Len Dist')

In [None]:
# Histogram of the number of words per review.
df['word_count'].iplot(kind = 'hist', xTitle = 'Word Count', yTitle = 'Count',
                       title = 'Word Count Distribution')

In [None]:
# Histogram of the average word length per review.
df['avg_word_len'].iplot(kind = 'hist', xTitle = 'Avg Word Len', yTitle = 'Count',
                         title = 'Review Text Avg Word Len Dist')

In [None]:
# NOTE(review): this repeats the word-count histogram two cells above with
# the same arguments; it looks like a leftover duplicate.
df['word_count'].iplot(kind = 'hist', xTitle = 'Word Count', yTitle = 'Count', 
                       title = 'Word Count Distribution')

## Distribution of Department, Division, and Class 

In [None]:
df.head(1)

In [None]:
# Number of reviews per department.
df['Department Name'].value_counts()

In [None]:
# Per-department non-null count of every other column.
df.groupby('Department Name').count()

In [None]:
# Bar chart of review counts per department.
df['Department Name'].value_counts().iplot(kind = 'bar', yTitle = 'Count', xTitle = 'Department',
                                          title = "Bar Chart of Department's Name")

In [None]:
# Bar chart of review counts per division.
df['Division Name'].value_counts().iplot(kind = 'bar', yTitle = 'Count', xTitle = 'Division',
                                          title = "Bar Chart of Division's Name")

In [None]:
# Bar chart of review counts per class.
df['Class Name'].value_counts().iplot(kind = 'bar', yTitle = 'Count', xTitle = 'Class',
                                          title = "Bar Chart of Class Name")

## Distribution of Unigram, Bigram and Trigram 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

### Unigram

In [None]:
def get_top_n_words(x, n, ngram_range=(1, 1), stop_words=None):
    """Return the ``n`` most frequent n-grams in a collection of documents.

    Args:
        x: Iterable of raw text documents, handed to ``CountVectorizer``.
        n: Maximum number of ``(term, count)`` pairs to return.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to count. The default
            counts single words, which is what this definition used to
            hard-code.
        stop_words: Stop-word setting forwarded to ``CountVectorizer``;
            ``None`` keeps every token.

    Returns:
        A list of ``(term, total_count)`` tuples, highest count first.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass instead of tokenising the corpus twice.
    bow = vec.fit_transform(x)
    # Column sums: one corpus-wide total per vocabulary entry.
    totals = bow.sum(axis=0)
    words_freq = [(term, totals[0, col]) for term, col in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# Top 20 terms over all reviews. get_top_n_words is redefined in each n-gram
# section of this notebook, so the result depends on which definition ran last.
words = get_top_n_words(df['Review Text'], 20)

In [None]:
words

In [None]:
# Tabulate the (term, count) pairs, index by term, and plot the counts as bars.
df1 = pd.DataFrame(words, columns=['Unigram', 'Frequency'])
df1 = df1.set_index('Unigram')
df1.iplot(kind='bar', xTitle = 'Unigram', yTitle = 'Count', title = ' Top 20 unigram words')

### Bigram 


In [None]:
def get_top_n_words(x, n, ngram_range=(2, 2), stop_words=None):
    """Return the ``n`` most frequent n-grams in a collection of documents.

    Args:
        x: Iterable of raw text documents, handed to ``CountVectorizer``.
        n: Maximum number of ``(term, count)`` pairs to return.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to count. The default
            counts bigrams, which is what this definition used to hard-code.
        stop_words: Stop-word setting forwarded to ``CountVectorizer``;
            ``None`` keeps every token.

    Returns:
        A list of ``(term, total_count)`` tuples, highest count first.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass instead of tokenising the corpus twice.
    bow = vec.fit_transform(x)
    # Column sums: one corpus-wide total per vocabulary entry.
    totals = bow.sum(axis=0)
    words_freq = [(term, totals[0, col]) for term, col in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# Top 20 terms from the most recently defined get_top_n_words (the bigram
# version when the cells are run in order).
words = get_top_n_words(df['Review Text'], 20)

In [None]:
words

In [None]:
# Tabulate the (term, count) pairs, index by term, and plot the counts as bars.
df1 = pd.DataFrame(words, columns = ['Bigram', 'Frequency'])
df1 = df1.set_index('Bigram')
df1.iplot(kind = 'bar', xTitle = 'Bigram', yTitle = 'Count', title = ' Top 20 Bigram words')

### Trigram 

In [None]:
def get_top_n_words(x, n, ngram_range=(3, 3), stop_words=None):
    """Return the ``n`` most frequent n-grams in a collection of documents.

    Args:
        x: Iterable of raw text documents, handed to ``CountVectorizer``.
        n: Maximum number of ``(term, count)`` pairs to return.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to count. The default
            counts trigrams, which is what this definition used to hard-code.
        stop_words: Stop-word setting forwarded to ``CountVectorizer``;
            ``None`` keeps every token.

    Returns:
        A list of ``(term, total_count)`` tuples, highest count first.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass instead of tokenising the corpus twice.
    bow = vec.fit_transform(x)
    # Column sums: one corpus-wide total per vocabulary entry.
    totals = bow.sum(axis=0)
    words_freq = [(term, totals[0, col]) for term, col in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# Top 20 terms from the most recently defined get_top_n_words (the trigram
# version when the cells are run in order).
words = get_top_n_words(df['Review Text'], 20)

In [None]:
words

In [None]:
# Tabulate the (term, count) pairs, index by term, and plot the counts as bars.
df1 = pd.DataFrame(words, columns = ['Trigram', 'Frequency'])
df1 = df1.set_index('Trigram')
df1.iplot(kind = 'bar', xTitle = 'Trigram', yTitle = 'Count', title = ' Top 20 Trigram words')

## Distribution of Unigram, Bigram and Trigram without STOP WORDS

### Unigram 

In [None]:
def get_top_n_words(x, n, ngram_range=(1, 1), stop_words='english'):
    """Return the ``n`` most frequent n-grams, ignoring stop words.

    Args:
        x: Iterable of raw text documents, handed to ``CountVectorizer``.
        n: Maximum number of ``(term, count)`` pairs to return.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to count. The default
            counts single words, which is what this definition used to
            hard-code.
        stop_words: Stop-word setting forwarded to ``CountVectorizer``. The
            default ``'english'`` selects scikit-learn's built-in English
            list, as this definition did before.

    Returns:
        A list of ``(term, total_count)`` tuples, highest count first.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass instead of tokenising the corpus twice.
    bow = vec.fit_transform(x)
    # Column sums: one corpus-wide total per vocabulary entry.
    totals = bow.sum(axis=0)
    words_freq = [(term, totals[0, col]) for term, col in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# Top 20 terms from the most recently defined get_top_n_words (the
# stop-word-filtered unigram version when the cells are run in order).
words = get_top_n_words(df['Review Text'], 20)

In [None]:
words

In [None]:
# Tabulate the (term, count) pairs, index by term, and plot the counts as bars.
df1 = pd.DataFrame(words, columns = ['Unigram', 'Frequency'])
df1 = df1.set_index('Unigram')
df1.iplot(kind = 'bar', xTitle = 'Unigram', yTitle = 'Count', title = ' Top 20 Unigram words')

### Bigram 

In [None]:
def get_top_n_words(x, n, ngram_range=(2, 2), stop_words='english'):
    """Return the ``n`` most frequent n-grams, ignoring stop words.

    Args:
        x: Iterable of raw text documents, handed to ``CountVectorizer``.
        n: Maximum number of ``(term, count)`` pairs to return.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to count. The default
            counts bigrams, which is what this definition used to hard-code.
        stop_words: Stop-word setting forwarded to ``CountVectorizer``. The
            default ``'english'`` selects scikit-learn's built-in English
            list, as this definition did before.

    Returns:
        A list of ``(term, total_count)`` tuples, highest count first.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass instead of tokenising the corpus twice.
    bow = vec.fit_transform(x)
    # Column sums: one corpus-wide total per vocabulary entry.
    totals = bow.sum(axis=0)
    words_freq = [(term, totals[0, col]) for term, col in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# Top 20 terms from the most recently defined get_top_n_words (the
# stop-word-filtered bigram version when the cells are run in order).
words = get_top_n_words(df['Review Text'], 20)

In [None]:
words

In [None]:
# Tabulate the (term, count) pairs, index by term, and plot the counts as bars.
df1 = pd.DataFrame(words, columns = ['Bigram', 'Frequency'])
df1 = df1.set_index('Bigram')
df1.iplot(kind = 'bar', xTitle = 'Bigram', yTitle = 'Count', title = ' Top 20 Bigram words')

### Trigram 

In [None]:
def get_top_n_words(x, n, ngram_range=(3, 3), stop_words='english'):
    """Return the ``n`` most frequent n-grams, ignoring stop words.

    Args:
        x: Iterable of raw text documents, handed to ``CountVectorizer``.
        n: Maximum number of ``(term, count)`` pairs to return.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to count. The default
            counts trigrams, which is what this definition used to hard-code.
        stop_words: Stop-word setting forwarded to ``CountVectorizer``. The
            default ``'english'`` selects scikit-learn's built-in English
            list, as this definition did before.

    Returns:
        A list of ``(term, total_count)`` tuples, highest count first.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass instead of tokenising the corpus twice.
    bow = vec.fit_transform(x)
    # Column sums: one corpus-wide total per vocabulary entry.
    totals = bow.sum(axis=0)
    words_freq = [(term, totals[0, col]) for term, col in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# Top 20 terms from the most recently defined get_top_n_words (the
# stop-word-filtered trigram version when the cells are run in order).
words = get_top_n_words(df['Review Text'], 20)
words

In [None]:
# Tabulate the (term, count) pairs, index by term, and plot the counts as bars.
df1 = pd.DataFrame(words, columns = ['Trigram', 'Frequency'])
df1 = df1.set_index('Trigram')
df1.iplot(kind = 'bar', xTitle = 'Trigram', yTitle = 'Count', title = ' Top 20 Trigram words')

## Distribution of Top 20 Part-of-Speech (POS) Tags 

In [None]:
import nltk

In [None]:
# Show the Series as pandas renders it; print() already applies str().
print(df['Review Text'])

In [None]:
# Build the blob from the actual review texts. str(Series) only returns
# pandas' display text (index labels, clipped rows, "..." and the
# Name/Length/dtype footer), so tagging it would analyse a few truncated
# reviews plus formatting noise rather than the corpus.
# NOTE(review): tagging the full corpus is much slower than tagging the short
# display string; sample the column first if this cell takes too long.
blob = TextBlob(' '.join(df['Review Text']))

In [None]:
# upenn_tagset() prints the tag descriptions itself and returns None, so
# wrapping it in print() only appended a stray "None" to the output.
nltk.help.upenn_tagset()

In [None]:
# One row per (word, tag) pair from the blob, then count how often each POS
# tag occurs. Note that pos_df is rebound from a DataFrame to the Series of
# counts.
pos_df = pd.DataFrame(blob.tags, columns=['words', 'pos'])
pos_df = pos_df['pos'].value_counts()
pos_df

In [None]:
# NOTE(review): the section heading says "Top 20", but every tag is plotted;
# no head(20) is applied.
pos_df.iplot(kind='bar')

## Bivariate Analysis 

In [None]:
df.head(2)

In [None]:
# Pairwise relationship grid over the DataFrame's columns.
sns.pairplot(df)

In [None]:
# Polarity by division, using catplot's default plot kind.
sns.catplot(x='Division Name', y='polarity', data=df)

In [None]:
# Polarity by division as box plots.
sns.catplot(x = 'Division Name', y = 'polarity', data = df, kind = 'box')

In [None]:
# Polarity by department, using catplot's default plot kind.
sns.catplot(x = 'Department Name', y = 'polarity', data = df)

In [None]:
# Polarity by department as box plots.
sns.catplot(x = 'Department Name', y = 'polarity', data = df, kind = 'box')

In [None]:
# Review length (characters) by division as box plots.
sns.catplot(x = 'Division Name', y = 'review_len', data = df, kind = 'box')

In [None]:
# Review length (characters) by department as box plots.
sns.catplot(x = 'Department Name', y = 'review_len', data = df, kind = 'box')

## Distribution of Sentiment Polarity of Reviews Based on the Recommendation 

In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Polarity scores split by the recommendation flag: x1 = recommended (1),
# x0 = not recommended (0).
x1 = df[df['Recommended IND']==1]['polarity']
x0 = df[df['Recommended IND']==0]['polarity']

In [None]:
type(x1)

In [None]:
# One semi-transparent histogram trace per group.
trace0 = go.Histogram(x = x0, name = 'Not Recommended', opacity = 0.7)
trace1 = go.Histogram(x = x1, name = 'Recommended', opacity = 0.7)

In [None]:
# barmode='overlay' draws both histograms on the same axes instead of
# stacking or grouping them.
data = [trace0, trace1]
layout = go.Layout(barmode = 'overlay', title = 'Distribution of Sentiment Polarity of Reviews Based on the Recommendation')
fig = go.Figure(data = data, layout = layout)

iplot(fig)

## Distribution of Ratings Based on the Recommendation 

In [None]:
# Ratings split by the recommendation flag: x1 = recommended (1),
# x0 = not recommended (0). This rebinds the x0/x1, trace and fig names used
# for the polarity chart above.
x1 = df[df['Recommended IND']==1]['Rating']
x0 = df[df['Recommended IND']==0]['Rating']

In [None]:
type(x1)

In [None]:
# One semi-transparent histogram trace per group.
trace0 = go.Histogram(x = x0, name = 'Not Recommended', opacity = 0.7)
trace1 = go.Histogram(x = x1, name = 'Recommended', opacity = 0.7)

In [None]:
# barmode='overlay' draws both histograms on the same axes instead of
# stacking or grouping them.
data = [trace0, trace1]
layout = go.Layout(barmode = 'overlay', title = 'Distribution of Reviews Rating Based on the Recommendation')
fig = go.Figure(data = data, layout = layout)

iplot(fig)