## Women's E-commerce Clothing Reviews


Clothing ID: Integer Categorical variable that refers to the specific piece being reviewed.<br>
Age: Positive Integer variable of the reviewer's age.<br>
Title: String variable for the title of the review.<br>
Review Text: String variable for the review body.<br>
Rating: Positive Ordinal Integer variable for the product score granted by the customer from 1 Worst, to 5 Best.<br>
Recommended IND: Binary variable stating whether the customer recommends the product, where 1 is recommended and 0 is not recommended.<br>
Positive Feedback Count: Positive Integer documenting the number of other customers who found this review positive.<br>
Division Name: Categorical name of the product high level division.<br>
Department Name: Categorical name of the product department name.<br>
Class Name: Categorical name of the product class name.<br>

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn  as sns
%matplotlib inline

In [32]:
import plotly as py
import cufflinks as cf

In [33]:
from plotly.offline import iplot

In [34]:
# Put plotly and cufflinks into offline/notebook mode before any of the
# `.iplot()` calls further down are made.
py.offline.init_notebook_mode(connected=True)
cf.go_offline()

## Data Import

In [35]:
df = pd.read_csv('../input/eda-for-text-data/Womens Clothing E-Commerce Reviews.csv', index_col = 0)

In [36]:
df.head(5)

In [37]:
df.drop(labels=['Title', 'Clothing ID'], axis = 1, inplace = True)

In [38]:
df.head(5)

In [39]:
df.isnull().sum()

In [40]:
df.dropna(subset=['Review Text', 'Division Name'], inplace = True)

In [41]:
df.isnull().sum()

In [42]:
#' '.join(df['Review Text'].tolist())

## Text Cleaning

In [43]:
# Lookup table mapping English contractions (all keys lowercase) to their
# expanded forms. cont_to_exp() iterates over this dict and substitutes each
# key with its value, so several keys are prefixes of longer keys
# (e.g. "can't" / "can't've").
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [44]:
def cont_to_exp(x):
  """Expand the contractions found in x using the `contractions` table.

  Args:
    x: Value to clean. Only strings are processed.

  Returns:
    For a string: a copy with every backslash removed and every occurrence
    of a `contractions` key replaced by its expansion. Matching is a plain,
    case-sensitive substring replacement. Any non-string value is returned
    unchanged.
  """
  if not isinstance(x, str):
    return x

  x = x.replace('\\', '')
  # Longest keys first: otherwise a short key such as "can't" rewrites the
  # front of "can't've" to "cannot've" and the compound entry can never match.
  for key in sorted(contractions, key=len, reverse=True):
    x = x.replace(key, contractions[key])
  return x

In [45]:
%%time
df['Review Text'] = df['Review Text'].apply(lambda x: cont_to_exp(x))

In [46]:
df.head(2)

In [47]:
print(' '.join(df['Review Text'].tolist())[:1000])

## Feature Engineering

In [48]:
from textblob import TextBlob

In [49]:
df.head(2)

In [50]:
df['polarity'] = df['Review Text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [51]:
df['review_len'] = df['Review Text'].apply(lambda x: len(x))

In [52]:
df['word_count'] = df['Review Text'].apply(lambda x: len(x.split()))

In [53]:
def get_avg_word_len(x):
  """Return the mean length of the whitespace-separated words in x.

  Args:
    x: Text to measure.

  Returns:
    Total characters across all words divided by the number of words, or
    0.0 when x contains no words (empty or whitespace-only text), which
    would otherwise raise ZeroDivisionError.
  """
  words = x.split()
  if not words:
    return 0.0

  return sum(len(word) for word in words) / len(words)

In [54]:
df['avg_word_len'] = df['Review Text'].apply(lambda x: get_avg_word_len(x))

In [55]:
df.head(2)

### Distribution of Sentiment Polarity

In [59]:
# Histogram (50 bins) of the per-review sentiment polarity computed above.
df['polarity'].iplot(kind = 'hist', colors = 'red', bins = 50, xTitle = 'Polarity', yTitle = 'Count', title = 'Sentiment Polarity Distribution')

### Distribution of Review Ratings and Reviewers' Ages

In [60]:
# Histogram of the Rating column.
df['Rating'].iplot(kind = 'hist', xTitle = 'Rating', yTitle = 'Count', title = 'Review Rating Distribution')

In [63]:
# Histogram (40 bins) of the Age column.
df['Age'].iplot(kind = 'hist', bins = 40, xTitle = 'Age', yTitle = 'Count', title = 'Reviewers Age Distribution')

### Distribution of Review Text Length and Word Length

In [65]:
# Histogram of review length in characters.
df['review_len'].iplot(kind = 'hist', xTitle = 'Review Len', yTitle = 'Count', title = 'Review Text Len Dist')

In [66]:
# Histogram of words per review.
df['word_count'].iplot(kind = 'hist', xTitle = 'Word Count', yTitle = 'Count', title = 'Word Count Dist')

In [67]:
# Histogram of the average word length per review.
df['avg_word_len'].iplot(kind = 'hist', xTitle = 'Avg Word Len', yTitle = 'Count', title = 'Review Text Avg Word Len Dist')

### Distribution of Department, Division and Class

In [69]:
# Number of reviews per department.
df['Department Name'].value_counts()

In [70]:
# Non-null count of every other column, grouped by department.
df.groupby('Department Name').count()

In [72]:
# Bar charts of review counts per department, division and class.
df['Department Name'].value_counts().iplot(kind = 'bar', yTitle = 'Count', xTitle = 'Department', title = 'Bar Chart of Department Name')

In [73]:
df['Division Name'].value_counts().iplot(kind = 'bar', yTitle = 'Count', xTitle = 'Division', title = 'Bar Chart of Division Name')

In [74]:
df['Class Name'].value_counts().iplot(kind = 'bar', yTitle = 'Count', xTitle = 'Class', title = 'Bar Chart of Class Name')

### Distribution of Unigram, Bigram and Trigram

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
x = ['this is the list list this this this']

In [81]:
vec = CountVectorizer().fit(x)
bow = vec.transform(x)
sum_words = bow.sum(axis = 0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)
words_freq[:2]

In [86]:
def get_top_n_words(x, n):
    """Return the n most frequent unigrams in corpus x.

    Args:
        x: Iterable of text documents.
        n: Maximum number of entries to return.

    Returns:
        List of (term, total_count) tuples, highest count first.
    """
    vectorizer = CountVectorizer()
    totals = vectorizer.fit_transform(x).sum(axis=0)
    pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [87]:
get_top_n_words(x, 3)

In [88]:
words = get_top_n_words(df['Review Text'], 20)

In [89]:
words

In [94]:
df1 = pd.DataFrame(words, columns = ['Unigram', 'Frequency'])
df1 = df1.set_index('Unigram')
df1.iplot(kind = 'bar', xTitle = 'Unigram', yTitle = 'Count', title = 'Top 20 words')

### Bigram

In [95]:
def get_top_n_words(x, n):
    """Return the n most frequent bigrams in corpus x.

    Args:
        x: Iterable of text documents.
        n: Maximum number of entries to return.

    Returns:
        List of (bigram, total_count) tuples, highest count first.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    totals = vectorizer.fit_transform(x).sum(axis=0)
    pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [96]:
get_top_n_words(x, 3)

In [97]:
words = get_top_n_words(df['Review Text'], 20)

In [98]:
words

In [101]:
df1 = pd.DataFrame(words, columns = ['Bigram', 'Frequency'])
df1 = df1.set_index('Bigram')
df1.iplot(kind = 'bar', xTitle = 'Bigram', yTitle = 'Count', title = 'Top 20 words')

### Trigram

In [102]:
def get_top_n_words(x, n):
    """Return the n most frequent trigrams in corpus x.

    Args:
        x: Iterable of text documents.
        n: Maximum number of entries to return.

    Returns:
        List of (trigram, total_count) tuples, highest count first.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3))
    totals = vectorizer.fit_transform(x).sum(axis=0)
    pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [103]:
get_top_n_words(x, 3)

In [104]:
words = get_top_n_words(df['Review Text'], 20)

In [105]:
words

In [106]:
df1 = pd.DataFrame(words, columns = ['Trigram', 'Frequency'])
df1 = df1.set_index('Trigram')
df1.iplot(kind = 'bar', xTitle = 'Trigram', yTitle = 'Count', title = 'Top 20 words')

### Distribution of Unigram, Bigram, Trigram without STOP WORDS

### Unigram

In [107]:
def get_top_n_words(x, n):
    """Return the n most frequent unigrams in corpus x, ignoring English stop words.

    Args:
        x: Iterable of text documents.
        n: Maximum number of entries to return.

    Returns:
        List of (term, total_count) tuples, highest count first.
    """
    vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english')
    totals = vectorizer.fit_transform(x).sum(axis=0)
    pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [108]:
get_top_n_words(x, 3)

In [109]:
words = get_top_n_words(df['Review Text'], 20)

In [110]:
words

In [111]:
df1 = pd.DataFrame(words, columns = ['Unigram', 'Frequency'])
df1 = df1.set_index('Unigram')
df1.iplot(kind = 'bar', xTitle = 'Unigram', yTitle = 'Count', title = 'Top 20 words')

### Bigram

In [112]:
def get_top_n_words(x, n):
    """Return the n most frequent bigrams in corpus x, ignoring English stop words.

    Args:
        x: Iterable of text documents.
        n: Maximum number of entries to return.

    Returns:
        List of (bigram, total_count) tuples, highest count first.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    totals = vectorizer.fit_transform(x).sum(axis=0)
    pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [113]:
get_top_n_words(x, 3)

In [114]:
words

In [115]:
df1 = pd.DataFrame(words, columns = ['Bigram', 'Frequency'])
df1 = df1.set_index('Bigram')
df1.iplot(kind = 'bar', xTitle = 'Bigram', yTitle = 'Count', title = 'Top 20 words')

### Trigram

In [116]:
def get_top_n_words(x, n):
    """Return the n most frequent trigrams in corpus x, ignoring English stop words.

    Args:
        x: Iterable of text documents.
        n: Maximum number of entries to return.

    Returns:
        List of (trigram, total_count) tuples, highest count first.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    totals = vectorizer.fit_transform(x).sum(axis=0)
    pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [118]:
#get_top_n_words(x, 3)

In [120]:
words = get_top_n_words(df['Review Text'], 20)

In [121]:
words

In [122]:
df1 = pd.DataFrame(words, columns = ['Trigram', 'Frequency'])
df1 = df1.set_index('Trigram')
df1.iplot(kind = 'bar', xTitle = 'Trigram', yTitle = 'Count', title = 'Top 20 words')

### Distribution of Top 20 POS tags

In [123]:
import nltk

In [125]:
# Download the NLTK 'punkt' and 'averaged_perceptron_tagger' resources
# (presumably required by the TextBlob `.tags` call below — confirm).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [126]:
# NOTE(review): str() of a pandas Series is its printed representation
# (index labels, elided rows, Name/dtype footer), not the concatenated review
# text — confirm this is what the blob is meant to contain.
blob = TextBlob(str(df['Review Text']))

In [127]:
blob

In [129]:
# Shows exactly the text that was handed to TextBlob above.
print(str(df['Review Text']))

In [130]:
# Build the blob from the full text of every review. str(df['Review Text'])
# would only give pandas' display repr (index labels, a few truncated rows
# and the Name/dtype footer), so the POS counts below would not reflect the
# corpus. Review Text has no missing values after the dropna() above.
# Tagging the whole corpus takes noticeably longer than tagging the repr.
blob = TextBlob(' '.join(df['Review Text']))

In [133]:
nltk.help.upenn_tagset()

In [136]:
pos_df = pd.DataFrame(blob.tags, columns = ['words', 'pos'])
pos_df = pos_df['pos'].value_counts()
pos_df

In [137]:
pos_df.iplot(kind = 'bar')

### Bivariate Analysis

In [138]:
df.head(2)

In [139]:
sns.pairplot(df)

In [141]:
sns.catplot(x = 'Division Name', y = 'polarity', data = df)

In [142]:
sns.catplot(x = 'Division Name', y = 'polarity', data = df, kind = 'box')

In [143]:
sns.catplot(x = 'Department Name', y = 'polarity', data = df)

In [144]:
sns.catplot(x = 'Department Name', y = 'polarity', data = df, kind = 'box')

In [145]:
sns.catplot(x = 'Division Name', y = 'review_len', data = df, kind = 'box')

In [146]:
import plotly.express as px
import plotly.graph_objects as go

In [158]:
x1 = df[df['Recommended IND']==1]['polarity']
x0 = df[df['Recommended IND']==0]['polarity']

In [159]:
x1

In [160]:
trace0= go.Histogram(x = x0, name = 'Not Recommended', opacity = 0.8)
trace1= go.Histogram(x = x1, name = 'Recommended', opacity = 0.8)

In [161]:
# Overlay both histogram traces in a single figure.
data = [trace0, trace1]
# Title typo fixed: "Plarity" -> "Polarity".
layout = go.Layout(barmode = 'overlay', title = 'Distribution of Sentiment Polarity of Reviews based on the Recommendation')
fig = go.Figure(data=data, layout=layout)
fig.show()

### Distribution of Ratings based on the Recommendation

In [162]:
x1 = df[df['Recommended IND']==1]['Rating']
x0 = df[df['Recommended IND']==0]['Rating']

In [163]:
trace0= go.Histogram(x = x0, name = 'Not Recommended', opacity = 0.7)
trace1= go.Histogram(x = x1, name = 'Recommended', opacity = 0.7)

In [164]:
data = [trace0, trace1]
layout = go.Layout(barmode = 'overlay', title = 'Distribution of Reviews Rating based on the Recommendation')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [168]:
sns.jointplot(x = 'polarity', y = 'review_len', data = df, kind = 'kde')

In [169]:
sns.jointplot(x = 'polarity', y = 'Age', data = df, kind = 'kde')