In [119]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
%matplotlib inline
from collections import Counter
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from textblob import TextBlob
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer

In [120]:
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')

In [121]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [122]:
df = df.drop(['Unnamed: 0'],axis=1)

In [124]:
df = df[df['Review Text'].notnull()]

In [60]:
df['Polarity score'] = df['Review Text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [61]:
df['Sentiment']=''
df.loc[df['Polarity score']>0,'Sentiment']='Positive'
df.loc[df['Polarity score']==0,'Sentiment']='Neutral'
df.loc[df['Polarity score']<0,'Sentiment']='Negative'

In [88]:
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [101]:
def preprocessing(data):
    text = data.str.lower().str.cat(sep='')
    text = text.replace('\d+','')
    words = tokenizer.tokenize(text) 
    words = [w for w in words if not w in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return words

In [102]:
def get_ngrams_freq(text,n):
    n_grams = ngrams((text),n)
    result = [' '.join(grams)for grams in n_grams]
    result_count = Counter(result)
    df = pd.DataFrame.from_dict(result_count,orient='index')
    df = df.rename(columns={'index':'words', 0:'frequency'})
    return df.sort_values('frequency', ascending=False).head(10)

In [103]:
data = df['Review Text']
words = preprocessing(data)
words

['absolutely',
 'wonderful',
 'silky',
 'sexy',
 'comfortablelove',
 'dress',
 'sooo',
 'pretty',
 'happened',
 'find',
 'store',
 'glad',
 'bc',
 'never',
 'would',
 'ordered',
 'online',
 'bc',
 'petite',
 'bought',
 'petite',
 '5',
 '8',
 'love',
 'length',
 'hit',
 'little',
 'knee',
 'would',
 'definitely',
 'true',
 'midi',
 'someone',
 'truly',
 'petite',
 'high',
 'hope',
 'dress',
 'really',
 'wanted',
 'work',
 'initially',
 'ordered',
 'petite',
 'small',
 'usual',
 'size',
 'found',
 'outrageously',
 'small',
 'small',
 'fact',
 'could',
 'zip',
 'reordered',
 'petite',
 'medium',
 'ok',
 'overall',
 'top',
 'half',
 'comfortable',
 'fit',
 'nicely',
 'bottom',
 'half',
 'tight',
 'layer',
 'several',
 'somewhat',
 'cheap',
 'net',
 'layer',
 'imo',
 'major',
 'design',
 'flaw',
 'net',
 'layer',
 'sewn',
 'directly',
 'zipper',
 'ci',
 'love',
 'love',
 'love',
 'jumpsuit',
 'fun',
 'flirty',
 'fabulous',
 'every',
 'time',
 'wear',
 'get',
 'nothing',
 'great',
 'complime

In [111]:
data_recommend = df['Review Text'][df['Recommended IND']==1]
words_recommend = preprocessing(data_recommend)

In [112]:
print('the most popular words in recommended reviews')
get_ngrams_freq(words_recommend,1)

the most popular words in recommended reviews


Unnamed: 0,frequency
dress,9302
fit,8654
size,8058
love,7820
top,6637
color,5938
wear,5737
great,5506
like,5375
look,5312


In [113]:
get_ngrams_freq(words_recommend,2)

Unnamed: 0,frequency
true size,1231
fit perfectly,1029
look great,832
love dress,635
5 4,622
usually wear,588
fit well,566
fit great,549
well made,529
look like,527


In [114]:
get_ngrams_freq(words_recommend,3)

Unnamed: 0,frequency
fit true size,451
run true size,193
received many compliment,163
small fit perfectly,137
fit like glove,128
love love love,123
usually wear size,106
run little big,86
look great jean,86
run little large,84


In [115]:
data_not_recommend = df['Review Text'][df['Recommended IND']==0]
words_not_recommend = preprocessing(data_not_recommend)

In [116]:
print('the most popular words in NOT recommended reviews')
get_ngrams_freq(words_not_recommend,1)

the most popular words in NOT recommended reviews


Unnamed: 0,frequency
dress,2103
like,1776
top,1706
look,1581
fit,1494
size,1353
would,1346
fabric,1266
back,1022
color,1017


In [117]:
get_ngrams_freq(words_not_recommend,2)

Unnamed: 0,frequency
look like,279
wanted love,243
going back,211
looked like,187
really wanted,147
made look,136
5 4,113
felt like,109
usually wear,103
true size,100


In [118]:
get_ngrams_freq(words_not_recommend,3)

Unnamed: 0,frequency
really wanted love,67
wanted love dress,65
fit true size,39
really wanted like,39
made look like,30
wanted love top,28
ordered usual size,25
way much fabric,25
make look like,24
like maternity top,24
