# Women's E-Commerce Clothing Reviews

Data Source: https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews

<p>
    This dataset consists of more than 20,000 product reviews from a women's e-commerce clothing site. <br>
    We will walk through the data in general and also create a word cloud to see what the reviews are about.
</p>

# Environment Setup

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Load and Preprocess Data

In [None]:
# Read the reviews CSV and discard the 'Unnamed: 0' column
# (presumably a row index written when the file was saved — confirm against the CSV).
raw_csv_path = '../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv'
df = pd.read_csv(raw_csv_path).drop(columns='Unnamed: 0')
df

# Missing Data

In [None]:
import seaborn as sns

# Boolean mask that is True wherever a cell is missing, drawn as a heatmap
# so that columns with gaps stand out visually.
missing_mask = df.isna()
sns.heatmap(data=missing_mask)

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Total number of rows in the frame, for comparison with the missing-value counts.
df.shape[0]

In [None]:
# Replace missing free-text fields with an empty string so later string
# operations can be applied to every row.
for text_col in ('Review Text', 'Title'):
    df[text_col] = df[text_col].fillna('')

# Univariate Stats

In [None]:
# Bar chart of how many reviews carry each 'Recommended IND' value.
sns.countplot(
    x='Recommended IND',
    data=df,
)

In [None]:
# Keep only rows with a positive feedback count: zero cannot be placed on a log-scaled axis.
has_feedback = df['Positive Feedback Count'] > 0
sns.histplot(
    data=df.loc[has_feedback],
    x='Positive Feedback Count',
    log_scale=True,
)

In [None]:
# Number of reviews for each 'Rating' value, coloured with the 'flare' palette.
# NOTE(review): recent seaborn releases appear to warn when `palette` is given
# without `hue` — confirm against the installed seaborn version.
sns.countplot(
    x='Rating',
    data=df,
    palette='flare',
)

In [None]:
import matplotlib.pyplot as plt

# Bar order: most-reviewed class first. value_counts() already returns the counts
# in descending order, so no additional sort is needed.
class_order = df['Class Name'].value_counts().index

# Wide canvas so every class label fits along the x-axis; an explicit Axes is
# passed to seaborn instead of relying on pyplot's implicit "current figure".
fig, ax = plt.subplots(figsize=(20, 4))
ax.set_title("Reviews per 'Class Name', split by 'Recommended IND'")
sns.countplot(data=df, x='Class Name', order=class_order, hue='Recommended IND', ax=ax)

In [None]:
# Histogram of the 'Age' column drawn on a 12x4-inch figure.
fig, ax = plt.subplots(figsize=(12, 4))
sns.histplot(data=df, x='Age', ax=ax)

# Review Length

In [None]:
# Word count per review.
# str.split() with no separator splits on any run of whitespace, so an empty
# review counts as 0 words and repeated spaces or newlines do not inflate the
# count. Splitting on a literal ' ' would report 1 word for '' and would count
# the empty strings between consecutive spaces as words.
df['review_text_split_len'] = df['Review Text'].str.split().str.len()
sns.histplot(data=df, x='review_text_split_len')

# Tokenize

In [None]:
%%time
# 10s
import nltk
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, \
strip_numeric, remove_stopwords, strip_short, preprocess_string

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
df['review_text_tokenized'] = df['Review Text'].apply(lambda text: preprocess_string(text, [
    strip_tags, 
    strip_punctuation, 
    strip_multiple_whitespaces, 
    strip_numeric, 
    remove_stopwords, 
    strip_short, 
    lemmatizer.lemmatize, 
    lambda x: x.lower()
]))

Check the per-review word count after tokenization (the change is mainly due to stopword removal).

In [None]:
# Histogram of how many tokens remain in each review after preprocessing.
token_counts = df['review_text_tokenized'].map(len)
sns.histplot(data=token_counts)

# Word Cloud

In [None]:
%%time
# 10s
from wordcloud import WordCloud

long_string = ' '.join([' '.join(words) for words in df['review_text_tokenized'].values])
wordcloud = WordCloud(width=800, height=400)
wordcloud.generate(long_string)
wordcloud.to_image()