In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
import string
from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the reviews CSV: the first row supplies the column names and the first
# column becomes the row index.
csv_path = '/kaggle/input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv'
df = pd.read_csv(csv_path, header=0, index_col=0)
df.head()

In [None]:
# Preview five random rows; the fixed seed keeps the preview identical across runs.
df.sample(5, random_state=0)

In [None]:
# Number of rows flagged by duplicated() (repeats of an earlier row).
df.duplicated().sum()

In [None]:
# Summary statistics produced by describe() with its default settings.
df.describe()

In [None]:
# Remove duplicated rows, rebinding df to the de-duplicated frame.
df = df.drop_duplicates()

In [None]:
# Column names, non-null counts and dtypes after duplicates were dropped.
df.info()

In [None]:
# Percentage of missing values per column (mean of the boolean mask, times 100).
df.isna().mean() * 100

In [None]:
# Absolute number of missing values per column.
df.isna().sum()

In [None]:
# (rows, columns) of the de-duplicated frame.
df.shape

In [None]:
# Column labels available in df.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Show the raw review text stored under index label 1.
df.loc[1, 'Review Text']

In [None]:
# Keep only the free-text review and the recommendation flag.
selected_columns = ['Review Text', 'Recommended IND']
review = df[selected_columns]
review.head()

In [None]:
# Give the two working columns shorter names.
short_names = {"Review Text": "Review", "Recommended IND": "Recommended"}
review = review.rename(columns=short_names)
review.head()

In [None]:
# Missing values per column before rows without review text are dropped.
review.isnull().sum()

In [None]:
# Discard rows whose review text is missing, rebinding review to the result.
review = review.dropna(subset=['Review'])

In [None]:
# Re-check missing values after dropping rows without review text.
review.isnull().sum()

In [None]:
# Cast the review column to str so the text-cleaning functions always receive strings.
review['Review'] = review['Review'].astype(str)

In [None]:
# (rows, columns) of the working review frame.
review.shape

In [None]:
# Number of reviews per value of the Recommended target.
review['Recommended'].value_counts()

In [None]:
# Bar chart of how many reviews fall into each Recommended class.
target_counts = review["Recommended"].value_counts()
target_counts.plot(kind='bar')
plt.xlabel("Recommended")
plt.ylabel("Counts")
plt.title("Proportion Target Class")

In [None]:
# Boolean frame marking which cells of df are missing.
df.isnull()

In [None]:
# Pie chart of the share of each Rating value; the counts are computed once
# and their index is reused for the slice labels.
plt.figure(figsize=(10,10))
ax = df['Rating'].value_counts()
labels = ax.index
plt.pie(ax, labels=labels, autopct='%.2f')
plt.title("Number in which figure shown", fontsize=25, color='purple')
plt.legend()
plt.show()

In [None]:
# Covariance matrix of the numeric columns only. Selecting them explicitly
# keeps this cell working on pandas versions where DataFrame.cov() no longer
# silently skips text columns and raises instead.
teju = df.select_dtypes(include='number').cov()
teju

In [None]:
# Annotated heatmap of the covariance matrix `teju` (covariance, not correlation).
sns.heatmap(teju,annot=True)

In [None]:
# Bar chart of the (up to) ten most frequent Rating values.
plt.figure(figsize=(10,10))
ax = df['Rating'].value_counts().iloc[:10]
ax.plot(kind='bar')

In [None]:
# Distribution of ratings. sns.distplot is deprecated, so draw the same
# picture (density-scaled histogram with a KDE curve) with histplot.
sns.histplot(df.Rating, kde=True, stat='density', color='red')

In [None]:
def tokens(words):
    """Return `words` reduced to lower-case ASCII letter runs.

    Every maximal run of the characters a-z / A-Z is kept, lower-cased, and
    the runs are joined with single spaces; all other characters (digits,
    punctuation, whitespace, non-ASCII letters) act as separators.
    """
    letter_runs = re.findall("[a-zA-Z]+", words)
    return " ".join(run.lower() for run in letter_runs)

In [None]:
# Build the cleaned-text column by running `tokens` over every raw review.
review['Review_clear'] = review['Review'].map(tokens)
review.head()

In [None]:
# Cast the cleaned column to str before the remaining text-processing steps.
review['Review_clear'] = review['Review_clear'].astype(str)

In [None]:
# Fetch the NLTK stop-word corpus used by the following cells.
# (nltk is already imported in the first cell; this re-import is redundant.)
import nltk
nltk.download('stopwords')

In [None]:
# Show some stop words
# The corpus is reached through the nltk package instead of the bare
# `stopwords` name: a later cell defines a function called `stopwords`, so
# `stopwords.words(...)` would fail if this cell were re-run after that one.
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[::10])

In [None]:
# Domain-specific words removed alongside the English stop words
# (see the membership test against `clothes` in the stop-word removal function).
clothes =['dress','color','wear','top','sweater','material','shirt','jeans','pant',
          'skirt','order','white','black','fabric','blouse','sleeve','even', 'jacket']

In [None]:
def stopwords(review):
    """Strip stop words and clothing terms from a review string.

    Each whitespace-separated token is lower-cased; tokens present in the
    module-level `stop_words` or `clothes` lists are discarded and the
    survivors are joined with single spaces.

    NOTE(review): this function's name rebinds the `stopwords` corpus object
    imported from `nltk.corpus` in the first cell.
    """
    kept = []
    for token in review.split():
        lowered = token.lower()
        if lowered in stop_words or lowered in clothes:
            continue
        kept.append(lowered)
    return " ".join(kept)

In [None]:
# Remove stop words and clothing terms from every cleaned review.
review['Review_clear'] = review['Review_clear'].apply(stopwords)
review.head()

In [None]:
# Spot-check one cleaned review by index label 267
# (assumes that label survived the earlier row drops; TODO confirm).
review['Review_clear'][267]

In [None]:
# Fetch the WordNet data needed by the lemmatizer in the next cell.
# (nltk is already imported in the first cell; this re-import is redundant.)
import nltk
nltk.download('wordnet')

In [None]:
# One shared lemmatizer instance, reused for every review.
lem = WordNetLemmatizer()

def lemma(text):
    """Lemmatize each whitespace-separated word of `text`.

    Words are passed one by one to `lem.lemmatize` with its default
    arguments and the results are joined with single spaces.
    """
    return " ".join(map(lem.lemmatize, text.split()))

In [None]:
# Replace every word in the cleaned reviews with its lemma.
review['Review_clear'] = review['Review_clear'].apply(lemma)
review.head()

In [None]:
# Spot-check the fully cleaned review stored under index label 1
# (assumes that label is still present; TODO confirm).
review['Review_clear'][1]

In [None]:
# Split the reviews by target value: 1 = recommended, 0 = not recommended.
positive = review.loc[review['Recommended'] == 1]
negative = review.loc[review['Recommended'] == 0]
positive.head()

In [None]:
# Concatenate all cleaned positive reviews into one space-separated string.
# Joining the column directly avoids a loop variable named `review`, which
# would overwrite the `review` DataFrame with the last review string.
positive_words = ' '.join(positive.Review_clear)
positive_words[:48]

In [None]:
# Concatenate all cleaned negative reviews into one space-separated string.
# Joining the column directly avoids a loop variable named `review`, which
# would overwrite the `review` DataFrame with the last review string.
negative_words = ' '.join(negative.Review_clear)
negative_words[:455]

In [None]:
# Word cloud of the vocabulary used in recommended (positive) reviews.
# max_words is set to the character length of the joined text.
wordcloud = WordCloud(
    background_color="white",
    max_words=len(positive_words),
)
wordcloud.generate(positive_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(13, 13))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the vocabulary used in not-recommended (negative) reviews,
# drawn with the 'gist_heat' colormap.
# max_words is set to the character length of the joined text.
wordcloud = WordCloud(
    background_color="white",
    max_words=len(negative_words),
    colormap='gist_heat',
)
wordcloud.generate(negative_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(13, 13))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# First rows of the not-recommended subset.
negative.head()

# Models
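The following classification models are considered:

- Logistic Regression
- Naive Bayes
- Support Vector Machine
- Random Forest
- AdaBoost

Note: only Naive Bayes, Random Forest, and AdaBoost are fitted and evaluated in the cells below.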

In [None]:
# Build the modelling set from BOTH classes. Taking X and y from `positive`
# alone yields a target with a single value (Recommended == 1), so every
# model would score 100% without learning anything.
labelled = pd.concat([positive, negative])
X = labelled['Review_clear']
y = labelled['Recommended']

# Hold out 20% for testing; stratify so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
# Fit a unigram + bigram bag-of-words vocabulary on the training text,
# keeping only terms that occur in at least 5 documents.
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

# Vocabulary size. `vocabulary_` is used because `get_feature_names()` is not
# available in recent scikit-learn releases.
len(vect.vocabulary_)

# Naive Bayes
To make the vectorizer => transformer => classifier chain easier to work with, we use the `Pipeline` class from scikit-learn.

In [None]:
# Naive Bayes pipeline: n-gram counts -> TF-IDF weighting -> MultinomialNB.
nb_steps = [
    ('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
model_nb = Pipeline(nb_steps)
model_nb.fit(X_train, y_train)

# Keep the true test labels as an array and predict on the held-out text.
ytest = np.array(y_test)
pred_y = model_nb.predict(X_test)

In [None]:
# Naive Bayes test metrics. Arguments follow scikit-learn's (y_true, y_pred)
# order, matching the classification_report call.
print('accuracy %s' % accuracy_score(y_test, pred_y))
print(classification_report(ytest, pred_y))

# Random Forest

In [None]:
# Random Forest pipeline: n-gram counts -> TF-IDF weighting -> 50-tree forest.
# random_state is fixed so the forest, and the scores compared at the end of
# the notebook, are reproducible from run to run.
model_rf = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer()),
                    ('clf-rf', RandomForestClassifier(n_estimators=50, random_state=0)),
                    ])

model_rf.fit(X_train, y_train)

# Keep the true test labels as an array and predict on the held-out text.
ytest = np.array(y_test)
pred = model_rf.predict(X_test)

In [None]:
# Random Forest test metrics. Arguments follow scikit-learn's (y_true, y_pred)
# order, matching the classification_report call.
print('accuracy %s' % accuracy_score(y_test, pred))
print(classification_report(ytest, pred))

# Ada Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost pipeline: n-gram counts -> TF-IDF weighting -> AdaBoost with its
# default settings. random_state is fixed so results are reproducible.
ada = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                ('tfidf', TfidfTransformer()),
                ('clf-ada', AdaBoostClassifier(random_state=0)),
                ])

ada.fit(X_train, y_train)

# Keep the true test labels as an array and predict on the held-out text.
ytest = np.array(y_test)
ada_pred = ada.predict(X_test)

In [None]:
# AdaBoost test metrics. Arguments follow scikit-learn's (y_true, y_pred)
# order, matching the classification_report call.
print('accuracy %s' % accuracy_score(y_test, ada_pred))
print(classification_report(ytest, ada_pred))

# Which one is the best Model?

In [None]:
# Test-set accuracy of each fitted model, passed in scikit-learn's
# (y_true, y_pred) order.
nb_acc = accuracy_score(y_test, pred_y)
rf_acc = accuracy_score(y_test, pred)
ada_acc = accuracy_score(y_test, ada_pred)

In [None]:
# Tabulate the three accuracies and show them from best to worst.
model_names = ['Naive Bayes', 'Random Forest', 'AdaBoosting']
model_scores = [nb_acc, rf_acc, ada_acc]
models = pd.DataFrame({'Model': model_names, 'Score': model_scores})
models.sort_values(by='Score', ascending=False)

# Conclusion
This project aimed to use sentiment analysis to predict product recommendation. We started with data engineering and text mining, which covered converting the text into tokens, removing punctuation, numbers, and stop words, and normalizing the tokens with lemmatization. We then used a bag-of-words model to convert the text into numerical feature vectors.

# THANK YOU