***

<font size="6"><b>Importing All Necessary Libraries</b> </font>

***

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import nltk
import re
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

***

<font size="6"><b>Importing Dataset</b> </font>

***

In [5]:
# Importing dataset
# NOTE(review): the path is relative, so it is resolved against the kernel's current working
# directory. The recorded run of this cell failed with FileNotFoundError for exactly this path;
# confirm where the CSV actually lives before re-running the notebook.
review = pd.read_csv("desktop/Womens Clothing E-Commerce Reviews.csv")
review.head()

FileNotFoundError: [Errno 2] No such file or directory: 'desktop/Womens Clothing E-Commerce Reviews.csv'

***

<font size="6"><b>Data Cleaning</b> </font>

***

## Removing First Unnamed Column

In [None]:
# Drop every column whose name begins with "Unnamed" (the index column written by a previous to_csv)
is_unnamed = review.columns.str.contains('^Unnamed')
review = review.loc[:, ~is_unnamed]
review.head()

## Checking Null Values

In [None]:
# Count the missing values in each column
review.isna().sum()

## Removing Null Values

In [None]:
# Rows with any missing value are dropped; per the original note they make up
# less than 30% of the whole data.
review = review.dropna(axis='index')

## Checking Null Values After Removing

In [None]:
# Re-count missing values per column to confirm none remain after dropna
review.isna().sum()

***

<font size="6"><b>Data Exploration</b> </font>

***

## Number of Rows and Columns

In [None]:
# (rows, columns) of the cleaned frame.
# Shows that there are 19662 rows and 10 columns
review.shape

## Data Type of Each Column

In [None]:
# Shows the data type pandas inferred for each column
review.dtypes

## Describe Dataset

In [None]:
# Summary statistics of the numeric columns:
# count, mean, standard deviation, minimum, maximum, 25% 50% 75% percentiles
review.describe()

In [None]:
# Same summary restricted to object (text) columns
review.describe(include=object)

## Count of Age of Reviewer

In [None]:
# Number of reviews for each reviewer age
age_counts = review.groupby('Age').size()
print(age_counts)

In [None]:
# Histogram of reviewer ages
age_ax = sns.histplot(data=review, x='Age')
age_ax.set_title('Count of Age of Reviewer')
plt.show()

## Number of Customers' Positive and Negative Recommendations

In [None]:
# Number of reviews per value of the recommendation flag; `ri` is reused by the pie chart cell below
ri = review.groupby('Recommended IND').size()
print(ri)

In [None]:
# Wedges follow the order of `ri`, i.e. the groupby keys of 'Recommended IND'.
# The hard-coded labels assume those keys come out as 0 then 1 (groupby sorts keys by default)
# and that 0 means "not recommended" — TODO confirm against the printed `ri`.
ri_labels = ['Not Recommended', 'Recommended']
plt.pie(ri, labels=ri_labels, autopct='%.1f%%')
plt.title('Number of Positive and Negative Reviews')
plt.legend()
plt.show()

## Number of Different Divisions

In [None]:
# Number of reviews in each division
division_counts = review.groupby('Division Name').size()
print(division_counts)

In [None]:
# Bar chart of review counts per division
division_ax = sns.countplot(data=review, x='Division Name')
division_ax.set_title('Number of Different Divisions')
plt.show()

## Number of Different Departments

In [None]:
# Number of reviews per department; `rdn` is reused by the pie chart cell below
rdn = review.groupby('Department Name').size()
print(rdn)

In [None]:
# Take the labels from the index of `rdn` so that each wedge is labelled with the department
# whose count it shows. `review['Department Name'].unique()` returns departments in order of
# first appearance, whereas `groupby(...).size()` is sorted by key, so using unique() here
# attaches the wrong names to the wedges.
rdn_labels = rdn.index
plt.pie(rdn, labels=rdn_labels, autopct='%.1f%%')
plt.title('Number of Different Department')
plt.legend()
plt.show()

## Number of Different Classes

In [None]:
# Number of reviews in each product class
class_counts = review.groupby('Class Name').size()
print(class_counts)

In [None]:
# Bar chart of review counts per class; x tick labels are rotated so the class names stay readable
class_ax = sns.countplot(data=review, x='Class Name')
plt.xticks(rotation=90)
class_ax.set_title('Number of Different Classes')
plt.show()

## Number of Customer's Rating from 1 to 5

In [None]:
# Number of reviews per rating value; `rr` is reused by the pie chart cell below
rr = review.groupby('Rating').size()
print(rr)

In [None]:
# Take the labels from the index of `rr` so that each wedge is labelled with the rating whose
# count it shows. `review['Rating'].unique()` is in order of first appearance, while
# `groupby(...).size()` is sorted by rating, so using unique() here mislabels the wedges.
rr_labels = rr.index
plt.pie(rr, labels=rr_labels, autopct='%.1f%%')
plt.title('Number of Rating from 1 to 5')
plt.legend()
plt.show()

## Number of Positive Feedbacks

In [None]:
# Number of reviews for each positive-feedback count
feedback_counts = review.groupby('Positive Feedback Count').size()
print(feedback_counts)

In [None]:
# Widen the default figure; note sns.set changes the global settings for every later plot too
sns.set(rc={'figure.figsize': (18, 5)})
feedback_ax = sns.countplot(data=review, x='Positive Feedback Count', label="Number of Visitors")
plt.xticks(rotation=90)
feedback_ax.set_title('Number of Positive Feedbacks')
plt.show()

## Rating Compared to Recommended or Not

In [None]:
# Count of reviews per rating, split by the recommendation flag
g = sns.catplot(data = review, x ="Rating", hue = "Recommended IND", kind='count', height=7, aspect=2.5, legend_out=False)

plt.title('Rating Distribution By Recommendation', fontsize=26)
plt.xlabel("Rating", fontsize=20)
plt.ylabel("Number of Recommendations", fontsize=20)
# Legend text is spelled 'Recommended' (it was 'Recomnended').
# The label order assumes the hue levels are drawn as 0 then 1 — TODO confirm.
plt.legend(title='Recommendation Indicator', loc='upper left', labels=['Not Recommended', 'Recommended'], fontsize='x-large', title_fontsize='24')

# Write each bar's count just above the bar
ax = g.facet_axis(0, 0)
for p in ax.patches:
    ax.text(p.get_x() + 0.12, 
            p.get_height() * 1.025, 
            '{0:.0f}'.format(p.get_height()), 
            color='black', rotation='horizontal', size='large')

plt.show()

***

<font size="6"><b>Data Preprocessing</b> </font>

***

## Before Processing

In [None]:
# Raw title of the row labelled 510, for comparison with the processed version later
review.loc[510, 'Title']

In [None]:
# Raw review text of the row labelled 51, for comparison with the processed version later
review.loc[51, 'Review Text']

In [None]:
# Word frequencies over all reviews. split() + explode() yields one row per word without
# materialising the wide one-column-per-token frame that split(expand=True) builds.
words = review['Review Text'].str.split().explode().value_counts()
words200 = words.head(200)

# Build an explicit two-column frame for plotly. Referring to the counts as column `0` only
# works while value_counts() returns an unnamed Series; pandas >= 2.0 names it 'count'.
treemap_data = words200.rename_axis('word').reset_index(name='count')
fig = px.treemap(treemap_data, path=['word'], values='count', width=900, height=900)
fig.update_layout(title_text='Top Frequent 200 Words in Review Text (Before Processing)')
fig.update_traces(textinfo="label+value")
fig.show()

## Defining Functions for Text Preparation

In [None]:
# Removing Special Characters in Sentence
def removing_special_character(text):
    """Return ``text`` with all special characters stripped.

    Every character that is neither a word character (``\\w``: letters, digits,
    underscore) nor whitespace (``\\s``) is deleted.

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Raw string literal: in a plain literal '\w' and '\s' are invalid escape sequences,
    # which newer Python versions warn about.
    return re.sub(r'[^\w\s]', '', text)

# Lower-casing a sentence and splitting it into tokens
def tokenize_sentence(text):
    """Lower-case ``text`` and return the tokens produced by ``nltk.word_tokenize``."""
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

# Dropping every token that is not purely alphabetic
def removing_numbers(text):
    """Return the tokens of ``text`` for which ``str.isalpha()`` is true.

    ``text`` is iterated token by token; tokens containing digits (or any other
    non-letter), as well as empty tokens, are left out.
    """
    alphabetic_tokens = []
    for token in text:
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

# Removing Stopwords from text sentence
def removing_stopwords(text):
    """Return the tokens of ``text`` that are not English stopwords.

    The NLTK English stopword list is loaded on the first call only and kept as a
    frozenset on the function object. This function is applied once per row, so
    re-reading the list and scanning it linearly for every token is needless work.

    Parameters:
        text: iterable of string tokens.

    Returns:
        A list with the stopwords removed, in the original order.
    """
    stopword_set = getattr(removing_stopwords, "_stopword_set", None)
    if stopword_set is None:
        stopword_set = frozenset(nltk.corpus.stopwords.words("english"))
        removing_stopwords._stopword_set = stopword_set
    return [x for x in text if x not in stopword_set]

# Lemmatizing Sentence
def lemmatizer(text):
    """Return the tokens of ``text`` with ``WordNetLemmatizer.lemmatize`` applied to each.

    Parameters:
        text: iterable of string tokens.

    Returns:
        A list of lemmatized tokens, in the original order.
    """
    # One lemmatizer per call instead of constructing a new one for every token
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(x) for x in text]

# Gluing tokens back into one space-separated string
def join_token(text):
    """Join the tokens in ``text`` with a single space between consecutive tokens."""
    separator = " "
    return separator.join(text)

## Processing Title Column

In [None]:
# Step 1/6 for Title: strip special characters (string in, string out)
review['Title'] = review['Title'].map(removing_special_character)

In [None]:
# Step 2/6 for Title: lower-case and tokenize (string in, list of tokens out)
review['Title'] = review['Title'].map(tokenize_sentence)

In [None]:
# Step 3/6 for Title: keep only purely alphabetic tokens
review['Title'] = review['Title'].map(removing_numbers)

In [None]:
# Step 4/6 for Title: drop English stopwords
review['Title'] = review['Title'].map(removing_stopwords)

In [None]:
# Step 5/6 for Title: lemmatize every token
review['Title'] = review['Title'].map(lemmatizer)

In [None]:
# Step 6/6 for Title: join the tokens back into one space-separated string
review['Title'] = review['Title'].map(join_token)

## Processing Review Text Column

In [None]:
# Step 1/6 for Review Text: strip special characters (string in, string out)
review['Review Text'] = review['Review Text'].map(removing_special_character)

In [None]:
# Step 2/6 for Review Text: lower-case and tokenize (string in, list of tokens out)
review['Review Text'] = review['Review Text'].map(tokenize_sentence)

In [None]:
# Step 3/6 for Review Text: keep only purely alphabetic tokens
review['Review Text'] = review['Review Text'].map(removing_numbers)

In [None]:
# Step 4/6 for Review Text: drop English stopwords
review['Review Text'] = review['Review Text'].map(removing_stopwords)

In [None]:
# Step 5/6 for Review Text: lemmatize every token
review['Review Text'] = review['Review Text'].map(lemmatizer)

In [None]:
# Step 6/6 for Review Text: join the tokens back into one space-separated string
review['Review Text'] = review['Review Text'].map(join_token)

## After Processing

In [None]:
# Processed title of the row labelled 510 (same row inspected before processing)
review.loc[510, 'Title']

In [None]:
# Processed review text of the row labelled 51 (same row inspected before processing)
review.loc[51, 'Review Text']

In [None]:
# Word frequencies over all processed reviews. split() + explode() yields one row per word
# without materialising the wide one-column-per-token frame that split(expand=True) builds.
words = review['Review Text'].str.split().explode().value_counts()
words200 = words.head(200)

# Build an explicit two-column frame for plotly. Referring to the counts as column `0` only
# works while value_counts() returns an unnamed Series; pandas >= 2.0 names it 'count'.
treemap_data = words200.rename_axis('word').reset_index(name='count')
fig = px.treemap(treemap_data, path=['word'], values='count', width=900, height=900)
fig.update_layout(title_text='Top Frequent 200 Words in Review Text (After Processing)')
fig.update_traces(textinfo="label+value")
fig.show()

## Frequent Words in Positive Recommendation

In [None]:
# Reviews whose recommendation flag is 1
positive = review[review['Recommended IND']==1].copy()

# Word frequencies within those reviews; explode() avoids the wide frame of split(expand=True)
positive_words = positive['Review Text'].str.split().explode().value_counts()
words200 = positive_words.head(200)

# Build an explicit two-column frame for plotly. Referring to the counts as column `0` only
# works while value_counts() returns an unnamed Series; pandas >= 2.0 names it 'count'.
treemap_data = words200.rename_axis('word').reset_index(name='count')
fig = px.treemap(treemap_data, path=['word'], values='count', width=900, height=900)
fig.update_layout(title_text='Top Frequent 200 Positive Words in Review Text (After Processing)')
fig.update_traces(textinfo="label+value")
fig.show()

## Frequent Words in Negative Recommendation

In [None]:
# Reviews whose recommendation flag is 0
negative = review[review['Recommended IND']==0].copy()

# Word frequencies within those reviews; explode() avoids the wide frame of split(expand=True)
negative_words = negative['Review Text'].str.split().explode().value_counts()
words200 = negative_words.head(200)

# Build an explicit two-column frame for plotly. Referring to the counts as column `0` only
# works while value_counts() returns an unnamed Series; pandas >= 2.0 names it 'count'.
treemap_data = words200.rename_axis('word').reset_index(name='count')
fig = px.treemap(treemap_data, path=['word'], values='count', width=900, height=900)
fig.update_layout(title_text='Top Frequent 200 Negative Words in Review Text (After Processing)')
fig.update_traces(textinfo="label+value")
fig.show()

***

<font size="6"><b>1. Predicting using only Review Text</b> </font>

***

## Selecting Columns

In [None]:
# Feature: processed review text; target: recommendation flag
X1, Y1 = review["Review Text"], review["Recommended IND"]

## Splitting Dataset into 80% Training Set and 20% Testing Set

In [None]:
# Splitting into 80% train / 20% test.
# random_state fixes the shuffle so the split, and therefore every metric reported below,
# is the same on each Restart & Run All.
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=0.2, random_state=42)

print('Training Set:\tX1_train: ', X1_train.shape, ', Y1_train: ', Y1_train.shape, 
      '\nTesting Set:\tX1_test: ', X1_test.shape, ', Y1_test: ', Y1_test.shape)

## Converting into Vectorize Form

In [None]:
# The vectorizer is fitted on the training text only; the test text is transformed with
# that same fitted vectorizer.
vectorizer = CountVectorizer()

# NOTE: X1_train / X1_test are rebound from raw text to the vectorizer output, so re-running
# this cell without re-running the split cell would feed already-vectorized data back in.
X1_train = vectorizer.fit_transform(X1_train)
X1_test = vectorizer.transform(X1_test)

## Naive Bayes Multinomial

In [None]:
# Fit multinomial Naive Bayes on the vectorized training reviews
nb = MultinomialNB().fit(X1_train, Y1_train)

# Predictions are kept for the confusion matrix / classification report cell
nb_predict1 = nb.predict(X1_test)

print(f'Accuracy of Naive Bayes: {nb.score(X1_test, Y1_test):.2f}')

## Classification Report and Confusion Matrix

In [None]:
# Confusion matrix: test-set labels first, model predictions second
print('Confusion Matrix:', confusion_matrix(Y1_test, nb_predict1), sep='\n')

# Classification report for the same labels and predictions
print('\nClassification Report:', classification_report(Y1_test, nb_predict1), sep='\n')

***

<font size="6"><b>2. Predicting using only Review Text where Positive Feedback Count is greater than 1</b> </font>

***

## Selecting Columns

In [None]:
# Restrict to reviews whose positive-feedback count is greater than 1
r2 = review.loc[review['Positive Feedback Count'].gt(1)]
X2, Y2 = r2['Review Text'], r2['Recommended IND']

## Splitting Dataset into 80% Training Set and 20% Testing Set

In [None]:
# Splitting into 80% train / 20% test.
# random_state fixes the shuffle so the split, and therefore every metric reported below,
# is the same on each Restart & Run All.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size=0.2, random_state=42)

print('Training Set:\tX2_train: ', X2_train.shape, ', Y2_train: ', Y2_train.shape, 
      '\nTesting Set:\tX2_test: ', X2_test.shape, ', Y2_test: ', Y2_test.shape)

## Converting into Vectorize Form

In [None]:
# A fresh vectorizer for this experiment, fitted on the training text only; the test text is
# transformed with that same fitted vectorizer.
vectorizer = CountVectorizer()

# NOTE: X2_train / X2_test are rebound from raw text to the vectorizer output, so re-running
# this cell without re-running the split cell would feed already-vectorized data back in.
X2_train = vectorizer.fit_transform(X2_train)
X2_test = vectorizer.transform(X2_test)

## Naive Bayes Multinomial

In [None]:
# Fit multinomial Naive Bayes on the vectorized training reviews of the filtered subset
nb = MultinomialNB().fit(X2_train, Y2_train)

# Predictions are kept for the confusion matrix / classification report cell
nb_predict2 = nb.predict(X2_test)

print(f'Accuracy of Naive Bayes: {nb.score(X2_test, Y2_test):.2f}')

## Classification Report and Confusion Matrix

In [None]:
# Confusion matrix: test-set labels first, model predictions second
print('Confusion Matrix:', confusion_matrix(Y2_test, nb_predict2), sep='\n')

# Classification report for the same labels and predictions
print('\nClassification Report:', classification_report(Y2_test, nb_predict2), sep='\n')

***

<font size="6"><b>3. Predicting using only Title</b> </font>

***

## Selecting Columns

In [None]:
# Feature: processed review title; target: recommendation flag
X3, Y3 = review['Title'], review['Recommended IND']

## Splitting Dataset into 80% Training Set and 20% Testing Set

In [None]:
# Splitting into 80% train / 20% test.
# random_state fixes the shuffle so the split, and therefore every metric reported below,
# is the same on each Restart & Run All.
X3_train, X3_test, Y3_train, Y3_test = train_test_split(X3, Y3, test_size=0.2, random_state=42)

print('Training Set:\tX3_train: ', X3_train.shape, ', Y3_train: ', Y3_train.shape, 
      '\nTesting Set:\tX3_test: ', X3_test.shape, ', Y3_test: ', Y3_test.shape)

## Converting into Vectorize Form

In [None]:
# A fresh vectorizer for this experiment, fitted on the training titles only; the test titles
# are transformed with that same fitted vectorizer.
vectorizer = CountVectorizer()

# NOTE: X3_train / X3_test are rebound from raw text to the vectorizer output, so re-running
# this cell without re-running the split cell would feed already-vectorized data back in.
X3_train = vectorizer.fit_transform(X3_train)
X3_test = vectorizer.transform(X3_test)

## Naive Bayes Multinomial

In [None]:
# Fit multinomial Naive Bayes on the vectorized training titles
nb = MultinomialNB().fit(X3_train, Y3_train)

# Predictions are kept for the confusion matrix / classification report cell
nb_predict3 = nb.predict(X3_test)

print(f'Accuracy of Naive Bayes: {nb.score(X3_test, Y3_test):.2f}')

## Classification Report and Confusion Matrix

In [None]:
# Confusion matrix: test-set labels first, model predictions second
print('Confusion Matrix:', confusion_matrix(Y3_test, nb_predict3), sep='\n')

# Classification report for the same labels and predictions
print('\nClassification Report:', classification_report(Y3_test, nb_predict3), sep='\n')

***

<font size="6"><b>4. Predicting using only Title where Positive Feedback Count is greater than 1</b> </font>

***

## Selecting Columns

In [None]:
# Restrict to reviews whose positive-feedback count is greater than 1
r4 = review.loc[review['Positive Feedback Count'].gt(1)]
X4, Y4 = r4['Title'], r4['Recommended IND']

## Splitting Dataset into 80% Training Set and 20% Testing Set

In [None]:
# Splitting into 80% train / 20% test.
# random_state fixes the shuffle so the split, and therefore every metric reported below,
# is the same on each Restart & Run All.
X4_train, X4_test, Y4_train, Y4_test = train_test_split(X4, Y4, test_size=0.2, random_state=42)

print('Training Set:\tX4_train: ', X4_train.shape, ', Y4_train: ', Y4_train.shape, 
      '\nTesting Set:\tX4_test: ', X4_test.shape, ', Y4_test: ', Y4_test.shape)

## Converting into Vectorize Form

In [None]:
# A fresh vectorizer for this experiment, fitted on the training titles only; the test titles
# are transformed with that same fitted vectorizer.
vectorizer = CountVectorizer()

# NOTE: X4_train / X4_test are rebound from raw text to the vectorizer output, so re-running
# this cell without re-running the split cell would feed already-vectorized data back in.
X4_train = vectorizer.fit_transform(X4_train)
X4_test = vectorizer.transform(X4_test)

## Naive Bayes Multinomial

In [None]:
# Fit multinomial Naive Bayes on the vectorized training titles of the filtered subset
nb = MultinomialNB().fit(X4_train, Y4_train)

# Predictions are kept for the confusion matrix / classification report cell
nb_predict4 = nb.predict(X4_test)

print(f'Accuracy of Naive Bayes: {nb.score(X4_test, Y4_test):.2f}')

## Classification Report and Confusion Matrix

In [None]:
# Confusion matrix: test-set labels first, model predictions second
print('Confusion Matrix:', confusion_matrix(Y4_test, nb_predict4), sep='\n')

# Classification report for the same labels and predictions
print('\nClassification Report:', classification_report(Y4_test, nb_predict4), sep='\n')

***

<font size="6"><b>5. Predicting using Rating and Positive Feedback Count</b> </font>

***

## Selecting Columns

In [None]:
# Features: the Rating and Positive Feedback Count columns; target: recommendation flag
feature_columns_5 = ['Rating', 'Positive Feedback Count']
X5 = review[feature_columns_5]
Y5 = review['Recommended IND']

## Splitting Dataset into 80% Training Set and 20% Testing Set

In [None]:
# Splitting into 80% train / 20% test.
# random_state fixes the shuffle so the split, and therefore every metric reported below,
# is the same on each Restart & Run All.
X5_train, X5_test, Y5_train, Y5_test = train_test_split(X5, Y5, test_size=0.2, random_state=42)

print('Training Set:\tX5_train: ', X5_train.shape, ', Y5_train: ', Y5_train.shape, 
      '\nTesting Set:\tX5_test: ', X5_test.shape, ', Y5_test: ', Y5_test.shape)

## Converting into Vectorize Form

In [None]:
# Vectorization is intentionally disabled for this experiment: the two selected columns are
# passed to the classifier as they are (presumably because they are already numeric — TODO confirm).
# vectorizer = CountVectorizer()

# X5_train = vectorizer.fit_transform(X5_train)
# X5_test = vectorizer.transform(X5_test)

## Naive Bayes Multinomial

In [None]:
# Fit multinomial Naive Bayes directly on the two selected feature columns
nb = MultinomialNB().fit(X5_train, Y5_train)

# Predictions are kept for the confusion matrix / classification report cell
nb_predict5 = nb.predict(X5_test)

print(f'Accuracy of Naive Bayes: {nb.score(X5_test, Y5_test):.2f}')

## Classification Report and Confusion Matrix

In [None]:
# Confusion Matrix of Naive Bayes using testset output and predicted output.
# This must use experiment 5's own labels and predictions (Y5_test / nb_predict5);
# the cell was reporting experiment 4's results (Y4_test / nb_predict4).
print('Confusion Matrix:')
print(confusion_matrix(Y5_test, nb_predict5))

# Classification Report of Naive Bayes using testset output and predicted output
print('\nClassification Report:')
print(classification_report(Y5_test, nb_predict5))

# Process

1. Predicting using only Review Text

2. Predicting using only Review Text where Positive Feedback Count is greater than 1

3. Predicting using only Title

4. Predicting using only Title where Positive Feedback Count is greater than 1

5. Predicting using Rating and Positive Feedback Count

***