In [3]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt 
%matplotlib inline

import warnings
# Silence every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides library deprecation warnings, e.g. from scikit-learn.
warnings.filterwarnings('ignore') 

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import plot_confusion_matrix, classification_report, accuracy_score


In [4]:
# Load the review data set. The path is relative, so the CSV has to sit in the
# notebook's working directory (otherwise read_csv raises FileNotFoundError).
reviews_path = 'clothing_reviews_clean_with_features.csv'
reviews_df = pd.read_csv(reviews_path)

FileNotFoundError: [Errno 2] No such file or directory: 'clothing_reviews_clean_with_features.csv'

In [None]:
# Preview the first rows of the loaded frame to sanity-check its columns.
reviews_df.head()

In [None]:
# Class balance of the target: number of rows per distinct value of 'recommended'.
reviews_df['recommended'].value_counts()

In [None]:
# Keep only the two columns the models need: the target and the text feature.
model_columns = ['recommended', 'lemmatized']
analysis_df = reviews_df.loc[:, model_columns]

In [None]:
# Display the target column on its own.
analysis_df['recommended']

In [None]:
# Feature = the 'lemmatized' text column, target = the 'recommended' flag.
X, y = analysis_df['lemmatized'], analysis_df['recommended']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Model 1: CountVectorizer + TF-IDF transformer + Naive Bayes

In [None]:
# Fit a stand-alone vectorizer on the training texts only, to see how large the
# unigram+bigram vocabulary is once terms found in fewer than 5 documents are dropped.
vect = CountVectorizer(min_df=5, ngram_range=(1, 2))
X_train_vectorized = vect.fit_transform(X_train)

# Vocabulary size. get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; the fitted vocabulary_ mapping gives the same count on every version.
len(vect.vocabulary_)

In [None]:
# Model 1: token counts -> TF-IDF weighting -> multinomial Naive Bayes.
nb_steps = [
    ('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
model_nb = Pipeline(nb_steps)
model_nb.fit(X_train, y_train)

# Predict the held-out texts; keep the true labels as a plain array for the metrics below.
pred_y = model_nb.predict(X_test)
ytest = np.array(y_test)

In [None]:
# Hold-out accuracy and per-class precision / recall / F1 for Model 1.
# scikit-learn metrics take (y_true, y_pred); both calls now use that order and the
# same ytest array (accuracy is symmetric, so the printed value is unchanged).
print('accuracy %s' % accuracy_score(ytest, pred_y))
print(classification_report(ytest, pred_y))

Model 2: Vectorizer + Naive Bayes

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Building blocks for Model 2: raw unigram+bigram counts (no min_df cut-off and
# no TF-IDF step) feeding a multinomial Naive Bayes classifier.
classifier = MultinomialNB()
vectorizer = CountVectorizer(ngram_range=(1, 2))

In [None]:
# Chain the vectorizer and the classifier so they are fitted and applied as one estimator.
pipeline_steps = [('vectorizer', vectorizer), ('classifier', classifier)]
sentiment_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Train Model 2 on the training split.
sentiment_pipeline.fit(X_train, y_train)

# Predict the held-out texts; keep the true labels as a plain array for the metrics below.
pred_y = sentiment_pipeline.predict(X_test)
ytest = np.array(y_test)

In [None]:
# Hold-out accuracy and per-class precision / recall / F1 for Model 2.
# Arguments follow scikit-learn's (y_true, y_pred) order and use the same ytest array
# in both calls (accuracy is symmetric, so the printed value is unchanged).
print('accuracy %s' % accuracy_score(ytest, pred_y))
print(classification_report(ytest, pred_y))

Model 3: Balanced samples. CountVectorizer + TF-IDF transformer + Naive Bayes.

In [None]:
# Separate the modelling frame by class so each class can be sampled on its own.
recommended_flag = analysis_df['recommended']
pos_df = analysis_df[recommended_flag.eq(1)]
neg_df = analysis_df[recommended_flag.eq(0)]

In [None]:
# Draw 4000 rows from each class so both classes are equally represented.
# The draw is seeded: the original call had no random_state, so every kernel restart
# trained and scored Model 3 on a different sample. The seed matches the one used
# for train_test_split in this notebook.
# sample() raises ValueError if a class has fewer than 4000 rows.
pos_sample = pos_df.sample(n=4000, random_state=0)
neg_sample = neg_df.sample(n=4000, random_state=0)

In [None]:
# Stack the two class samples row-wise into one frame (positives first, then negatives).
balanced_parts = [pos_sample, neg_sample]
new_df = pd.concat(balanced_parts, axis=0)

In [None]:
# Replace the row labels carried over from the source frame with a fresh 0..n-1 index;
# drop=True discards the old index instead of keeping it as a column.
new_df = new_df.reset_index(drop = True)

In [None]:
# Display the balanced frame to check its contents.
new_df

In [None]:
# Re-point X / y at the balanced frame and redo the 80/20 split with the same seed.
# This overwrites the X_train / X_test / y_train / y_test used by the earlier models.
y = new_df['recommended']
X = new_df['lemmatized']

split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
# Inspect the training texts produced by the split.
X_train

In [None]:
# Inspect the held-out texts produced by the split.
X_test

In [None]:
# Inspect the training labels produced by the split.
y_train

In [None]:
# Inspect the held-out labels produced by the split.
y_test

In [None]:
# Re-fit the stand-alone vectorizer on the current training texts to see how many
# unigram/bigram terms survive the min_df=5 cut-off.
vect = CountVectorizer(min_df=5, ngram_range=(1, 2))
X_train_vectorized = vect.fit_transform(X_train)

# Number of terms kept. get_feature_names() no longer exists in scikit-learn >= 1.2
# (deprecated since 1.0); the size of the fitted vocabulary_ dict is the same number
# and is available on every version.
len(vect.vocabulary_)

In [None]:
# I create a model pipeline including the vectorizer, TfidfTransformer and classifier

model_nb = Pipeline(steps=[
    # Count unigrams and bigrams; ignore terms with a document frequency lower than 5.
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=5)),
    # Re-weight the raw counts with TF-IDF.
    ('tfidf', TfidfTransformer()),
    # Multinomial Naive Bayes on the weighted features.
    ('clf', MultinomialNB()),
])
model_nb.fit(X_train, y_train)

# Predict the held-out texts; keep the true labels as a plain array for the metrics below.
pred_y = model_nb.predict(X_test)
ytest = np.array(y_test)

In [None]:
# Hold-out accuracy and per-class precision / recall / F1 for the current model_nb.
# Both metric calls use scikit-learn's (y_true, y_pred) order and the same ytest array
# (accuracy is symmetric, so the printed value is unchanged).
print('accuracy %s' % accuracy_score(ytest, pred_y))
print(classification_report(ytest, pred_y))

In [None]:
from sklearn.metrics import confusion_matrix

# Raw confusion-matrix counts for the latest predictions: scikit-learn puts the
# actual classes on the rows and the predicted classes on the columns.
print(confusion_matrix(ytest, pred_y))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import numpy as np

# Confusion-matrix heatmap for the latest predictions (ytest / pred_y).
# `class_names` was never defined in this notebook, so the original cell stopped with
# a NameError. The balanced frame is built only from rows whose 'recommended' value
# is 0 or 1, so those are the two classes.
class_names = [0, 1]

# Pin the label order so the matrix rows/columns line up with class_names.
cnf_matrix = metrics.confusion_matrix(ytest, pred_y, labels=class_names)

fig, ax = plt.subplots()

# Put the class names on the frame's index/columns: seaborn takes its tick labels from
# there, whereas ticks set through plt.xticks before the call are overwritten by it.
sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    ax=ax,
)
ax.xaxis.set_label_position("top")
fig.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label');