In [None]:
import pandas as pd
import numpy as np
import wordcloud
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
import itertools

In [None]:
# Load the review data from reviews.csv (decoded as latin-1) and preview the
# first rows.
df = pd.read_csv("reviews.csv", encoding="latin-1")
df.head(n=5)

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Display the frame re-indexed by review_stars. The result is not assigned,
# so df itself is left unchanged by this cell.
df.set_index('review_stars')

In [None]:
# Re-indexed view of the data keyed by the review_stars labels, so that all
# rows for a given rating can be looked up with .loc.
df_stars = df.set_index('review_stars')

In [None]:
# Show the rows whose review_stars label is '1.0 out of 5 stars'.
df_stars.loc['1.0 out of 5 stars']

In [None]:
# Show the rows whose review_stars label is '5.0 out of 5 stars'.
df_stars.loc['5.0 out of 5 stars']

In [None]:
# Preview the first 11 review texts.
df['review_body'].head(11)

In [None]:
# Pull out the review text column and count its non-null entries.
reviews = df.loc[:, 'review_body']
reviews.count()

In [None]:
# Word cloud of the review text: a quick visual summary of frequent keywords.
# The text is taken from df because df_work is only created further down the
# notebook; referencing it here raises NameError on a fresh kernel.
# Missing review bodies are dropped (and the rest cast to str) because
# str.join() only accepts strings.
review_text = ' '.join(df['review_body'].dropna().astype(str))
cloud = wordcloud.WordCloud(background_color='black', max_font_size=45,
                            relative_scaling=1).generate(review_text)

In [None]:
# Render the word cloud on a wide canvas with the axes hidden.
fig, ax = plt.subplots(figsize=(20, 10))
ax.axis('off')
ax.imshow(cloud);

In [None]:
# NLTK resources used by this notebook: the English stopword list, the Punkt
# tokenizer data behind word_tokenize, and the WordNet data behind
# WordNetLemmatizer.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# load the Punkt model from that package (TODO confirm against the installed
# NLTK version); downloading both keeps the notebook working either way.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)
stops = stopwords.words('english')

In [None]:
def tokenize(text):
    """Split a review into lemmatized word tokens with punctuation removed.

    Args:
        text: Review text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing review) yields an empty list
            instead of crashing ``word_tokenize``.

    Returns:
        list of str: Tokens from ``word_tokenize`` with every character in
        ``string.punctuation`` stripped, passed through ``lemmatize``.
        Tokens that consisted only of punctuation are dropped rather than
        being kept as empty strings.
    """
    if not isinstance(text, str):
        return []
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stripped = (token.translate(strip_punctuation) for token in word_tokenize(text))
    # A token such as "," or "!" becomes "" once stripped; keep only real words.
    words = [token for token in stripped if token]
    return lemmatize(words)


def lemmatize(tokens):
    """Return the WordNet lemma of every token, preserving order.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list of str: One lemma per input token, produced by
        ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, tokens))

In [None]:
# Tokenize every review. tokenize() strips punctuation and lemmatizes; it does
# not remove stopwords.
# The series is rebuilt from df rather than from `reviews` itself so that
# re-running this cell does not try to tokenize already-tokenized lists.
# Missing reviews are dropped first because word_tokenize() expects a string.
reviews = df['review_body'].dropna().astype(str).apply(tokenize)
reviews.head(11)

In [None]:
# Convert the textual rating labels into integer star counts. Values that are
# not listed in the mapping are left unchanged by replace().
star_labels = {
    '5.0 out of 5 stars': 5,
    '4.0 out of 5 stars': 4,
    '3.0 out of 5 stars': 3,
    '2.0 out of 5 stars': 2,
    '1.0 out of 5 stars': 1,
}
df['review_stars'] = df['review_stars'].replace(star_labels)

# index=False keeps the row index out of the file, so reloading it does not
# produce a stray "Unnamed: 0" column. The file is written as latin-1 to match
# the encoding the notebook uses when reading it back; writing the default
# UTF-8 and then reading as latin-1 would garble non-ASCII characters.
df.to_csv("reviews_work.csv", index=False, encoding='latin-1')

In [None]:
df = pd.read_csv('reviews_work.csv', encoding='latin-1')

# Drop rows that have a missing value in any column.
df = df.dropna()
# 3-star reviews are treated as neutral and excluded from the working set.
# .copy() gives df_work its own data, so adding columns to it later does not
# operate on a filtered slice of df (pandas' SettingWithCopyWarning).
df_work = df[df['review_stars'] != 3].copy()
df_work

In [None]:
# Binary sentiment label: 1 when the review has more than 3 stars, otherwise 0.
# assign() returns a new frame, so the column is never written onto what may
# be a filtered slice of df (which triggers pandas' SettingWithCopyWarning),
# and the comparison is vectorized instead of looping over the rows.
df_work = df_work.assign(positivity=(df_work['review_stars'] > 3).astype(int))
df_work.head()

In [None]:
# Review text of the working set (the split below reads the column from
# df_work directly).
review_body = df_work.loc[:, "review_body"]

In [None]:
# Train/test split: review text is the input, the positivity flag the target.
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_work['review_body'],
    df_work['positivity'],
    random_state=0,
)

In [None]:
# Report the sizes of the four splits, with a blank line after each pair.
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}\n")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}\n")

In [None]:
# Bag-of-words features for the training text. The vocabulary is learned from
# x_train only and keeps terms that appear in at least 5 documents.
vectorizer = CountVectorizer(min_df=5)
X_train = vectorizer.fit_transform(x_train)
print(f"X_train:\n{X_train!r}")

In [None]:
# Try a Logistic Regression model: 5-fold cross-validated accuracy on the
# training features, using the estimator's default settings.
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print(f"Mean cross-validation accuracy: {np.mean(scores):.3f}")

In [None]:
# Fit the logistic regression (C=0.1) on the training features.
logreg = LogisticRegression(C=0.1)
logreg.fit(X_train, y_train)

# Encode the held-out text with the vocabulary learned from the training set,
# then predict and score.
X_test = vectorizer.transform(x_test)
log_y_pred = logreg.predict(X_test)
logreg_score = accuracy_score(y_test, log_y_pred)
print(f"Accuracy:   {logreg_score:.3f}")

In [None]:
# Compare accuracy on the training and test features.
print(f"Training set score: {logreg.score(X_train, y_train):.3f}")
print(f"Test set score: {logreg.score(X_test, y_test):.3f}")

In [None]:
# Confusion matrix for the logistic regression predictions, followed by a
# legend showing which cell is which.
log_cfm = confusion_matrix(y_test, log_y_pred)
print("Confusion matrix:", log_cfm, sep='\n', end='\n\n')
print('-' * 15)
print(np.array([['TN', 'FP'], ['FN', 'TP']]))

In [None]:
# Heat map of the logistic regression confusion matrix with the raw counts
# written into each cell.
plt.imshow(log_cfm, interpolation='nearest')

# ndenumerate yields ((row, col), count); text() takes x (= column) first.
for (row, col), count in np.ndenumerate(log_cfm):
    plt.text(col, row, count, horizontalalignment="center", color="red")

plt.title('Logistic Reg | Confusion Matrix')
plt.xlabel('Predicted label (Precision)')
plt.ylabel('True label (Recall)')
plt.colorbar();

In [None]:
# F1 score of the logistic regression predictions on the test set.
log_f1 = f1_score(y_test, log_y_pred)
print(f"Logistic Reg - F1 score: {log_f1:.3f}")

In [None]:
# Build a Multinomial Naive Bayes model on the same features, for comparison
# with the logistic regression above.
mnb = MultinomialNB(alpha=0.01)
mnb.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out features with the fitted Naive Bayes model.
mnb_y_pred = mnb.predict(X_test)

In [None]:
# Test-set accuracy of the Naive Bayes predictions.
mnb_score = accuracy_score(y_test, mnb_y_pred)
print(f"Accuracy:   {mnb_score:.3f}")

In [None]:
# Compare accuracy on the training and test features.
print(f"Training set score: {mnb.score(X_train, y_train):.3f}")
print(f"Test set score: {mnb.score(X_test, y_test):.3f}")

In [None]:
# Confusion matrix for the Naive Bayes predictions, followed by a legend
# showing which cell is which.
mnb_cfm = confusion_matrix(y_test, mnb_y_pred)
print("Confusion matrix:", mnb_cfm, sep='\n', end='\n\n')
print('-' * 15)
print(np.array([['TN', 'FP'], ['FN', 'TP']]))

In [None]:
# Heat map of the Naive Bayes confusion matrix with the raw counts written
# into each cell.
plt.imshow(mnb_cfm, interpolation='nearest')

# ndenumerate yields ((row, col), count); text() takes x (= column) first.
for (row, col), count in np.ndenumerate(mnb_cfm):
    plt.text(col, row, count, horizontalalignment="center", color="red")

plt.title('Multinomial | Confusion Matrix')
plt.xlabel('Predicted label (Precision)')
plt.ylabel('True label (Recall)')
plt.colorbar();

In [None]:
# F1 score of the Naive Bayes predictions on the test set.
mnb_f1 = f1_score(y_test, mnb_y_pred)
print(f"Multinomial NB - F1 score: {mnb_f1:.3f}")

In [None]:
# Comparing the accuracy and F1 scores above, Multinomial Naive Bayes appears to be the better model for this task.