In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sentiment_labelled_sentences/yelp_labelled.txt"
            )
yelp_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
yelp_raw.columns = ['text', 'sentiment']

yelp_raw['text'] = yelp_raw['text'].str.replace(',', '')
yelp_raw['text'] = yelp_raw['text'].str.replace('.', '')
yelp_raw['text'] = yelp_raw['text'].str.replace('!', '')

# create sentiment features

keywords = ['great', 'best', 'good', 'nice', 'friendly', "attentive", "excellent", 'amazing', 'delicious', 'wonderful', 'love',
           'perfect', 'tasty', 'ambiance', 'attentive', 'absolutely','loved','prompt','beautiful','delight','tender',
           'right','fresh','best','fantastic','terrific','fun','flavourful', 'atmosphere',
           'clean','inexpensive','outstanding','recommend','favorite','5','better','perfectly','back','awesome',
           'filling','thumbs up', 'perfect', 'promptly', 'favourite', 'reasonable', 'quick', 'happy',
           'satifying', 'devine','yummy', 'moist', 'flavorful','pleasure','delightful', 'enjoyed', 'highlight']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    yelp_raw[str(key)] = yelp_raw.text.str.contains(' ' + str(key) + ' ', case=False)
    
yelp_raw['sentiment'] = (yelp_raw['sentiment'] == 1)

data = yelp_raw[keywords]
target = yelp_raw['sentiment']

In [3]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 295


In [4]:
# Test your model with different holdout groups.

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.7
Testing on Sample: 0.705


In [6]:
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=20)
print('With 30% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 30% Holdout: 0.67
Testing on Sample: 0.705


In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4, random_state=20)
print('With 40% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 40% Holdout: 0.6625
Testing on Sample: 0.705


In [8]:
# cross-validation

from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.64, 0.68, 0.7 , 0.69, 0.68, 0.68, 0.71, 0.73, 0.66, 0.71])

In [9]:
cross_val_score(bnb, data, target, cv=20)

array([0.64, 0.64, 0.66, 0.7 , 0.7 , 0.7 , 0.66, 0.72, 0.66, 0.74, 0.66,
       0.7 , 0.76, 0.66, 0.68, 0.78, 0.62, 0.66, 0.72, 0.7 ])

In [11]:
# create confusion matrix

from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report

actual = yelp_raw['sentiment']
predicted = y_pred
results = confusion_matrix(actual, predicted)

# recall=when its true how often are we predicting true
# precision=when it predicts positive, how often is it correct
#f1 score - measurement that represents both of predicted and actual. We calculate an F-measure which uses Harmonic Mean in place of Arithmetic Mean as it punishes the extreme values more.
# The F-Measure will always be nearer to the smaller value of Precision or Recall.
print(results)
print('Accuracy Score:',accuracy_score(actual, predicted))
print(classification_report(actual, predicted))

[[471  29]
 [266 234]]
Accuracy Score: 0.705
              precision    recall  f1-score   support

       False       0.64      0.94      0.76       500
        True       0.89      0.47      0.61       500

    accuracy                           0.70      1000
   macro avg       0.76      0.70      0.69      1000
weighted avg       0.76      0.70      0.69      1000

