# Naive Bayes for Sentiment Analysis

### Bayes rule

In [1]:
days = [["ran", "was tired"],
        ["ran", "was not tired"],
        ["didn't run", "was tired"],
        ["ran", "was tired"],
        ["didn't run", "was not tired"],
        ["ran", "was not tired"],
        ["ran", "was tired"]]

In [2]:
prob_tired = len([d for d in days if d[1] == "was tired"]) / len(days)
prob_tired

0.5714285714285714

In [3]:
prob_ran = len([d for d in days if d[0] == "ran"]) / len(days)
prob_ran

0.7142857142857143

In [4]:
prob_ran_given_tired = len([d for d in days if d[0] == "ran" and d[1] == "was tired"]) / len([d for d in days if d[1] == "was tired"])
prob_ran_given_tired

0.75

In [5]:
prob_tired_given_ran = (prob_ran_given_tired * prob_tired) / prob_ran
prob_tired_given_ran

0.6

### Naive Bayes 

In [6]:
days = [["ran", "was tired", "woke up early"],
        ["ran", "was not tired", "didn't wake up early"],
        ["didn't run", "was tired", "woke up early"],
        ["ran", "was tired", "didn't wake up early"],
        ["didn't run", "was tired", "woke up early"],
        ["ran", "was not tired", "didn't wake up early"],
        ["ran", "was tired", "woke up early"]]

In [7]:
new_day = ["ran", "didn't wake up early"]

In [8]:
def calc_y_probability(y_label, days):
    return len([d for d in days if d[1] == y_label]) / len(days)

In [9]:
def calc_ran_probability_given_y(ran_label, y_label, days):
    return len([d for d in days if d[1] == y_label and d[0] == ran_label]) / len(days)

In [11]:
def calc_woke_early_probability_given_y(woke_label, y_label, days):
    return len([d for d in days if d[1] == y_label and d[2] == woke_label]) / len(days)

In [13]:
denominator = len([d for d in days if d[0] == new_day[0] and d[2] == new_day[1]]) / len(days)
denominator

0.42857142857142855

In [14]:
prob_tired = (calc_y_probability("was tired", days) * calc_ran_probability_given_y(new_day[0], "was tired", days) * calc_woke_early_probability_given_y(new_day[1], "was tired", days)) / denominator
prob_tired

0.10204081632653061

In [15]:
prob_not_tired = (calc_y_probability("was not tired", days) * calc_ran_probability_given_y(new_day[0], "was not tired", days) * calc_woke_early_probability_given_y(new_day[1], "was not tired", days)) / denominator
prob_not_tired

0.054421768707482984

### Classification based on the probabilities

In [16]:
classification = "was tired"

if prob_not_tired > prob_tired:
    classification = "was not tired"
    
print("Final classification for new day: {0}. Tired probability: {1}. Not tired probability: {2}.".format(classification, prob_tired, prob_not_tired))

Final classification for new day: was tired. Tired probability: 0.10204081632653061. Not tired probability: 0.054421768707482984.


## Predicting the tone of a review

We'll be working with a CSV file containing movie reviews. Each row contains the text of the review, as well as a number indicating whether the tone of the review is positive(1) or negative(-1). We want to predict whether a review is negative or positive, based on the text alone.

In [18]:
from collections import Counter
import csv
import re

### Finding Word counts

In [19]:
with open("train.csv", 'r') as file:
    reviews = list(csv.reader(file))

In [21]:
reviews[:3]

[['plot : two teen couples go to a church party drink and then drive . they get into an accident . one of the guys dies but his girlfriend continues to see him in her life and has nightmares . what\'s the deal ? watch the movie and " sorta " find out . . . critique : a mind-fuck movie for the teen generation that touches on a very cool idea but presents it in a very bad package . which is what makes this review an even harder one to write since i generally applaud films which attempt',
  '-1'],
 ["the happy bastard's quick movie review damn that y2k bug . it's got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . little do they know the power within . . . going for the gore and bringing on a few action sequences here and there virus still feels very empty like a movie going for a",
  

In [22]:
def get_text(reviews, score):
    return " ".join([r[0].lower() for r in reviews if r[1] == str(score)])

In [23]:
def count_text(text):
    words = re.split("\s+", text)
    return Counter(words)

In [25]:
negative_text = get_text(reviews, -1)

In [26]:
positive_text = get_text(reviews, 1)

In [29]:
negative_counts = count_text(negative_text)
negative_counts

Counter({'plot': 79,
         ':': 278,
         'two': 106,
         'teen': 10,
         'couples': 2,
         'go': 41,
         'to': 1279,
         'a': 1941,
         'church': 2,
         'party': 15,
         'drink': 2,
         'and': 1414,
         'then': 58,
         'drive': 11,
         '.': 2752,
         'they': 161,
         'get': 76,
         'into': 116,
         'an': 308,
         'accident': 6,
         'one': 291,
         'of': 1649,
         'the': 3181,
         'guys': 17,
         'dies': 7,
         'but': 304,
         'his': 315,
         'girlfriend': 8,
         'continues': 4,
         'see': 79,
         'him': 57,
         'in': 954,
         'her': 104,
         'life': 48,
         'has': 203,
         'nightmares': 1,
         "what's": 12,
         'deal': 11,
         '?': 181,
         'watch': 28,
         'movie': 376,
         '"': 1228,
         'sorta': 1,
         'find': 28,
         'out': 142,
         'critique': 10,
         'mind

In [47]:
positive_counts = count_text(positive_text)
positive_counts

Counter({'films': 142,
         'adapted': 6,
         'from': 250,
         'comic': 12,
         'books': 7,
         'have': 187,
         'had': 77,
         'plenty': 9,
         'of': 1631,
         'success': 14,
         'whether': 7,
         "they're": 9,
         'about': 194,
         'superheroes': 3,
         '(': 520,
         'batman': 4,
         'superman': 1,
         'spawn': 3,
         ')': 499,
         'or': 133,
         'geared': 3,
         'toward': 8,
         'kids': 15,
         'casper': 2,
         'the': 3238,
         'arthouse': 2,
         'crowd': 4,
         'ghost': 4,
         'world': 58,
         'but': 320,
         "there's": 33,
         'never': 52,
         'really': 63,
         'been': 116,
         'a': 1933,
         'book': 21,
         'like': 146,
         'hell': 12,
         'before': 44,
         '.': 2639,
         'for': 404,
         'starters': 1,
         'it': 541,
         'was': 306,
         'created': 9,
         'by':

In [31]:
print("Negative text sample: {0}".format(negative_text[:100]))
print("Positive text sample: {0}".format(positive_text[:100]))

Negative text sample: plot : two teen couples go to a church party drink and then drive . they get into an accident . one 
Positive text sample: films adapted from comic books have had plenty of success whether they're about superheroes ( batman


### Predicting review classifications

In [32]:
def get_y_count(score):
    return len([r for r in reviews if r[1] == str(score)])

In [33]:
positive_review_count = get_y_count(1)
negative_review_count = get_y_count(-1)

In [34]:
prob_positive = positive_review_count / len(reviews)
prob_positive

0.5007052186177715

In [35]:
prob_negative = negative_review_count / len(reviews)
prob_negative

0.4992947813822285

In [36]:
def make_class_prediction(text, counts, class_prob, class_count):
    prediction = 1
    text_counts = Counter(re.split("\s+", text))
    for word in text_counts:
        prediction =  prediction * text_counts.get(word) * ((counts.get(word, 0) + 1) / (sum(counts.values()) + class_count))
    return prediction * class_prob

In [37]:
print("Review: {0}".format(reviews[0][0]))
print("Negative prediction: {0}".format(make_class_prediction(reviews[0][0], negative_counts, prob_negative, negative_review_count)))
print("Positive prediction: {0}".format(make_class_prediction(reviews[0][0], positive_counts, prob_positive, positive_review_count)))

Review: plot : two teen couples go to a church party drink and then drive . they get into an accident . one of the guys dies but his girlfriend continues to see him in her life and has nightmares . what's the deal ? watch the movie and " sorta " find out . . . critique : a mind-fuck movie for the teen generation that touches on a very cool idea but presents it in a very bad package . which is what makes this review an even harder one to write since i generally applaud films which attempt
Negative prediction: 3.005053036235652e-221
Positive prediction: 1.307170546690679e-226


### Predicting the test set

In [39]:
def make_decision(text, make_class_prediction):
    negative_prediction = make_class_prediction(text, negative_counts, prob_negative, negative_review_count)
    positive_prediction = make_class_prediction(text, positive_counts, prob_positive, positive_review_count)
    if negative_prediction > positive_prediction:
        return -1
    return 1

In [40]:
with open("test.csv", 'r') as file:
    test = list(csv.reader(file))

In [44]:
predictions = [make_decision(r[0], make_class_prediction) for r in test]
predictions[:10]

[-1, -1, -1, 1, -1, -1, 1, -1, 1, -1]

### Computing prediction error

In [45]:
actual = [int(r[1]) for r in test]
actual[:10]

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]

In [48]:
from sklearn import metrics

In [49]:
fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)

In [50]:
print("AUC of the predictions: {0}".format(metrics.auc(fpr, tpr)))

AUC of the predictions: 0.680701754385965


## Naive Bayes using scikit-learn

In [51]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [52]:
vectorizer = CountVectorizer(stop_words='english', max_df=.05)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.05, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [53]:
train_features = vectorizer.fit_transform([r[0] for r in reviews])
train_features

<1418x13888 sparse matrix of type '<class 'numpy.int64'>'
	with 48704 stored elements in Compressed Sparse Row format>

In [54]:
test_features = vectorizer.transform([r[0] for r in test])
test_features

<197x13888 sparse matrix of type '<class 'numpy.int64'>'
	with 5750 stored elements in Compressed Sparse Row format>

In [55]:
nb = MultinomialNB()
nb.fit(train_features, [int(r[1]) for r in reviews])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [56]:
predictions = nb.predict(test_features)
predictions[:10]

array([-1, -1, -1,  1, -1, -1,  1, -1,  1, -1])

In [57]:
fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)

In [58]:
print("Multinomal naive bayes AUC: {0}".format(metrics.auc(fpr, tpr)))

Multinomal naive bayes AUC: 0.635500515995872
