In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, random
from copy import copy

In [2]:
import nltk
# Make sure NLTK's stop-word corpus is available locally before the
# `stopwords` reader is used later in the notebook.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shikhar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Read the corpus one review per line. Every punctuation mark and digit is
# replaced by a space first, then each run of whitespace (including the
# trailing newline) is collapsed into a single space.
with open('all_sentiment_shuffled.txt', 'r') as file:
    data = [
        re.sub(r'(\s+)', ' ', re.sub(r'([^\w\s]|[0-9])', ' ', line))
        for line in file.readlines()
    ]

In [4]:
# Tokenise each cleaned line on whitespace: the first three tokens are kept
# as separate fields and everything after them becomes the word list.
split_data = [
    (tokens[0], tokens[1], tokens[2], tokens[3:])
    for tokens in (raw.strip().split() for raw in data)
]

In [None]:
# Field 1 of every record is the class label used as the target.
Y = [record[1] for record in split_data]

# English stop words plus the empty string are dropped from every document.
stop_words = set(stopwords.words('english'))
stop_words.add('')

# Field 3 of every record is the token list; keep only non-stop-word tokens.
X = [[tok for tok in record[3] if tok not in stop_words] for record in split_data]

In [None]:
# Eyeball the first five documents after stop-word filtering.
print(X[:5])

[['bought', 'album', 'loved', 'title', 'song', 'great', 'song', 'bad', 'rest', 'album', 'right', 'well', 'rest', 'songs', 'filler', 'n', 'worth', 'money', 'paid', 'either', 'shameless', 'bubblegum', 'oversentimentalized', 'depressing', 'tripe', 'kenny', 'chesney', 'popular', 'artist', 'result', 'cookie', 'cutter', 'category', 'nashville', 'music', 'scene', 'gotta', 'pump', 'albums', 'record', 'company', 'keep', 'lining', 'pockets', 'suckers', 'keep', 'buying', 'garbage', 'perpetuate', 'garbage', 'coming', 'town', 'get', 'soapbox', 'country', 'music', 'really', 'needs', 'get', 'back', 'roots', 'stop', 'pop', 'nonsense', 'country', 'music', 'really', 'considered', 'mainstream', 'two', 'different', 'things'], ['misled', 'thought', 'buying', 'entire', 'cd', 'contains', 'one', 'song'], ['introduced', 'many', 'ell', 'high', 'school', 'students', 'lois', 'lowery', 'depth', 'characters', 'brilliant', 'writer', 'capable', 'inspiring', 'fierce', 'passion', 'readers', 'encounter', 'shocking', 'de

In [None]:
def get_vocab(XData):
    """Return the set of distinct words found across all documents.

    Args:
        XData: Iterable of documents, each an iterable of word tokens.

    Returns:
        A ``set`` with one entry per unique token; empty if there are no
        documents or no tokens.
    """
    return {token for document in XData for token in document}

In [None]:
def TrainNaiveBayes(XTrain, YTrain, alpha=1):
    """Fit a bag-of-words Naive Bayes model with additive smoothing.

    Args:
        XTrain: List of documents, each a list of word tokens.
        YTrain: List of class labels aligned with ``XTrain``.
        alpha: Pseudo-count added to every (word, class) count.

    Returns:
        A tuple ``(prior, prob_word_given_class, vocabulary)``:

        * ``prior[c]`` is the log of the fraction of labels equal to ``c``.
        * ``prob_word_given_class[c][w]`` is
          ``log((count(w, c) + alpha) / (words_in_c + alpha * |vocabulary|))``.
        * ``vocabulary`` is the set of all words occurring in ``XTrain``.
    """
    from collections import Counter

    vocabulary = get_vocab(XTrain)
    classes = set(YTrain)
    label_counts = Counter(YTrain)

    # Count every word once per class in a single pass over the corpus,
    # instead of rescanning all documents for each vocabulary word.
    word_counts = {c: Counter() for c in classes}
    for doc, label in zip(XTrain, YTrain):
        word_counts[label].update(doc)

    prior = {}
    prob_word_given_class = {}
    for c in classes:
        prior[c] = np.log(label_counts[c] / len(YTrain))

        counts = word_counts[c]
        total_word_count = sum(counts.values())
        denominator = total_word_count + alpha * len(vocabulary)
        # Counter returns 0 for unseen words, so every vocabulary word gets
        # a smoothed probability even if it never appears in this class.
        prob_word_given_class[c] = {
            word: np.log((counts[word] + alpha) / denominator)
            for word in vocabulary
        }

    return prior, prob_word_given_class, vocabulary

In [None]:
# Smoke-test training on a three-document slice; the cell displays the
# returned (prior, prob_word_given_class, vocabulary) tuple.
TrainNaiveBayes(X[1:4], Y[1:4])

({'pos': -1.0986122886681098, 'neg': -0.40546510810816444},
 {'pos': {'older': -4.727387818712341,
   'good': -4.034240638152395,
   'depth': -4.727387818712341,
   'cd': -4.727387818712341,
   'one': -4.727387818712341,
   'characters': -4.727387818712341,
   'graders': -4.727387818712341,
   'return': -4.034240638152395,
   'thought': -4.727387818712341,
   'introduced': -4.727387818712341,
   'language': -4.727387818712341,
   'worlds': -4.727387818712341,
   'shocking': -4.727387818712341,
   'passion': -4.727387818712341,
   'sadly': -4.727387818712341,
   'th': -4.727387818712341,
   'bible': -4.034240638152395,
   'great': -3.628775530044231,
   'message': -4.727387818712341,
   'fast': -4.034240638152395,
   'utopian': -4.727387818712341,
   'still': -4.034240638152395,
   'site': -4.034240638152395,
   'fiction': -4.034240638152395,
   'inspiring': -4.727387818712341,
   'class': -4.727387818712341,
   'writer': -4.727387818712341,
   'christ': -4.034240638152395,
   'anxious'

In [None]:
def PredNaiveBayes(XTest, prior, prob_word_given_class, vocabulary):
    """Assign each test document the class with the highest log-posterior.

    Args:
        XTest: List of documents, each a list of word tokens.
        prior: Mapping from class label to log prior.
        prob_word_given_class: Mapping ``class -> word -> log-likelihood``.
        vocabulary: Collection of known words; tokens outside it are skipped.

    Returns:
        A list with one predicted label per document. On a tie the class
        that comes first in ``prior`` wins; the entry is ``None`` when no
        class scores above negative infinity (e.g. ``prior`` is empty).
    """
    pred_labels = []

    for document in XTest:
        # Out-of-vocabulary tokens carry no evidence for any class.
        known_tokens = [tok for tok in document if tok in vocabulary]
        best_label, best_score = None, -float('inf')

        for label in prior:
            score = prior[label]
            for tok in known_tokens:
                score += prob_word_given_class[label][tok]

            # Strict comparison keeps the earliest class when scores tie.
            if score > best_score:
                best_label, best_score = label, score

        pred_labels.append(best_label)

    return pred_labels

In [None]:
def get_scores(ytrue, ypred, pos_class='pos', neg_class='neg'):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    Args:
        ytrue: Sequence of gold labels.
        ypred: Sequence of predicted labels, aligned with ``ytrue``.
        pos_class: Label treated as the positive class.
        neg_class: Label treated as the negative class.

    Returns:
        A tuple ``(acc, prec, rec, f1)``. A metric whose denominator is zero
        is returned as ``float('nan')``; accuracy is NaN for empty ``ytrue``.
        Pairs containing a label other than the two classes are not counted
        as correct but still count towards the accuracy denominator.
    """
    nan = float('nan')

    # Compare the individual (gold, predicted) pairs, not the whole lists.
    pairs = list(zip(ytrue, ypred))
    true_positives = sum(1 for a, b in pairs if a == pos_class and b == pos_class)
    false_positives = sum(1 for a, b in pairs if a == neg_class and b == pos_class)
    true_negatives = sum(1 for a, b in pairs if a == neg_class and b == neg_class)
    false_negatives = sum(1 for a, b in pairs if a == pos_class and b == neg_class)

    if len(ytrue) != 0:
        acc = (true_positives + true_negatives) / len(ytrue)
    else:
        acc = nan

    if (true_positives + false_positives) != 0:
        prec = true_positives / (true_positives + false_positives)
    else:
        prec = nan

    if (true_positives + false_negatives) != 0:
        rec = true_positives / (true_positives + false_negatives)
    else:
        rec = nan

    # NaN precision or recall propagates into F1 through the arithmetic.
    if (prec + rec) != 0:
        f1 = 2. * prec * rec / (prec + rec)
    else:
        f1 = nan

    return acc, prec, rec, f1

In [None]:
def TrainTestNaiveBayes(XTrain, YTrain, XTest, YTest):
    """Train on one split, predict on another and score the predictions.

    Args:
        XTrain: Training documents.
        YTrain: Training labels aligned with ``XTrain``.
        XTest: Held-out documents to classify.
        YTest: Gold labels aligned with ``XTest``.

    Returns:
        Whatever ``get_scores`` returns for the gold and predicted labels.
    """
    model = TrainNaiveBayes(XTrain, YTrain)
    predictions = PredNaiveBayes(XTest, *model)
    return get_scores(YTest, predictions)

In [None]:
# Shuffle documents and labels together so the folds built below are a random
# mix. A dedicated, seeded generator makes the permutation (and therefore the
# reported scores) reproducible on a fresh Restart & Run All without touching
# the global `random` state; change SHUFFLE_SEED to draw a different one.
SHUFFLE_SEED = 0
data = list(zip(X, Y))
random.Random(SHUFFLE_SEED).shuffle(data)
X, Y = [d[0] for d in data], [d[1] for d in data]

In [None]:
num_folds = 5

# Split into contiguous folds whose sizes differ by at most one: the first
# `remainder` folds receive one extra item. round(n / k + 0.5) is not used
# because round() sends halves to the nearest even integer, which can
# oversize the folds and leave the last one empty (e.g. 15 items, 5 folds).
base_size, remainder = divmod(len(X), num_folds)
split_size = base_size + (1 if remainder else 0)  # size of the largest fold

X_splits = []
Y_splits = []
start = 0
for i in range(num_folds):
    stop = start + base_size + (1 if i < remainder else 0)
    X_splits.append(X[start:stop])
    Y_splits.append(Y[start:stop])
    start = stop

In [None]:
# Sanity check: the first fold holds as many documents as labels.
len(X_splits[0]), len(Y_splits[0])

(2383, 2383)

In [None]:
all_scores = []

# k-fold cross-validation: every fold is held out once as the test set while
# the remaining folds are concatenated, in order, into the training set.
for fold in range(num_folds):
    XTest, YTest = X_splits[fold], Y_splits[fold]
    XTrain = [doc for idx, part in enumerate(X_splits) if idx != fold for doc in part]
    YTrain = [label for idx, part in enumerate(Y_splits) if idx != fold for label in part]

    scores = TrainTestNaiveBayes(XTrain, YTrain, XTest, YTest)
    print('Acc: {} Prec: {}, Rec: {}, F1: {}'.format(*scores))
    all_scores.append(scores)