# Mini Project 2: IMDB Sentiment Analysis

February 22, 2019

Akshal Aniche, Jacob Sanz-Robinson, Raphael Hotter

COMP 551

## Imports

In [4]:
import numpy as np

## Load the data

In [5]:
import os
pos_train_files = os.listdir('data/train/pos/')
neg_train_files = os.listdir('data/train/neg/')
test_files = os.listdir('data/test/')

pos_words = 'opinion-lexicon-English/positive-words.txt'
neg_words = 'opinion-lexicon-English/negative-words.txt'


# Remove .DS_Store files
while '.DS_Store' in pos_train_files:
  pos_train_files.remove('.DS_Store')
while '.DS_Store' in neg_train_files:
  neg_train_files.remove('.DS_Store')
while '.DS_Store' in test_files:
  test_files.remove('.DS_Store')

test_files.sort(key=lambda x : int(x[:-4]))

In [6]:
# Reads data into 2 arrays
train_text, test_raw = [], []

for file in pos_train_files:
  with open('data/train/pos/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in neg_train_files:
  with open('data/train/neg/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in test_files:
  with open('data/test/{}'.format(file), 'r') as f:
    test_raw.append(f.read().lower())

In [7]:
# Training set Y vector
pos_goal = np.ones((12500))
neg_goal = np.zeros((12500))
Y_train = np.append(pos_goal, neg_goal, axis = 0)

### Read lexicon of sentiment words

In [8]:
sent_words = []

with open(pos_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())

with open(neg_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())

            
print(sent_words)



## Data processer preparation

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Normalizer

bin_vec = CountVectorizer(binary=True)
tfidf_vec = TfidfVectorizer()
bigram_vec = CountVectorizer(ngram_range=(1,2))
sentc_vec = CountVectorizer(binary=True)
sentt_vec = TfidfVectorizer()

## Text processing

### Binary representation

In [10]:
# Tokenize and build the vocabulary
bin_vec.fit(train_text)

# Create the feautre matrices 
train_bin = bin_vec.transform(train_text)
test_bin = bin_vec.transform(test_raw)

## Binary representation, only sentiment lexicon words

In [11]:
# Tokenize and build vocabulary 
sentc_vec.fit(sent_words)

# Create the feautre matrices 
train_s_bin = sentc_vec.transform(train_text)
test_s_bin = sentc_vec.transform(test_raw)
print(train_s_bin.shape)

(25000, 6781)


### Monograms and Bigrams 

In [9]:
# Tokenize and build the vocabulary
bigram_vec.fit(train_text)

# Create the feautre matrices 
train_bigram = bigram_vec.transform(train_text)
test_bigram = bigram_vec.transform(test_raw)

### Tf-Idf 

In [10]:
# Tokenize and build the vocabulary
tfidf_vec.fit(train_text)

# Create the feautre matrices 
train_tfidf = tfidf_vec.transform(train_text)
test_tfidf = tfidf_vec.transform(test_raw)

### Tf-Idf with only sentiment lexicon, Normalized

In [12]:
# Tokenize and build the vocabulary
sentt_vec.fit(sent_words)

# Create the feautre matrices 
train_s_tfidf = sentt_vec.transform(train_text)
test_s_tfidf = sentt_vec.transform(test_raw)

normalizer_tran = Normalizer().fit(X=train_s_tfidf)
X_train_sn = normalizer_tran.transform(train_s_tfidf)
X_test_sn = normalizer_tran.transform(test_s_tfidf)

### Normalized Tf-Idf

In [12]:
from sklearn.preprocessing import Normalizer
normalizer_tranformer = Normalizer().fit(X=train_tfidf)
X_train_normalized = normalizer_tranformer.transform(train_tfidf)
X_test_normalized = normalizer_tranformer.transform(test_tfidf)

In [18]:
bin_vec.get_feature_names()

['00',
 '000',
 '0000000000001',
 '00001',
 '00015',
 '000s',
 '001',
 '003830',
 '006',
 '007',
 '0079',
 '0080',
 '0083',
 '0093638',
 '00am',
 '00pm',
 '00s',
 '01',
 '01pm',
 '02',
 '020410',
 '029',
 '03',
 '04',
 '041',
 '05',
 '050',
 '06',
 '06th',
 '07',
 '08',
 '087',
 '089',
 '08th',
 '09',
 '0f',
 '0ne',
 '0r',
 '0s',
 '10',
 '100',
 '1000',
 '1000000',
 '10000000000000',
 '1000lb',
 '1000s',
 '1001',
 '100b',
 '100k',
 '100m',
 '100min',
 '100mph',
 '100s',
 '100th',
 '100x',
 '100yards',
 '101',
 '101st',
 '102',
 '102nd',
 '103',
 '104',
 '1040',
 '1040a',
 '1040s',
 '105',
 '1050',
 '105lbs',
 '106',
 '106min',
 '107',
 '108',
 '109',
 '10am',
 '10lines',
 '10mil',
 '10min',
 '10minutes',
 '10p',
 '10pm',
 '10s',
 '10star',
 '10th',
 '10x',
 '10yr',
 '11',
 '110',
 '1100',
 '11001001',
 '1100ad',
 '111',
 '112',
 '1138',
 '114',
 '1146',
 '115',
 '116',
 '117',
 '11f',
 '11m',
 '11th',
 '12',
 '120',
 '1200',
 '1200f',
 '1201',
 '1202',
 '123',
 '12383499143743701',
 '1

## Bernoulli Naive Bayes model from scratch

In [17]:
import math

class NaiveBayesScratch():
    """Bernouli Naive Bayes"""
    def train(self, X_train, Y_train, bag_of_words):
        Y_train = Y_train.reshape(len(Y_train),1)
        self.bag_of_words_ = bag_of_words
        bow_array = bag_of_words.toarray() # bag of words
        num_pos = np.sum(Y_train)
        num_neg = len(Y_train) - num_pos
        
        self.theta_1_ = num_pos/float(len(Y_train))
        self.theta_j1_ = (bow_array*Y_train).sum(axis=0)/float(num_pos) # sum along training examples
        self.theta_j0_ = (1-bow_array)*(1-Y_train).sum(axis=0)/float(num_neg)
        
        # Laplace smoothing
        self.theta_j1_ = (self.theta_j1_ + 1)/(float(num_pos) + 2)
        self.theta_j0_ = (self.theta_j0_ + 1)/(float(num_neg) + 2)
        
        # Prepare for predictions
        self.predict_theta_1_ = math.log(self.theta_1_/(1-self.theta_1_))
        self.predict_pos_ = np.log(self.theta_j1_/self.theta_j0_).T
        self.predict_neg_ = np.log(1-self.theta_j1_/1-self.theta_j0_).T
    
    def predict(self, X):
        pos = np.dot(X, self.predict_pos_).sum(axis=1)
        neg = np.dot(1-X, self.predict_neg_).sum(axis=1)
        delta_predictions = self.predict_theta_1_ + pos + neg
        binary_predictions = delta_predictions > 0
        return binary_predictions

In [18]:
naive_bayes = NaiveBayesScratch()
naive_bayes.train(X_train_sn, Y_train, train_s_tfidf)

In [None]:
naive_bayes.predict(X_test_sn)

## Cross Validation

### To test standard Naive Bayes:

In [None]:
# Define train(), evaluate() functions
import random
import math

def k_cross_validate(X_train, Y_train, bin_vec, k):
    indeces = random.sample(range(X_train.shape[0]),X_train.shape[0])
    step = int(X_train.shape[0]/k)
    scores = []
    for k_fold in range(k):
        k_validate_indeces = indeces[k_fold*step:(k_fold+1)*step]
        k_train_indeces = [i for i in range(X_train.shape[0]) if i not in k_validate_indeces]
        
        b = NaiveBayesScratch()
        b.train(X_train[k_train_indeces], Y_train[k_train_indeces], bin_vec)
        predictions = b.evaluate(X_train[k_validate_indeces])
        score = np.sum(predictions == Y_train[k_validate_indeces])/len(Y_train) # accuracy
        scores.append(score)
    mean = np.array(scores).mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

    
# sample_indeces = random.sample(range(train_s_bin.shape[0]), 500)
k_cross_validate(train_s_bin, Y_train, sentc_vec, 3)

0
250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000
5250
5500
5750
6000
6250
6500
6750
7000
7250
7500
7750
8000
8250
8500
8750
9000
9250
9500
9750
10000
10250
10500
10750
11000
11250
11500
11750
12000
12250
12500
12750
13000
13250
13500
13750
14000
14250
14500
14750
15000
15250
15500
15750
16000
16250
16500


### Cross validation function for sklearn models

In [13]:
from sklearn.model_selection import cross_val_score
# Function to cross validate a scikitlearn model
def crossvalidate(model, X_train, Y_train, fold) :
    scores = cross_val_score(model, X_train, Y_train, cv=fold)
    mean = scores.mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

## Models

### Naive Bayes (to make sure it worked)

In [15]:
from sklearn.naive_bayes import BernoulliNB
bayes_clf = BernoulliNB()
crossvalidate(bayes_clf, X_train_normalized, Y_train, 4)

Scores: [0.85552 0.85424 0.832   0.85968]
Scores Mean: 0.85036


### Logistic Regression

In [17]:
from sklearn import linear_model
regr_clf = linear_model.LogisticRegression(solver='lbfgs')
crossvalidate(regr_clf, X_train_normalized, Y_train, 4)

Scores: [0.89008 0.8904  0.87408 0.888  ]
Scores Mean: 0.88564


### Decision Trees

In [16]:
from sklearn import tree
dec_clf = tree.DecisionTreeClassifier()
crossvalidate(dec_clf, X_train_normalized, Y_train, 4)

Scores: [0.70992 0.70832 0.69872 0.69744]
Scores Mean: 0.7035999999999999


I wonder if we can play with the parameters more 

### Support Vector Machines

In [42]:
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X_train_normalized, 4)

Scores: [0.84704 0.82496 0.84832 0.84848]
Scores Mean: 0.8422


# Additional Stuff (to delete)

In [79]:
import math

#Returns single value of delta x for prediction.
def predictNB(example, theta_j1, theta_j0, theta_1, bagOfWords):
    f1 = math.log(theta_1/(1.0 - theta_1))
    f2 = 0
    currWordNum = 0
    for word in bagOfWords.vocabulary_.items(): #for every word in the bag of words
        #if word is in example, xj is 1, else = 0
        x_j = example[currWordNum] # in calling fn do: testSet[exampleNum].toarray().sum(axis=0)

        if ((theta_j1[currWordNum] > 0.0) and (theta_j0[currWordNum] > 0)):
            t2 = math.log(theta_j1[currWordNum]/theta_j0[currWordNum]) * x_j
        else:
            t2 = 0
        t3 = (1 - x_j) * math.log((1 - theta_j1[currWordNum])/(1 - theta_j0[currWordNum]))
        
        t4 = t2 + t3
        f2 = f2 + t4
        currWordNum = currWordNum + 1    
    deltaX = f1 + f2
    p = 1/(1-math.exp(-deltaX))
    return p

#Returns predictions for a set
def evaluateNB(testSet, bagOfWords, theta_j1, theta_j0, theta_1): #testSet is test_bin, bagOfWords is bin_vec
    predictions = [] #Holds predicted classes for each example in testSet
    for review in testSet:
        example = review.sum(axis=0)
        p = predictNB(example, theta_j1, theta_j0, theta_1, bagOfWords)
        predictions.append(p)
    return predictions

#pred = evaluateNB(train_bin[12000:13000], bin_vec, t1, tj1, tj0)

In [85]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.estimator_checks import check_estimator


class ScratchNaiveBayes(BaseEstimator, ClassifierMixin):
    def fit(self, X, y):
        # assumes bin_vec is there
        self.thetaj1_, self.thetaj0_, self.theta1_ = trainNB(X, bin_vec, y)
        return self
    
    def predict(self, X, y=None):
        try:
            getattr(self, "thetaj1_")
        except AttributeError:
            raise RuntimeError("You must train classifer before predicting data")
        return evaluateNB(X, bin_vec, self.thetaj1_, self.thetaj0_, self.theta1_)
    
model = ScratchNaiveBayes()
# print(X_train_normalized[:100].shape)

import random
indeces = random.sample(range(X_train_normalized.shape[0]), 500)
crossvalidate(model, X_train_normalized[indeces], Y_train[indeces], 3)

0
250


IndexError: index 1 is out of bounds for axis 0 with size 1