# Mini Project 2: IMDB Sentiment Analysis

February 22, 2019

Akshal Aniche, Jacob Sanz-Robinson, Raphael Hotter

COMP 551

## Imports

In [2]:
import numpy as np

## Load the data

In [3]:
import os
pos_train_files = os.listdir('data/train/pos/')
neg_train_files = os.listdir('data/train/neg/')
test_files = os.listdir('data/test/')

pos_words = 'opinion-lexicon-English/positive-words.txt'
neg_words = 'opinion-lexicon-English/negative-words.txt'


# Remove .DS_Store files
while '.DS_Store' in pos_train_files:
  pos_train_files.remove('.DS_Store')
while '.DS_Store' in neg_train_files:
  neg_train_files.remove('.DS_Store')
while '.DS_Store' in test_files:
  test_files.remove('.DS_Store')

test_files.sort(key=lambda x : int(x[:-4]))

In [4]:
# Reads data into 2 arrays
train_text, test_raw = [], []

for file in pos_train_files:
  with open('data/train/pos/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in neg_train_files:
  with open('data/train/neg/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in test_files:
  with open('data/test/{}'.format(file), 'r') as f:
    test_raw.append(f.read().lower())

In [5]:
# Training set Y vector
pos_goal = np.ones((12500))
neg_goal = np.zeros((12500))
Y_train = np.append(pos_goal, neg_goal, axis = 0)

### Read lexicon of sentiment words

In [None]:
sent_words = []

with open(pos_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())

with open(neg_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())

## Data processer preparation

In [None]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

wnl = WordNetLemmatizer()
class LemmaTokenizer(object):
    def __call__(self, text):
        return [wnl.lemmatize(t) for t in word_tokenize(text)]
    

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Normalizer

bin_vec = CountVectorizer(binary=True, tokenizer=LemmaTokenizer())
tfidf_vec = TfidfVectorizer(tokenizer=LemmaTokenizer(), norm='l2')
bigram_vec = CountVectorizer(ngram_range=(1,2))
sentc_vec = CountVectorizer(binary=True)
sentt_vec = TfidfVectorizer(tokenizer=LemmaTokenizer(), norm='l2')

## Text processing

### Binary representation

In [None]:
# Tokenize and build the vocabulary
bin_vec.fit(train_text)

# Create the feautre matrices 
train_bin = bin_vec.transform(train_text)
test_bin = bin_vec.transform(test_raw)

### Monograms and Bigrams 

In [None]:
# Tokenize and build the vocabulary
bigram_vec.fit(train_text)

# Create the feautre matrices 
train_bigram = bigram_vec.transform(train_text)
test_bigram = bigram_vec.transform(test_raw)

### Tf-Idf 

In [None]:
# Tokenize and build the vocabulary
tfidf_vec.fit(train_text)

# Create the feautre matrices 
train_tfidf = tfidf_vec.transform(train_text)
test_tfidf = tfidf_vec.transform(test_raw)

### Tf-Idf with only sentiment lexicon, Normalized

In [None]:
# Tokenize and build the vocabulary
sentt_vec.fit(sent_words)

# Create the feautre matrices 
train_s_tfidf = sentt_vec.transform(train_text)
test_s_tfidf = sentt_vec.transform(test_raw)

normalizer_tran = Normalizer().fit(X=train_s_tfidf)
X_train_sn = normalizer_tran.transform(train_s_tfidf)
X_test_sn = normalizer_tran.transform(test_s_tfidf)

### Normalized Tf-Idf

In [None]:
from sklearn.preprocessing import Normalizer, PolynomialFeatures
from sklearn.decomposition import PCA

def process (X_train, X_test, p, var):
    poly = PolynomialFeatures(p)
    pca = PCA(var)
    
    poly.fit(X=X_train)
    X_train = poly.transform(X_train)
    X_test = poly.transform(X_test)
    
    pca.fit(X=X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    
    normalizer_tranformer = Normalizer().fit(X=X_train)
    X_train_normalized = normalizer_tranformer.transform(X_train)
    X_test_normalized = normalizer_tranformer.transform(X_test)
    return (X_train_normalized, X_test_normalized)

(X_train_normalized, X_test_normalized) = process(train_tfidf, test_tfidf, 2, 0.95)

## Bernoulli Naive Bayes model from scratch

In [13]:
#Returns the theta_1 value, and the theta_j,1 and theta_j,0 arrays

class NaiveBayesScratch():        
    def train(self, trainSet, YTrain, bagOfWords): #trainSet is train_bin, bagOfWords is bin_vec, YTrain is Y_train
        self.bag_of_words_ = bagOfWords
        
        theta_j1 = [0] * len(bagOfWords.get_feature_names()) #instantiate array of 0's. One cell per each word in bag of words.
        theta_j0 = [0] * len(bagOfWords.get_feature_names())
        numPosExamples = np.sum(YTrain)
        numNegExamples = len(YTrain) - numPosExamples

        for exampleNum in range(len(YTrain)): # for each review
            currentWordNum = 0 #index to keep track of current word's index

            for wordOccurs in trainSet[exampleNum].toarray().sum(axis=0): #For every word in every example
                if wordOccurs != 0: #if a word occurs, then we add to its count
                    if YTrain[exampleNum] == 1:
                        theta_j1[currentWordNum] += 1
                    if YTrain[exampleNum] == 0:
                        theta_j0[currentWordNum] += 1
                currentWordNum += 1

            if exampleNum %1000 == 0:
                print(exampleNum)
        #Laplace Smoothing    
        theta_j1_np = np.array(theta_j1)
        self.theta_j1_ = (theta_j1_np + 1)/(float(numPosExamples) + 2) #Holds the list of probabilities for positive examples
        theta_j0_np = np.array(theta_j0)
        self.theta_j0_ = (theta_j0_np + 1)/(float(numNegExamples) + 2) #List of probabilities for negative examples
        self.theta_1_ = numPosExamples/float(len(YTrain))
    
    def predict(self, example):
        f1 = math.log(self.theta_1_/(1.0 - self.theta_1_))
        f2 = 0
        for currWordNum, word in enumerate(self.bag_of_words_.vocabulary_.items()): #for every word in the bag of words
            #if word is in example, xj is 1, else = 0
            x_j = example[currWordNum] # in calling fn do: testSet[exampleNum].toarray().sum(axis=0)

            if ((self.theta_j1_[currWordNum] > 0.0) and (self.theta_j0_[currWordNum] > 0)):
                t2 = math.log(self.theta_j1_[currWordNum]/self.theta_j0_[currWordNum]) * x_j
            else:
                t2 = 0
            t3 = (1 - x_j) * math.log((1 - self.theta_j1_[currWordNum])/(1 - self.theta_j0_[currWordNum]))

            t4 = t2 + t3
            f2 = f2 + t4   
        deltaX = f1 + f2
        p = 1/(1-math.exp(-deltaX))
        return p
    
    def evaluate(self, testSet): #testSet is test_bin
        predictions = [] #Holds predicted classes for each example in testSet
        for review in testSet:
            example = review.toarray().sum(axis=0)
            p = self.predict(example)
            predictions.append(p)
        predictions = np.array(predictions)
        predictions = predictions >= 0.5 # binarize
        return predictions

# tj1, tj0, t1 = trainNB(train_bin, bin_vec, Y_train)

## Cross Validation

### To test standard Naive Bayes:

In [None]:
# Define train(), evaluate() functions
import random
import math

def k_cross_validate(X_train, Y_train, bin_vec, k):
    indeces = random.sample(range(X_train.shape[0]),X_train.shape[0])
    step = int(X_train.shape[0]/k)
    scores = []
    for k_fold in range(k):
        k_validate_indeces = indeces[k_fold*step:(k_fold+1)*step]
        k_train_indeces = [i for i in range(X_train.shape[0]) if i not in k_validate_indeces]
        
        b = NaiveBayesScratch()
        b.train(X_train[k_train_indeces], Y_train[k_train_indeces], bin_vec)
        predictions = b.evaluate(X_train[k_validate_indeces])
        score = np.sum(predictions == Y_train[k_validate_indeces])/len(Y_train) # accuracy
        scores.append(score)
    mean = np.array(scores).mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

    
# sample_indeces = random.sample(range(train_s_bin.shape[0]), 500)
k_cross_validate(train_s_bin, Y_train, sentc_vec, 3)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000


### Cross validation function for sklearn models

In [26]:
from sklearn.model_selection import cross_val_score
# Function to cross validate a scikitlearn model
def crossvalidate(model, X_train, fold) :
    scores = cross_val_score(model, X_train, Y_train, cv=fold)
    mean = scores.mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

## Models

### Naive Bayes (to make sure it worked)

In [27]:
from sklearn.naive_bayes import MultinomialNB
bayes_clf = MultinomialNB()
crossvalidate(bayes_clf, X_train_normalized, 4)

Scores: [0.79552 0.79632 0.79984 0.79904]
Scores Mean: 0.79768


### Logistic Regression

In [28]:
from sklearn import linear_model
regr_clf = linear_model.LogisticRegression(solver='lbfgs')
crossvalidate(regr_clf, X_train_normalized, 4)

Scores: [0.86608 0.85936 0.8624  0.86656]
Scores Mean: 0.8635999999999999


### Decision Trees

In [29]:
from sklearn import tree
dec_clf = tree.DecisionTreeClassifier()
crossvalidate(dec_clf, X_train_normalized, 4)

KeyboardInterrupt: 

I wonder if we can play with the parameters more 

### Support Vector Machines

In [None]:
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X_train_normalized, 4)