# Mini Project 2: IMDB Sentiment Analysis

February 22, 2019

Akshal Aniche, Jacob Sanz-Robinson, Raphael Hotter

COMP 551

## Imports

In [1]:
import numpy as np

## Load the data

In [2]:
import os
pos_train_files = os.listdir('data/train/pos/')
neg_train_files = os.listdir('data/train/neg/')
test_files = os.listdir('data/test/')

pos_words = 'opinion-lexicon-English/positive-words.txt'
neg_words = 'opinion-lexicon-English/negative-words.txt'


# Remove .DS_Store files
while '.DS_Store' in pos_train_files:
  pos_train_files.remove('.DS_Store')
while '.DS_Store' in neg_train_files:
  neg_train_files.remove('.DS_Store')
while '.DS_Store' in test_files:
  test_files.remove('.DS_Store')

test_files.sort(key=lambda x : int(x[:-4]))

In [3]:
# Reads data into 2 arrays
train_text, test_raw = [], []

for file in pos_train_files:
  with open('data/train/pos/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in neg_train_files:
  with open('data/train/neg/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in test_files:
  with open('data/test/{}'.format(file), 'r') as f:
    test_raw.append(f.read().lower())

In [4]:
# Training set Y vector
pos_goal = np.ones((12500))
neg_goal = np.zeros((12500))
Y_train = np.append(pos_goal, neg_goal, axis = 0)

### Read lexicon of sentiment words

In [5]:
sent_words = []

with open(pos_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())

with open(neg_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())
            
# print(sent_words)

## Data processer preparation

In [35]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Normalizer

#Lemmatizer integrated with the tokenizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        
    def __call__(self, text):
        return [self.wnl.lemmatize(t) for t in word_tokenize(text)]

In [36]:
from nltk import word_tokenize, sent_tokenize        
from nltk.stem import WordNetLemmatizer 

bin_vec = CountVectorizer(binary=True)
tfidf_vec = TfidfVectorizer(tokenizer = LemmaTokenizer())
bigram_vec = CountVectorizer(ngram_range=(1,2))
sentc_vec = CountVectorizer(binary=True)
sentt_vec = TfidfVectorizer(tokenizer=LemmaTokenizer())

## Text processing

### Binary representation

In [19]:
# Tokenize and build the vocabulary
bin_vec.fit(train_text)

# Create the feautre matrices 
train_bin = bin_vec.transform(train_text)
test_bin = bin_vec.transform(test_raw)

### Monograms and Bigrams 

In [None]:
# Tokenize and build the vocabulary
bigram_vec.fit(train_text)

# Create the feautre matrices 
train_bigram = bigram_vec.transform(train_text)
test_bigram = bigram_vec.transform(test_raw)

### Tf-Idf 

In [None]:
# Tokenize and build the vocabulary
tfidf_vec.fit(train_text)

# Create the feautre matrices 
train_tfidf = tfidf_vec.fit_transform(train_text)
test_tfidf = tfidf_vec.transform(test_raw)

### Tf-Idf with only sentiment lexicon, Normalized

In [20]:
# Tokenize and build the vocabulary
sentt_vec.fit(sent_words)

# Create the feautre matrices 
train_s_tfidf = sentt_vec.transform(train_text)
test_s_tfidf = sentt_vec.transform(test_raw)

normalizer_tran = Normalizer().fit(X=train_s_tfidf)
X_train_sn = normalizer_tran.transform(train_s_tfidf)
X_test_sn = normalizer_tran.transform(test_s_tfidf)

TypeError: unhashable type: 'list'

### Normalized Tf-Idf

In [24]:
from sklearn.preprocessing import Normalizer, PolynomialFeatures
from sklearn.decomposition import TruncatedSVD

def process (X_train, X_test, p, var):
    poly = PolynomialFeatures(p)
    pca = TruncatedSVD(n_components = 10000)
    normalizer_tranformer = Normalizer().fit(X_train)
    X_train = normalizer_tranformer.transform(X_train)
    X_test= normalizer_tranformer.transform(X_test)


    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    '''
    poly.fit(X=X_train)
    X_train = poly.transform(X_train)
    X_test = poly.transform(X_test)
    
    pca.fit(X=X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    '''
    return (X_train, X_test)

In [27]:
(X_train_normalized, X_test_normalized) = process(train_tfidf, test_tfidf, 2, 0.95)
print (X_train_normalized.shape)

KeyboardInterrupt: 

In [None]:
from sklearn.preprocessing import Normalizer
normalizer_tranformer = Normalizer().fit(X=train_tfidf)
X_train_normalized = normalizer_tranformer.transform(train_tfidf)
X_test_normalized = normalizer_tranformer.transform(test_tfidf)

## Bernoulli Naive Bayes model from scratch

In [21]:
import math

class NaiveBayesScratch():
    """Bernouli Naive Bayes"""
    def train(self, X_train, Y_train): #bow = bag of words
#         X_train = X_train.toarray()
        Y_train = Y_train.reshape(len(Y_train),1)
        num_pos = np.sum(Y_train)
        num_neg = len(Y_train) - num_pos
        
        self.theta_1_ = num_pos/float(len(Y_train))
        self.theta_j1_ = (X_train*Y_train).sum(axis=0) # sum along training examples
        self.theta_j0_ = (X_train*(1-Y_train)).sum(axis=0)
        
        # Laplace smoothing
        self.theta_j1_ = (self.theta_j1_ + 1)/(float(num_pos) + 2)
        self.theta_j0_ = (self.theta_j0_ + 1)/(float(num_neg) + 2)
        
        # Prepare for predictions
        self.predict_theta_1_ = math.log(self.theta_1_/(1-self.theta_1_))

        self.predict_pos_ = np.log(self.theta_j1_/self.theta_j0_)
        self.predict_pos_ = self.predict_pos_.reshape(len(self.predict_pos_),1)
        
        self.predict_neg_ = np.log((1-self.theta_j1_)/(1-self.theta_j0_))
        self.predict_neg_ = self.predict_neg_.reshape(len(self.predict_neg_),1)
    
    def predict(self, X):
#         X = X.toarray()
        pos = np.dot(X, self.predict_pos_)
        neg = np.dot(1-X, self.predict_neg_)
        delta_predictions = self.predict_theta_1_ + pos + neg
        binary_predictions = delta_predictions > 0
        return binary_predictions

In [22]:
naive_bayes = NaiveBayesScratch()
naive_bayes.train(train_bin, Y_train)

KeyboardInterrupt: 

In [23]:
mini_X_test = train_bin
mini_Y_test = Y_train
mini_Y_test = mini_Y_test.reshape(mini_Y_test.shape[0],1)

predictions = naive_bayes.predict(mini_X_test)
# print(predictions==mini_Y_test)
score1 = np.sum(predictions==mini_Y_test, axis=0)/float(mini_Y_test.shape[0])
print(score1)

[0.90496]


## Cross Validation

### To test standard Naive Bayes:

In [28]:
# Define train(), evaluate() functions
import random
import math

def k_cross_validate(X_train, Y_train, k):
    """
    X_train: n x m array
    Y_train: n x 1 array
    k: number of folds
    """
    indeces = random.sample(range(Y_train.shape[0]), Y_train.shape[0])
    step = int(X_train.shape[0]/k)
    scores = []
    for k_fold in range(k):
        k_validate_indeces = indeces[k_fold*step:(k_fold+1)*step]
        k_train_indeces = [i for i in range(X_train.shape[0]) if i not in k_validate_indeces]
        
        b = NaiveBayesScratch()
        b.train(X_train[k_train_indeces,:], Y_train[k_train_indeces,:])
        predictions = b.predict(X_train[k_validate_indeces,:])
        score = np.sum(predictions==Y_train[k_validate_indeces,:]) / float(Y_train[k_validate_indeces,:].shape[0])
        scores.append(score)
    mean = np.array(scores).mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

    
formatted_Y_train = Y_train.reshape(Y_train.shape[0], 1)
k_cross_validate(train_bin.toarray(), formatted_Y_train, 3)

Scores: [4171.192967718709, 4166.631705268211, 4164.270850834034]
Scores Mean: 4167.3651746069845


### Cross validation function for sklearn models

In [None]:
from sklearn.model_selection import cross_val_score
# Function to cross validate a scikitlearn model
def crossvalidate(model, X_train, fold) :
    scores = cross_val_score(model, X_train, Y_train, cv=fold)
    mean = scores.mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

## Models

### Naive Bayes (to make sure it worked)

In [None]:
from sklearn.naive_bayes import BernoulliNB
bayes_clf = BernoulliNB()
crossvalidate(bayes_clf, train_bin, Y_train, 4)

### Logistic Regression

In [None]:
from sklearn import linear_model
regr_clf = linear_model.LogisticRegression(solver='lbfgs')
crossvalidate(regr_clf, X_train_normalized, Y_train, 4)

### Decision Trees

In [None]:
from sklearn import tree
dec_clf = tree.DecisionTreeClassifier()
crossvalidate(dec_clf, X_train_normalized, Y_train, 4)

I wonder if we can play with the parameters more 

### Support Vector Machines

In [None]:
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X_train_normalized, 4)