# Mini Project 2: IMDB Sentiment Analysis

February 22, 2019

Akshal Aniche, Jacob Sanz-Robinson, Raphael Hotter

COMP 551

## Imports

In [56]:
import numpy as np

## Load the data

In [57]:
import os
pos_train_files = os.listdir('data/train/pos/')
neg_train_files = os.listdir('data/train/neg/')
test_files = os.listdir('data/test/')

pos_words = 'opinion-lexicon-English/positive-words.txt'
neg_words = 'opinion-lexicon-English/negative-words.txt'


# Remove .DS_Store files
while '.DS_Store' in pos_train_files:
  pos_train_files.remove('.DS_Store')
while '.DS_Store' in neg_train_files:
  neg_train_files.remove('.DS_Store')
while '.DS_Store' in test_files:
  test_files.remove('.DS_Store')

test_files.sort(key=lambda x : int(x[:-4]))

In [58]:
# Reads data into 2 arrays
train_text, test_raw = [], []

for file in pos_train_files:
  with open('data/train/pos/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in neg_train_files:
  with open('data/train/neg/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in test_files:
  with open('data/test/{}'.format(file), 'r') as f:
    test_raw.append(f.read().lower())

In [59]:
# Training set Y vector
pos_goal = np.ones((12500))
neg_goal = np.zeros((12500))
Y_train = np.append(pos_goal, neg_goal, axis = 0)

### Read lexicon of sentiment words

In [60]:
sent_words = []

with open(pos_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())

with open(neg_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())
            
# print(sent_words)

## Data processer preparation

In [61]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Normalizer

bin_vec = CountVectorizer(binary=True)
tfidf_vec = TfidfVectorizer()
bigram_vec = CountVectorizer(ngram_range=(1,1))
sentc_vec = CountVectorizer(binary=True)
sentt_vec = TfidfVectorizer()

### Lemmatization

In [36]:
from nltk import word_tokenize, sent_tokenize        
from nltk.stem import WordNetLemmatizer 

#Lemmatizer integrated with the tokenizer 
wnl = WordNetLemmatizer()
class LemmaTokenizer(object):
    def __call__(self, text):
        return [wnl.lemmatize(t) for t in word_tokenize(sent_tokenize(text))]

sentt_vec = TfidfVectorizer(tokenizer=LemmaTokenizer())
tokenizer=LemmaTokenizer()

## Text processing

### Binary representation

In [37]:
# Tokenize and build the vocabulary
bin_vec.fit(train_text)

# Create the feautre matrices 
train_bin = bin_vec.transform(train_text)
test_bin = bin_vec.transform(test_raw)

### Monograms and Bigrams 

In [62]:
# Tokenize and build the vocabulary
bigram_vec.fit(train_text)

# Create the feautre matrices 
train_bigram = bigram_vec.transform(train_text)
test_bigram = bigram_vec.transform(test_raw)

### Tf-Idf 

In [39]:
# Tokenize and build the vocabulary
tfidf_vec.fit(train_text)

# Create the feautre matrices 
train_tfidf = tfidf_vec.fit_transform(train_text)
test_tfidf = tfidf_vec.transform(test_raw)

### Tf-Idf with only sentiment lexicon, Normalized

In [42]:
# Tokenize and build the vocabulary
sentt_vec.fit(sent_words)

# Create the feautre matrices 
train_s_tfidf = sentt_vec.transform(train_text)
test_s_tfidf = sentt_vec.transform(test_raw)

normalizer_tran = Normalizer().fit(X=train_s_tfidf)
X_train_sn = normalizer_tran.transform(train_s_tfidf)
X_test_sn = normalizer_tran.transform(test_s_tfidf)

### Normalized Tf-Idf

In [None]:
from sklearn.preprocessing import Normalizer, PolynomialFeatures
from sklearn.decomposition import TruncatedSVD

def process (X_train, X_test, p, var):
    poly = PolynomialFeatures(p)
    pca = TruncatedSVD(n_components = 10000)
    normalizer_tranformer = Normalizer().fit(X_train)
    X_train = normalizer_tranformer.transform(X_train)
    X_test= normalizer_tranformer.transform(X_test)


    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    '''
    poly.fit(X=X_train)
    X_train = poly.transform(X_train)
    X_test = poly.transform(X_test)
    
    pca.fit(X=X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    '''
    return (X_train, X_test)

In [None]:
(X_train_normalized, X_test_normalized) = process(train_tfidf, test_tfidf, 2, 0.95)
print (X_train_normalized.shape)

In [12]:
from sklearn.preprocessing import Normalizer
normalizer_tranformer = Normalizer().fit(X=train_tfidf)
X_train_normalized = normalizer_tranformer.transform(train_tfidf)
X_test_normalized = normalizer_tranformer.transform(test_tfidf)

## Bernoulli Naive Bayes model from scratch

In [23]:
import math

class NaiveBayesScratch():
    """Bernouli Naive Bayes"""
    def train(self, X_train, Y_train): #bow = bag of words
#         X_train = X_train.toarray()
        Y_train = Y_train.reshape(len(Y_train),1)
        num_pos = np.sum(Y_train)
        num_neg = len(Y_train) - num_pos
        
        self.theta_1_ = num_pos/float(len(Y_train))
        self.theta_j1_ = (X_train*Y_train).sum(axis=0) # sum along training examples
        self.theta_j0_ = (X_train*(1-Y_train)).sum(axis=0)
        
        # Laplace smoothing
        self.theta_j1_ = (self.theta_j1_ + 1)/(float(num_pos) + 2)
        self.theta_j0_ = (self.theta_j0_ + 1)/(float(num_neg) + 2)
        
        # Prepare for predictions
        self.predict_theta_1_ = math.log(self.theta_1_/(1-self.theta_1_))

        self.predict_pos_ = np.log(self.theta_j1_/self.theta_j0_)
        self.predict_pos_ = self.predict_pos_.reshape(len(self.predict_pos_),1)
        
        self.predict_neg_ = np.log((1-self.theta_j1_)/(1-self.theta_j0_))
        self.predict_neg_ = self.predict_neg_.reshape(len(self.predict_neg_),1)
    
    def predict(self, X):
#         X = X.toarray()
        pos = np.dot(X, self.predict_pos_)
        neg = np.dot(1-X, self.predict_neg_)
        delta_predictions = self.predict_theta_1_ + pos + neg
        binary_predictions = delta_predictions > 0
        return binary_predictions

In [15]:
naive_bayes = NaiveBayesScratch()
naive_bayes.train(train_bin, Y_train)

KeyboardInterrupt: 

In [None]:
mini_X_test = train_bin
mini_Y_test = Y_train
mini_Y_test = mini_Y_test.reshape(mini_Y_test.shape[0],1)

predictions = naive_bayes.predict(mini_X_test)
# print(predictions==mini_Y_test)
score1 = np.sum(predictions==mini_Y_test, axis=0)/float(mini_Y_test.shape[0])
print(score1)

## Cross Validation

### To test standard Naive Bayes:

In [26]:
# Define train(), evaluate() functions
import random
import math

def k_cross_validate(X_train, Y_train, k):
    """
    X_train: n x m array
    Y_train: n x 1 array
    k: number of folds
    """
    indeces = random.sample(range(Y_train.shape[0]), Y_train.shape[0])
    step = int(X_train.shape[0]/k)
    scores = []
    for k_fold in range(k):
        k_validate_indeces = indeces[k_fold*step:(k_fold+1)*step]
        k_train_indeces = [i for i in range(X_train.shape[0]) if i not in k_validate_indeces]
        
        b = NaiveBayesScratch()
        b.train(X_train[k_train_indeces,:], Y_train[k_train_indeces,:])
        predictions = b.predict(X_train[k_validate_indeces,:])
        score = np.sum(predictions==Y_train[k_validate_indeces,:]) / float(Y_train[k_validate_indeces,:].shape[0])
        scores.append(score)
    mean = np.array(scores).mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

    
formatted_Y_train = Y_train.reshape(Y_train.shape[0], 1)
k_cross_validate(train_bin.toarray(), formatted_Y_train, 3)

Scores: [0.858634345373815, 0.842913716548662, 0.8471138845553822]
Scores Mean: 0.8495539821592865


### Cross validation function for sklearn models

In [44]:
from sklearn.model_selection import cross_val_score
# Function to cross validate a scikitlearn model
def crossvalidate(model, X_train, Y_train, fold) :
    scores = cross_val_score(model, X_train, Y_train, cv=fold)
    mean = scores.mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

## Models

### Naive Bayes (to make sure it worked)

In [69]:
from sklearn.naive_bayes import BernoulliNB
bayes_clf = BernoulliNB()
crossvalidate(bayes_clf, train_bin, Y_train, 3)

Scores: [0.85493161 0.84581234 0.8524964 ]
Scores Mean: 0.8510801133028897


In [70]:
from sklearn.naive_bayes import MultinomialNB
bayes_clf = MultinomialNB()
crossvalidate(bayes_clf, train_tfidf, Y_train, 3)

Scores: [0.86741061 0.85805136 0.85633701]
Scores Mean: 0.8605996589883947


### Logistic Regression

In [66]:
from sklearn import linear_model
regr_clf = linear_model.LogisticRegression()
crossvalidate(regr_clf, train_tfidf, Y_train, 3)



Scores: [0.88840893 0.88348932 0.8799808 ]
Scores Mean: 0.8839596816892191


### Decision Trees

In [67]:
from sklearn import tree
dec_clf = tree.DecisionTreeClassifier()
crossvalidate(dec_clf, train_tfidf, Y_train, 3)

Scores: [0.70410367 0.70074394 0.70199232]
Scores Mean: 0.7022799769873428


I wonder if we can play with the parameters more 

### Support Vector Machines

In [68]:
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, train_tfidf, Y_train, 3)

Scores: [0.89464843 0.89188865 0.88886222]
Scores Mean: 0.8917997649962367


### Ensemble Stacking Method

In [104]:
# test_tfidf = tfidf_vec.transform(test_raw)

validate_indeces = random.sample(range(Y_train.shape[0]), 1000)
train_indeces = [i for i in range(len(train_text)) if i not in validate_indeces]

train_text1 = [train_text[i] for i in train_indeces]
validate_text1 = [train_text[i] for i in validate_indeces]

tfidf_vec.fit(train_text1)
train_tfidf = tfidf_vec.fit_transform(train_text1)
validate_tfidf = tfidf_vec.fit_transform(validate_text1)

Y_train1 = Y_train[train_indeces]

svm_clf = svm.LinearSVC()
regr_clf = linear_model.LogisticRegression()
bayes_clf = MultinomialNB()

svm_clf.fit(train_tfidf, Y_train_rand)
bayes_clf.fit(train_tfidf, Y_train_rand)
y1 = svm_clf.predict(train_tfidf)
y2 = bayes_clf.predict(train_tfidf)

y1_validate = svm_clf.predict(validate_tfidf)
y2_validate = bayes_clf.predict(validate_tfidf)

ValueError: X has 18029 features per sample; expecting 73626

In [None]:
y = np.concatenate((np.array(y1).reshape(len(y1),1),np.array(y2).reshape(len(y2),1)),axis=1)
y_validate = np.concatenate((np.array(y1_validate).reshape(len(y1_validate),1),np.array(y2_validate).reshape(len(y2_validate),1)),axis=1)

In [95]:
print(y.shape)

(24000, 2)


In [96]:
regr_clf = linear_model.LogisticRegression()
regr_clf.fit(y,Y_train_rand)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [98]:
np.sum(regr_clf.predict(y_validate)==Y_train[validate_indeces])/np.shape(Y_train)[0]

ValueError: X has 17568 features per sample; expecting 2