# Mini Project 2: IMDB Sentiment Analysis

February 22, 2019

Akshal Aniche, Jacob Sanz-Robinson, Raphael Hotter

COMP 551

## Imports

In [8]:
import numpy as np
from sklearn.model_selection import cross_val_score

## Load the data

In [9]:
import os
pos_train_files = os.listdir('data/train/pos/')
neg_train_files = os.listdir('data/train/neg/')
test_files = os.listdir('data/test/')

# Remove .DS_Store files
while '.DS_Store' in pos_train_files:
  pos_train_files.remove('.DS_Store')
while '.DS_Store' in neg_train_files:
  neg_train_files.remove('.DS_Store')
while '.DS_Store' in test_files:
  test_files.remove('.DS_Store')

In [None]:
# Reads data into 2 arrays
train_text, test_raw = [], []

for file in pos_train_files:
  with open('data/train/pos/{}'.format(file), 'r') as f:
    train_text.append(f.read())

for file in neg_train_files:
  with open('data/train/neg/{}'.format(file), 'r') as f:
    train_text.append(f.read())

for file in test_files:
  with open('data/test/{}'.format(file), 'r') as f:
    test_raw.append(f.read())

In [None]:
# Training set Y vector
pos_goal = np.ones((12500))
neg_goal = np.zeros((12500))
Y_train = np.append(pos_goal, neg_goal, axis = 0)

## Data processer preparation

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

bin_vec = CountVectorizer(binary=True)
tfidf_vec = TfidfVectorizer()
bigram_vec = CountVectorizer(ngram_range=(1,2))

## Text processing

### Binary representation

In [None]:
# Tokenize and build the vocabulary
bin_vec.fit(train_text)

# Create the feautre matrices 
train_bin = bin_vec.transform(train_text)
test_bin = bin_vec.transform(test_raw)

### Monograms and Bigrams 

In [None]:
# Tokenize and build the vocabulary
bigram_vec.fit(train_text)

# Create the feautre matrices 
train_bigram = bigram_vec.transform(train_text)
test_bigram = bigram_vec.transform(test_raw)

### Tf-Idf 

In [None]:
# Tokenize and build the vocabulary
tfidf_vec.fit(train_text)

# Create the feautre matrices 
train_tfidf = tfidf_vec.transform(train_text)
test_tfidf = tfidf_vec.transform(test_raw)

### Normalized Tf-Idf

In [None]:
from sklearn.preprocessing import Normalizer
normalizer_tranformer = Normalizer().fit(X=train_tfidf)
X_train_normalized = normalizer_tranformer.transform(train_tfidf)
X_test_normalized = normalizer_tranformer.transform(test_tfidf)

## Bernoulli Naive Bayes model from scratch

In [6]:
numPos = len(pos_train_files)
numNeg = len(neg_train_files)
numTot = len(train_text)
#print(numPos, numNeg, numTot)

#Prior probability calculation
probPos = float(numPos)/numTot
probNeg = float(numNeg)/numTot
#print(probPos, probNeg)

In [7]:
theta_j1 = [0] * len(bin_vec.get_feature_names()) #instantiate array of 0's. One cell per each word in bag of words.
exampleNum = 0
while exampleNum < 500:#12499: # for each review
    currentWordNum = 0 #index to keep track of current word's index
    for wordOccurs in train_bin[exampleNum].toarray().sum(axis=0):
        if wordOccurs != 0: #if a word occurs, then we add to its count in the array
            theta_j1[currentWordNum] = theta_j1[currentWordNum] + 1
        currentWordNum = currentWordNum + 1
    exampleNum = exampleNum + 1
    #if exampleNum%100 == 0:
        #print exampleNum

theta_j1_np = np.array(theta_j1)
theta_j1_final = theta_j1_np/float(numPos)

NotFittedError: CountVectorizer - Vocabulary wasn't fitted.

In [None]:
theta_j0 = [0] * len(bin_vec.get_feature_names()) #same as above for theta_j1
exampleNum = 12500
while exampleNum < 13200:#24999: 
    currentWordNum = 0 
    for wordOccurs in train_bin[exampleNum].toarray().sum(axis=0):
        if wordOccurs != 0: 
            theta_j0[currentWordNum] = theta_j0[currentWordNum] + 1
        currentWordNum = currentWordNum + 1
    exampleNum = exampleNum + 1
    #if exampleNum%100 == 0:
        #print exampleNum

theta_j0_np = np.array(theta_j0)
theta_j0_final = theta_j0_np/float(numNeg)

## Models

### Naive Bayes (to make sure it worked)

In [None]:
from sklearn.naive_bayes import MultinomialNB
bayes_clf = MultinomialNB()

#### Cross Validation

In [None]:
scores = cross_val_score(bayes_clf, X_train_normalized, Y_train, cv=4)
mean = scores.mean()
print("Scores: {}".format(scores))
print("Scores Mean: {}".format(mean))

### Logistic Regression

In [None]:
from sklearn import linear_model
regr_clf = linear_model.LogisticRegression(solver='lbfgs')

#### Cross Validation

In [None]:
scores = cross_val_score(regr_clf, X_train_normalized, Y_train, cv=4)
mean = scores.mean()
print("Scores: {}".format(scores))
print("Scores Mean: {}".format(mean))

### Decision Trees

In [None]:
from sklearn import tree
dec_clf = tree.DecisionTreeClassifier()

In [None]:
scores = cross_val_score(dec_clf, X_train_normalized, Y_train, cv=4)
mean = scores.mean()
print("Scores: {}".format(scores))
print("Scores Mean: {}".format(mean))

I wonder if we can play with the parameters more 

### Support Vector Machines

In [None]:
from sklearn import svm
svm_clf = svm.LinearSVC()

In [None]:
scores = cross_val_score(svm_clf, X_train_normalized, Y_train, cv=4)
mean = scores.mean()
print("Scores: {}".format(scores))
print("Scores Mean: {}".format(mean))

# Cross Validation

To test standard Naive Bayes:

In [None]:
# Define train(), evaluate() functions
def k_cross_validate(X_train, Y_train, k):
    full_matrix = np.concatenate((X_train, Y_train), axis=1)
    np.random.shuffle(full_matrix) # shuffles rows
    X_train = full_matrix[:,:-1]
    Y_train = full_matrix[:,-1]
    groups = []
    num_rows = X_train.shape[0]
    spacing = int(num_rows/k)
    
    # split into groups
    for i in range(k-1):
        X_mini_train = np.concatenate((X_train[i*spacing:(i+1)spacing, :]), axis=0)
              
              
            groups.append((X_train[i*spacing:(i+1)spacing, :], X_train[i*spacing:(i+1)spacing, :]))
    groups.append((X_train[(k-1)*spacing, :], Y_train[(k-1)*spacing, :]))
        