# Mini Project 2: IMDB Sentiment Analysis

February 22, 2019

Akshal Aniche, Raphael Hotter, Jacob Sanz-Robinson

COMP 551

## Imports

In [13]:
import numpy as np

## Load the data

In [14]:
import os
pos_train_files = os.listdir('data/train/pos/')
neg_train_files = os.listdir('data/train/neg/')
test_files = os.listdir('data/test/')

pos_words = 'opinion-lexicon-English/positive-words.txt'
neg_words = 'opinion-lexicon-English/negative-words.txt'


# Remove .DS_Store files
while '.DS_Store' in pos_train_files:
  pos_train_files.remove('.DS_Store')
while '.DS_Store' in neg_train_files:
  neg_train_files.remove('.DS_Store')
while '.DS_Store' in test_files:
  test_files.remove('.DS_Store')

test_files.sort(key=lambda x : int(x[:-4]))

In [15]:
# Reads data into 2 arrays
train_text, test_raw = [], []

for file in pos_train_files:
  with open('data/train/pos/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in neg_train_files:
  with open('data/train/neg/{}'.format(file), 'r') as f:
    train_text.append(f.read().lower())

for file in test_files:
  with open('data/test/{}'.format(file), 'r') as f:
    test_raw.append(f.read().lower())

In [16]:
# Training set Y vector
pos_goal = np.ones((12500))
neg_goal = np.zeros((12500))
Y_train = np.append(pos_goal, neg_goal, axis = 0)

### Read lexicon of sentiment words

In [17]:
sent_words = []

with open(pos_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())

with open(neg_words, 'r', encoding="latin-1") as f:
    for line in f:
        if (line[0] != ';'):
            sent_words.append(line.strip())
            
# print(sent_words)

## Data processer preparation

### Lemmatization

In [18]:
from nltk import word_tokenize, sent_tokenize, pos_tag     
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

def get_pos(word):
    #Map POS tag to first character lemmatize() accepts
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    return tag_dict.get(tag, wordnet.NOUN)

#Lemmatizer integrated with the tokenizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        
    def __call__(self, text):
        sent = [s for s in sent_tokenize(text)]
        words = []
        for s in sent:
            words = words + word_tokenize(s)
        return [self.wnl.lemmatize(t, get_pos(t)) for t in words]

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
bin_vec = CountVectorizer(binary=True)
tfidf_vec = TfidfVectorizer()
bigram_vec = TfidfVectorizer(ngram_range=(1,2))
sentc_vec = CountVectorizer(binary=True)
sentt_vec = TfidfVectorizer()

## Text processing

### Binary representation

In [20]:
# Tokenize and build the vocabulary
bin_vec.fit(train_text)

# Create the feautre matrices 
train_bin = bin_vec.transform(train_text)
test_bin = bin_vec.transform(test_raw)

### TF-IDF

In [21]:
# Tokenize and build the vocabulary
tfidf_vec.fit(train_text)

# Create the feature matrices 
train_tfidf = tfidf_vec.fit_transform(train_text)
test_tfidf = tfidf_vec.transform(test_raw)

In [22]:
# Tokenize and build the vocabulary
bigram_vec.fit(train_text)

# Create the feature matrices 
train_bigram = bigram_vec.fit_transform(train_text)
test_bigram = bigram_vec.transform(test_raw)

### Tf-Idf with only sentiment lexicon

In [23]:
# Tokenize and build the vocabulary
sentt_vec.fit(sent_words)

# Create the feautre matrices 
train_s_tfidf = sentt_vec.transform(train_text)
test_s_tfidf = sentt_vec.transform(test_raw)

# normalizer_tran = Normalizer().fit(X=train_s_tfidf)
# X_train_sn = normalizer_tran.transform(train_s_tfidf)
# X_test_sn = normalizer_tran.transform(test_s_tfidf)

KeyboardInterrupt: 

### Combining Features

In [None]:
from sklearn.pipeline import FeatureUnion

tfidf_vec = TfidfVectorizer()
bigram_vec = CountVectorizer(ngram_range=(1,2))

combined_features = FeatureUnion([("bigram_vec", bigram_vec), ("tfidf", tfidf_vec)])
X = combined_features.fit(train_text).transform(train_text)

### Normalized Tf-Idf

In [None]:
from sklearn.preprocessing import Normalizer, PolynomialFeatures
from sklearn.decomposition import TruncatedSVD

def process (X_train, X_test, p, var):
    poly = PolynomialFeatures(p)
    pca = TruncatedSVD(n_components = 10000)
    normalizer_tranformer = Normalizer().fit(X_train)
    X_train = normalizer_tranformer.transform(X_train)
    X_test= normalizer_tranformer.transform(X_test)


    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    '''
    poly.fit(X=X_train)
    X_train = poly.transform(X_train)
    X_test = poly.transform(X_test)
    
    pca.fit(X=X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    '''
    return (X_train, X_test)

In [None]:
print(test_bigram.shape)

In [None]:
pca = TruncatedSVD(n_components = 500)
pca.fit(train_tfidf)
X_train = pca.transform(train_tfidf)


In [None]:
pca = TruncatedSVD(n_components = 100)
pca.fit(train_bigram)
X_train_2 = pca.transform(train_bigram)


In [None]:
poly = PolynomialFeatures(2)
poly.fit(X_train)
X_poly = poly.transform(X_train)

In [None]:
poly = PolynomialFeatures(2)
poly.fit(X_train_2)
X_poly_2 = poly.transform(X_train_2)

In [None]:
from sklearn.preprocessing import Normalizer
normalizer_tranformer = Normalizer().fit(X=train_tfidf)
X_train_normalized = normalizer_tranformer.transform(train_tfidf)
X_test_normalized = normalizer_tranformer.transform(test_tfidf)

In [None]:
from sklearn.preprocessing import Normalizer
normalizer_tranformer = Normalizer().fit(X=train_bigram)
X_train_normalized_2 = normalizer_tranformer.transform(train_bigram)
X_test_normalized = normalizer_tranformer.transform(test_tfidf)

## Bernoulli Naive Bayes model from scratch

In [None]:
import math

class NaiveBayesScratch():
    """Bernouli Naive Bayes"""
    def train(self, X_train, Y_train): #bow = bag of words
#         X_train = X_train.toarray()
        Y_train = Y_train.reshape(len(Y_train),1)
        num_pos = np.sum(Y_train)
        num_neg = len(Y_train) - num_pos
        
        self.theta_1_ = num_pos/float(len(Y_train))
        self.theta_j1_ = (X_train*Y_train).sum(axis=0) # sum along training examples
        self.theta_j0_ = (X_train*(1-Y_train)).sum(axis=0)
        
        # Laplace smoothing
        self.theta_j1_ = (self.theta_j1_ + 1)/(float(num_pos) + 2)
        self.theta_j0_ = (self.theta_j0_ + 1)/(float(num_neg) + 2)
        
        # Prepare for predictions
        self.predict_theta_1_ = math.log(self.theta_1_/(1-self.theta_1_))

        self.predict_pos_ = np.log(self.theta_j1_/self.theta_j0_)
        self.predict_pos_ = self.predict_pos_.reshape(len(self.predict_pos_),1)
        
        self.predict_neg_ = np.log((1-self.theta_j1_)/(1-self.theta_j0_))
        self.predict_neg_ = self.predict_neg_.reshape(len(self.predict_neg_),1)
    
    def predict(self, X):
#         X = X.toarray()
        pos = np.dot(X, self.predict_pos_)
        neg = np.dot(1-X, self.predict_neg_)
        delta_predictions = self.predict_theta_1_ + pos + neg
        binary_predictions = delta_predictions > 0
        return binary_predictions

In [None]:
naive_bayes = NaiveBayesScratch()
naive_bayes.train(train_bin, Y_train)

In [None]:
mini_X_test = train_bin
mini_Y_test = Y_train
mini_Y_test = mini_Y_test.reshape(mini_Y_test.shape[0],1)

predictions = naive_bayes.predict(mini_X_test)
# print(predictions==mini_Y_test)
score1 = np.sum(predictions==mini_Y_test, axis=0)/float(mini_Y_test.shape[0])
print(score1)

## Cross Validation

### To test standard Naive Bayes:

In [None]:
# Define train(), evaluate() functions
import random
import math

def k_cross_validate(X_train, Y_train, k):
    """
    X_train: n x m array
    Y_train: n x 1 array
    k: number of folds
    """
    indeces = random.sample(range(Y_train.shape[0]), Y_train.shape[0])
    step = int(X_train.shape[0]/k)
    scores = []
    for k_fold in range(k):
        k_validate_indeces = indeces[k_fold*step:(k_fold+1)*step]
        k_train_indeces = [i for i in range(X_train.shape[0]) if i not in k_validate_indeces]
        
        b = NaiveBayesScratch()
        b.train(X_train[k_train_indeces,:], Y_train[k_train_indeces,:])
        predictions = b.predict(X_train[k_validate_indeces,:])
        score = np.sum(predictions==Y_train[k_validate_indeces,:]) / float(Y_train[k_validate_indeces,:].shape[0])
        scores.append(score)
    mean = np.array(scores).mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

    
formatted_Y_train = Y_train.reshape(Y_train.shape[0], 1)
k_cross_validate(train_bin.toarray(), formatted_Y_train, 3)

### Cross validation function for sklearn models

In [None]:
from sklearn.model_selection import cross_val_score
# Function to cross validate a scikitlearn model
def crossvalidate(model, X_train, Y_train, fold) :
    scores = cross_val_score(model, X_train, Y_train, cv=fold)
    mean = scores.mean()
    print("Scores: {}".format(scores))
    print("Scores Mean: {}".format(mean))

## Models

### Naive Bayes (to make sure it worked)

In [None]:
from sklearn.naive_bayes import BernoulliNB
bayes_clf = BernoulliNB()
crossvalidate(bayes_clf, train_bin, Y_train, 3)

In [None]:
from sklearn.naive_bayes import MultinomialNB
bayes_clf = MultinomialNB()
crossvalidate(bayes_clf, train_tfidf, Y_train, 3)

### Logistic Regression

In [None]:
from sklearn import linear_model
regr_clf = linear_model.LogisticRegression(solver = 'lbfgs')
crossvalidate(regr_clf, train_tfidf, Y_train, 3)

In [None]:
from sklearn import linear_model
regr_clf = linear_model.LogisticRegression()
crossvalidate(regr_clf, train_bigram, Y_train, 3)

### Decision Trees

In [None]:
from sklearn import tree
dec_clf = tree.DecisionTreeClassifier()
crossvalidate(dec_clf, train_s_tfidf, Y_train, 3)

### Support Vector Machines

In [None]:
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, train_bigram_nm, Y_train, 3)

### Random and Extra Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier()
crossvalidate(clf2, train_bigram, Y_train, 3)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier()
crossvalidate(clf, train_bigram, Y_train, 3)

## Multiple Features

### Logistic Regression

In [None]:
from sklearn import linear_model
regr_clf = linear_model.LogisticRegression()
crossvalidate(regr_clf, X, Y_train, 3)

### Support Vector Machines

In [None]:
#Simple TFIDF features 
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, train_tfidf, Y_train, 3)

In [None]:
#TFIDF PCA to 100
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X_train, Y_train, 3)

In [None]:
#TFIDF + PCA + NORMALIZED
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X_train_normalized, Y_train, 3)

In [None]:
#TFIDF + PCA + POLYNOMIAL DEGREE 2
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X_poly, Y_train, 3)

In [None]:
#Simple TFIDF features including 2-grams
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, train_bigram, Y_train, 3)

In [None]:
#TFIDF PCA to 100 bigrams
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X_train_2, Y_train, 3)

In [None]:
#TFIDF + PCA + NORMALIZED bigrams
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X_train_normalized_2, Y_train, 3)

In [None]:
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X, Y_train, 3)

In [None]:
#TFIDF + PCA + POLYNOMIAL DEGREE 2 bigrams
from sklearn import svm
svm_clf = svm.LinearSVC()
crossvalidate(svm_clf, X_poly_2, Y_train, 3)

## Ensemble Methods — Take 2

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression()
clf2 = MultinomialNB()
clf3 = LinearSVC()

eclf = VotingClassifier(estimators=[('lr', clf1), ('nb', clf2), ('svm', clf3)], voting='hard')
crossvalidate(eclf, p, Y_train, 3)

## Ensemble Stacking Method - Failure

In [None]:
import random
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model

# test_tfidf = tfidf_vec.transform(test_raw)
tfidf_vec = TfidfVectorizer()

validate_indeces = random.sample(range(Y_train.shape[0]), 1000)
train_indeces = [i for i in range(len(train_text)) if i not in validate_indeces]

train_text1 = [train_text[i] for i in train_indeces]
validate_text1 = [train_text[i] for i in validate_indeces]

tfidf_vec.fit(train_text1)
train_tfidf = tfidf_vec.transform(train_text1)
validate_tfidf = tfidf_vec.transform(validate_text1)

Y_train1 = Y_train[train_indeces]

svm_clf = svm.LinearSVC()
regr_clf = linear_model.LogisticRegression()
bayes_clf = MultinomialNB()

svm_clf.fit(train_tfidf, Y_train1)
bayes_clf.fit(train_tfidf, Y_train1)
y1 = svm_clf.predict(train_tfidf)
y2 = bayes_clf.predict_proba(train_tfidf)

y1_validate = svm_clf.predict(validate_tfidf)
y2_validate = bayes_clf.predict_proba(validate_tfidf)

In [None]:
y1 = np.array(y1).reshape(len(y1),1)
y2 = np.array(y2[:,1]).reshape(len(y2),1)
y = np.concatenate((y1,y2,y1*y2),axis=1)

y1_validate = np.array(y1_validate)
y2_validate = np.array(y2_validate)[:,1]
y1_validate = y1_validate.reshape(y1_validate.shape[0],1)
y2_validate = y2_validate.reshape(y2_validate.shape[0],1)

# y1_reshaped = np.array(y1_validate).reshape(len(y1_validate),1)
# y2_reshaped = np.array(y2_validate).reshape(len(y2_validate),1)

y_validate = np.concatenate((y1_validate,y2_validate),axis=1)

In [None]:
print(y)

In [None]:
regr_clf = linear_model.LogisticRegression()
regr_clf.fit(y, Y_train1)

In [None]:
np.array(regr_clf.predict(y_validate)) - Y_train[validate_indeces]

In [None]:
np.sum((np.array(regr_clf.predict(y_validate))-Y_train[validate_indeces])==0)/np.shape(Y_train[validate_indeces])[0]

# Test set prediction

In [26]:
def predict(model, X_test):
    Y_test = model.predict(X_test)
    return Y_test

def write_prediction(model, X_test):
    Y_test = (model.predict(X_test)).tolist()
    with open("submission.csv", "w+") as f:
        f.write("Id,Category\n")
        for (i in range (25000)):
            s = "" + str(i) + ',' + str(Y_test[i]) + "\n"
            f.write(s)
        


SyntaxError: invalid syntax (<ipython-input-26-bd1a49a435b5>, line 9)