In [1]:
import pandas as pd 
import numpy as np
import math
from sklearn import feature_extraction
from sklearn import svm
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [2]:
def preprocess(review):
    processedReview = ""
    words = review.split(' ')
    for word in words:
        processedWord = ""
        word = word.lower()
        for i in range(len(word)):
            if not (ord(word[i]) < 97 or ord(word[i]) > 122):
                processedWord += word[i]
        processedReview += processedWord + " "
    
    return processedReview[:len(processedReview)-1]

In [3]:
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
def load_file(filename):
    file = open(filename, 'r')
    X = []
    y = []
    for line in file:
        split = line.split('\t')
        label = split[1]
        words = tokenizer.tokenize(split[0])
        for i in range(len(words)):
            words[i] = lemmatizer.lemmatize(words[i])
        review = " ".join(words)
        X.append(review)
        y.append(label)
    return X,y

In [4]:
X_train, y_train = load_file("IMDB-train.txt")
X_dev, y_dev = load_file("IMDB-valid.txt")
X_test, y_test = load_file("IMDB-test.txt")

print(X_train[0])
print(y_train[0])

For a movie that get no respect there sure are a lot of memorable quote listed for this gem Imagine a movie where Joe Piscopo is actually funny Maureen Stapleton is a scene stealer The Moroni character is an absolute scream Watch for Alan The Skipper Hale jr a a police Sgt
1



In [5]:
vectorizer = feature_extraction.text.CountVectorizer(ngram_range = (1,1), binary=True)
vectorizer_freq = TfidfVectorizer(ngram_range = (1,2))
vectorizer.fit(X_train + X_dev + X_test)
vectorizer_freq.fit(X_train + X_dev + X_test)
#bag of words vectors
X_train_bow = vectorizer.transform(X_train)
X_dev_bow = vectorizer.transform(X_dev)
X_test_bow = vectorizer.transform(X_test)

# # frequency bag of words vectors
# X_train_fbow = (vectorizer_freq.transform(X_train)).toarray()
# X_dev_fbow = (vectorizer_freq.transform(X_dev)).toarray()
# X_test_fbow = (vectorizer_freq.transform(X_test)).toarray()

In [6]:
# bag of words
alpha_values = np.linspace(1e-4, 1.1, 2000)
bestAlpha = 0
bestValidError = 0
X_final = vectorizer.transform(X_train + X_dev)
y_final = y_train + y_dev

for alpha in alpha_values:
    clf = BernoulliNB(alpha=alpha)
    clf.fit(X_train_bow, y_train)
    y_pred = clf.predict(X_dev_bow)
    score = metrics.accuracy_score(y_dev, y_pred)
    print(score)
    if score>bestValidError :
        bestValidError = score
        bestAlpha = alpha

clf = BernoulliNB(alpha=bestAlpha)
clf.fit(X_final, y_final)
y_pred = clf.predict(X_test_bow)
score = metrics.accuracy_score(y_test, y_pred)
print("best results for:")
print(bestAlpha)
print(score)

0.8544
0.8587
0.8613
0.8626
0.8626
0.8626
0.8629
0.8632
0.8636
0.864
0.8645
0.8648
0.8648
0.8647
0.8651
0.8653
0.8654
0.8655
0.8657
0.8657
0.8663
0.8666
0.8667
0.8665
0.8665
0.8669
0.8673
0.8672
0.8676
0.8674
0.8678
0.8681
0.8682
0.8683
0.8684
0.8687
0.8689
0.8689
0.8694
0.8692
0.8692
0.8693
0.8696
0.8696
0.8696
0.8696
0.8696
0.8698
0.8695
0.8699
0.87
0.8701
0.87
0.8702
0.87
0.8704
0.8703
0.8706
0.8707
0.871
0.8709
0.8709
0.8711
0.8713
0.8714
0.8713
0.8712
0.8713
0.871
0.8712
0.8712
0.8711
0.8712
0.8712
0.8714
0.8715
0.8715
0.8713
0.8713
0.8714
0.8713
0.8712
0.8712
0.8712
0.8711
0.8713
0.8713
0.8714
0.8714
0.8714
0.8715
0.8715
0.8714
0.8714
0.8714
0.8714
0.8715
0.8717
0.8716
0.8716
0.8716
0.8716
0.8718
0.8717
0.8718
0.8719
0.8719
0.8719
0.8718
0.8718
0.8717
0.8717
0.8716
0.8717
0.8716
0.8715
0.8714
0.8714
0.8714
0.8714
0.8715
0.8715
0.8715
0.8715
0.8714
0.8713
0.8713
0.8713
0.8713
0.8711
0.871
0.8709
0.8709
0.8708
0.8708
0.8706
0.8705
0.8705
0.8705
0.8703
0.8703
0.8703
0.8704
0.8705
0.

0.8666
0.8666
0.8665
0.8665
0.8665
0.8665
0.8664
0.8664
0.8664
0.8664
0.8664
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8665
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8667
0.8667
0.8667
0.8667
0.8668
0.8668
0.8667
0.8667
0.8667
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8666
0.8665
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8663
0.8663
0.8663
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8663
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8664
0.8663
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8662
0.8661
0.8661
0.8661

In [None]:
# 2 grain classification
#frequency bag of words
smoothing_values = np.arange(0,1,0.001)
bestValue =0
best = 0
for value in smoothing_values:
    print("smoothing : " + str(value))
    clf = GaussianNB(var_smoothing=value)
    clf.fit(X_train_fbow, y_train)
    y_pred = clf.predict(X_dev_fbow)
    score = metrics.accuracy_score(y_dev, y_pred)
    print("accuracy : " + str(score))
    if score > best:
        best = score
        bestValue = value

X_final = vectorizer_freq.transform(X_train + X_dev)
y_final = y_train + y_dev
clf = GaussianNB(var_smoothing=bestValue)
clf.fit(X_final, y_final)
y_pred = clf.predict(X_test_fbow)
score = metrics.accuracy_score(y_test, y_pred)
print("best test result for smoothing : " + str(bestValue))
print(score)

smoothing : 0.0
