### A3 - NLP for Yelp and IMDB

In [67]:
# Imports
import pandas as pd
import os.path
import re
import numpy as np
import string
from sklearn import metrics
from scipy.sparse import lil_matrix
from scipy.sparse import vstack

In [68]:
# Functions to open files
def open_with_panda(filename):
    return pd.read_csv(filename, sep='\t', names=["Comment", "Evaluation"]).as_matrix()


def open_vocab_file(filename):
    return pd.read_csv(yelp_vocab_file, sep='\t', header=None, skip_blank_lines=True).as_matrix()

In [69]:
# Function to generate encodings
def encode_reviews(file_to_create, X, Y, vocabFile, overwrite=False):
    if not (os.path.isfile(file_to_create)) or overwrite:
        # Grab vocab encodings
        vocab = open_vocab_file(vocabFile)
        vocab_dict = dict([(vocab[i][0], int(vocab[i][1])) for i in range(len(vocab))])
        list_to_write = []

        cleaned_reviews = [re.sub(r'[^\w\s\d]|_','',X[i].lower()) for i in range(len(X))]
        final_reviews = [re.sub(r'\s+',' ',cleaned_reviews[i]).strip() for i in range(len(cleaned_reviews))]

        for i in range(len(final_reviews)):
            build_string = ""
            for x in final_reviews[i].split(" "):
                if x in vocab_dict.keys():
                    build_string += str(vocab_dict[x]) + " "
            list_to_write.append(build_string.strip() + "\t" + str(Y[i]))

        with open(file_to_create,'w') as f:
            f.write("\n".join(list_to_write))

In [70]:
# Function to generate vocab
def vocabulary(file, reviews, overwrite=False):
    if not (os.path.isfile(file)) or overwrite:
        # Let's create the vocab.txt file
        cleaned_reviews = [re.sub(r'[^\w\s\d]|_','',reviews[i].lower()) for i in range(len(reviews))]
        final_reviews = [re.sub(r'\s+',' ',cleaned_reviews[i]).strip() for i in range(len(cleaned_reviews))]

        found_punct = False
        for c in string.punctuation:
            for rev in final_reviews:
                if c in rev:
                    # print(rev + "\n")
                    found_punct = True

        if found_punct:
            raise Exception('Found punctuation')

        words = " ".join(final_reviews).split(" ")
        unique_words, count = np.unique(words, return_counts=True)
        unique_words_array = np.asarray((unique_words,count)).T
        frequency_unique_words = np.flip(unique_words_array[unique_words_array[:,-1].astype(int).argsort()],0)

        with open(file,'w') as f:
            for i in range(10000):
                f.write(
                    frequency_unique_words[i][0] + "\t" +
                    str(i) + "\t" +
                    frequency_unique_words[i][-1] + "\n"
                )


In [71]:
# File paths

yelp_train_file = "hwk3_datasets/yelp-train.txt"
yelp_valid_file = "hwk3_datasets/yelp-valid.txt"
yelp_test_file = "hwk3_datasets/yelp-test.txt"

imdb_valid_file = "hwk3_datasets/IMDB-valid.txt"
imdb_test_file = "hwk3_datasets/IMDB-test.txt"
imdb_train_file = "hwk3_datasets/IMDB-train.txt"

yelp_vocab_file = "data/yelp-vocab.txt"
imdb_vocab_file = "data/IMDB-vocab.txt"

wr_yelp_train_file = "data/yelp-train.txt"
wr_yelp_valid_file = "data/yelp-valid.txt"
wr_yelp_test_file = "data/yelp-test.txt"

wr_imdb_valid_file = "data/IMDB-valid.txt"
wr_imdb_test_file = "data/IMDB-test.txt"
wr_imdb_train_file = "data/IMDB-train.txt"

In [72]:
# Training data

data_yelp_train = open_with_panda(yelp_train_file)
yelp_train_X = data_yelp_train[:,0]
yelp_train_Y = data_yelp_train[:,-1]

data_imdb_train = open_with_panda(imdb_train_file)
imdb_train_X = data_imdb_train[:,0]
imdb_train_Y = data_imdb_train[:,-1]

In [73]:
# Generate vocab
vocabulary(yelp_vocab_file, yelp_train_X.flatten())
vocabulary(imdb_vocab_file, imdb_train_X.flatten())

In [74]:
# Encode yelp reviews

encode_reviews(wr_yelp_train_file, yelp_train_X, yelp_train_Y, yelp_vocab_file)

data_yelp_valid = open_with_panda(yelp_valid_file)
data_yelp_test = open_with_panda(yelp_test_file)

encode_reviews(wr_yelp_valid_file, data_yelp_valid[:,0], data_yelp_valid[:,-1], yelp_vocab_file)
encode_reviews(wr_yelp_test_file, data_yelp_test[:,0], data_yelp_test[:,-1], yelp_vocab_file)

In [75]:
# Encode IMDB reviews
encode_reviews(wr_imdb_train_file, imdb_train_X, imdb_train_Y, imdb_vocab_file)
data_imdb_valid = open_with_panda(imdb_valid_file)
data_imdb_test = open_with_panda(imdb_test_file)

encode_reviews(wr_imdb_valid_file, data_imdb_valid[:,0], data_imdb_valid[:,-1], imdb_vocab_file)
encode_reviews(wr_imdb_test_file, data_imdb_test[:,0], data_imdb_test[:,-1], imdb_vocab_file)

In [76]:
# Function - unif classifier
def uniform_classifier(classes, y_test):
    from random import randint
    l = []
    for i in range(len(y_test)):
        rand = randint(0, len(classes)-1)
        l.append(classes[rand])
    y_pred = np.array(l, dtype=np.int16)
    y_test = np.array(y_test, dtype=np.int16)
    return metrics.f1_score(y_test, y_pred, average=None) # Get F1 score for each of the classes, thus Average=None

In [77]:
# Function - majority classifier
def majority_classifier(classes, y_train, y_test):
    counter = dict([(classes[i], 0) for i in range(len(classes))])

    for y in y_train:
        counter[y]+=1

    majority_class = max(counter, key=counter.get)
    y_pred = np.full((len(y_test),1), majority_class, dtype=np.int16)
    y_test = np.array(y_test, dtype=np.int16)
    
    return metrics.f1_score(y_test, y_pred, average=None)

In [78]:
# Binary bag of words: read encoding file and generate
def binary_bag_of_words(filename):
    features = 10000
    with open(filename, 'r') as f:
        lines = f.readlines()
        X = lil_matrix((len(lines), features), dtype=np.int32)
        for i in range(len(lines)):
            encodings = lines[i].split(" ")
            encodings[-1] = encodings[-1].split("\t")[0]

            if len(encodings) == 1 and encodings[0] == '':
                continue  # We have only 1 encoding, which means the review has no vocab words

            for j in range(len(encodings)):
                e = int(encodings[j])
                X[i,e] = 1
        return X

In [79]:
# Frequency bag of words - with division
def frequency_bag_of_words(filename):
    features = 10000
    with open(filename, 'r') as f:
        lines = f.readlines()
        X = lil_matrix((len(lines), features), dtype=np.int32)
        for i in range(len(lines)):
            encodings = lines[i].split(" ")
            encodings[-1] = encodings[-1].split("\t")[0]
            
            if len(encodings) == 1 and encodings[0] == '':
                continue  # We have only 1 encoding, which means the review has no vocab words
            
            for j in range(len(encodings)):
                e = int(encodings[j])
                X[i,e] += 1
        sum_vector = X.sum(axis=1)
        for i in range(len(sum_vector)):
            if sum_vector[i] == 0:
                sum_vector[i] = 1  #just so we don't divide by 0

        return lil_matrix(X / sum_vector, dtype=np.float64)

In [80]:
yelp_classes = [i for i in range(1,6)]
imdb_classes = [0,1]

In [81]:
print("\nTest\n")
unif = uniform_classifier(yelp_classes, data_yelp_test[:,-1].flatten())
maj = majority_classifier(yelp_classes, yelp_train_Y, data_yelp_test[:,-1].flatten()) # Will get an F1 error, since we'll have 0/0 in some cases

print(unif)
print(np.mean(unif))

print(maj)
print(np.mean(maj))

print("\nTraining\n")
unif = uniform_classifier(yelp_classes, yelp_train_Y)
maj = majority_classifier(yelp_classes, yelp_train_Y, yelp_train_Y) # Will get an F1 error, since we'll have 0/0 in some cases

print(unif)
print(np.mean(unif))

print(maj)
print(np.mean(maj))

print("\nValidation\n")
unif = uniform_classifier(yelp_classes, data_yelp_valid[:,-1].flatten())
maj = majority_classifier(yelp_classes, yelp_train_Y, data_yelp_valid[:,-1].flatten()) # Will get an F1 error, since we'll have 0/0 in some cases

print(unif)
print(np.mean(unif))

print(maj)
print(np.mean(maj))


Test

[ 0.11347518  0.12828947  0.16239316  0.26102941  0.22736031]
0.178509506686
[ 0.         0.         0.         0.5196151  0.       ]
0.103923019985

Training

[ 0.12863706  0.12445309  0.16195698  0.26178279  0.2518199 ]
0.185729962271
[ 0.          0.          0.          0.52133502  0.        ]
0.104267004647

Validation

[ 0.13986014  0.11636364  0.22047244  0.27797834  0.24206349]
0.199347609716
[ 0.          0.          0.          0.52507375  0.        ]
0.105014749263


  'precision', 'predicted', average, warn_for)


In [33]:
# Yelp Binary Bag of Words

binary_bag_train = binary_bag_of_words(wr_yelp_train_file)
binary_bag_valid = binary_bag_of_words(wr_yelp_valid_file)
binary_bag_test = binary_bag_of_words(wr_yelp_test_file)

In [34]:
# Get data ready for cross gridsearchCV with validation data

yelp_train_valid_X = vstack([binary_bag_train, binary_bag_valid])
train_test_fold = np.full((binary_bag_train.shape[0]), 0)
valid_test_fold = np.full((binary_bag_valid.shape[0]), 1)
test_fold = np.append(train_test_fold, valid_test_fold, axis=0)

yelp_valid_Y = np.array(data_yelp_valid[:,-1], dtype=np.int32)

yelp_train_valid_Y = np.array(np.append(yelp_train_Y, yelp_valid_Y, axis=0), dtype=np.int32)

In [49]:
# Naive Bayes for Yelp Binary Bag of Words
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

clf = BernoulliNB()
tuned_parameters = [{'alpha': np.arange(0.01, 10.01, 0.01)}]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1_micro')
clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(binary_bag_test)
f1 = metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(binary_bag_train)
f1 = metrics.f1_score(np.array(yelp_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(binary_bag_valid)
f1 = metrics.f1_score(np.array(yelp_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))



BernoulliNB(alpha=0.029999999999999999, binarize=0.0, class_prior=None,
      fit_prior=True)

Test

[ 0.416       0.26380368  0.24045802  0.4469526   0.52705283]
0.378853424955

Training

[ 0.78423237  0.75395431  0.73790776  0.70096463  0.70933056]
0.737277924229

Validation

[ 0.73939394  0.70857143  0.70138889  0.6539075   0.61744966]
0.684142283459


In [60]:
# Linear SVC/SVM
from sklearn.svm import LinearSVC
clf = LinearSVC()

# We have 10000 features and 8000 in training + valid; so solve dual
tuned_parameters = [
    {'penalty': ['l2'], 
     'loss':['squared_hinge', 'hinge'], 
     'dual':[True], 
     'C': np.arange(0.1, 0.2, 0.1),
#     'C':np.arange(0.1, 4.1, 0.1)
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1_micro')
clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(binary_bag_test)
f1 = metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(binary_bag_train)
f1 = metrics.f1_score(np.array(yelp_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(binary_bag_valid)
f1 = metrics.f1_score(np.array(yelp_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

LinearSVC(C=0.10000000000000001, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Test

[ 0.51803279  0.31454006  0.31751825  0.49215407  0.58948864]
0.446346759278

Training

[ 0.94777563  0.93354684  0.89937107  0.8874308   0.89497161]
0.91261918937

Validation

[ 0.97590361  0.96808511  0.91909385  0.89615932  0.89274448]
0.930397273736


In [63]:
# Decision tree for yelp binary bag of words
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

tuned_parameters = [
    {'criterion': ['gini','entropy'],
     'splitter':['best','random'],
#      'max_depth':[None,10,20,30,40,50,60,70,80,90],
     'max_depth':[None],
#      'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,20,30,40,50],
     'min_samples_leaf':[9],
#      'max_features': [None,3,5,7,9,10,15,20],  
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1_micro')
clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(binary_bag_test)
f1 = metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(binary_bag_train)
f1 = metrics.f1_score(np.array(yelp_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(binary_bag_valid)
f1 = metrics.f1_score(np.array(yelp_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=9, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')

Test

[ 0.20080321  0.22155689  0.20332717  0.42153644  0.45232816]
0.299910374372

Training

[ 0.51836306  0.45200698  0.49058756  0.69106938  0.71034483]
0.572474362363

Validation

[ 0.55345912  0.40718563  0.52264808  0.69736842  0.69856459]
0.575845169243


In [82]:
# Yelp frequency bag of words
freq_bag_train = frequency_bag_of_words(wr_yelp_train_file)
freq_bag_valid = frequency_bag_of_words(wr_yelp_valid_file)
freq_bag_test = frequency_bag_of_words(wr_yelp_test_file)

In [83]:
# Get data ready for cross gridsearchCV with validation data

yelp_train_valid_X = vstack([freq_bag_train, freq_bag_valid])
train_test_fold = np.full((freq_bag_train.shape[0]), 0)
valid_test_fold = np.full((freq_bag_valid.shape[0]), 1)
test_fold = np.append(train_test_fold, valid_test_fold, axis=0)

yelp_valid_Y = np.array(data_yelp_valid[:,-1], dtype=np.int32)

yelp_train_valid_Y = np.array(np.append(yelp_train_Y, yelp_valid_Y, axis=0), dtype=np.int32)

In [85]:
# Naive Bayes for Yelp Binary Bag of Words
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
tuned_parameters = [{}]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True,scoring='f1_micro')
clf.fit(yelp_train_valid_X.todense(), yelp_train_valid_Y)

# y_pred = clf.predict(freq_bag_test.todense())

# print(clf.best_estimator_)
# f1 = metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
# print(f1)
# print(np.mean(f1))

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(freq_bag_test.todense())
f1 = metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(freq_bag_train.todense())
f1 = metrics.f1_score(np.array(yelp_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(freq_bag_valid.todense())
f1 = metrics.f1_score(np.array(yelp_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

GaussianNB(priors=None)

Test

[ 0.17940199  0.140625    0.21021021  0.33888487  0.37697161]
0.249218735688

Training

[ 0.74095103  0.72258838  0.73378265  0.7920077   0.83705052]
0.765276055511

Validation

[ 0.75675676  0.734375    0.67980296  0.72759227  0.79341865]
0.738389125345


In [86]:
# Linear SVC/SVM
from sklearn.svm import LinearSVC
clf = LinearSVC()

# We have 10000 features and 8000 in training + valid; so solve dual
tuned_parameters = [
    {'penalty': ['l2'], 
     'loss':['squared_hinge', 'hinge'], 
     'dual':[True], 
#      'C': np.arange(0.1, 0.2, 0.1),
    'C':np.arange(0.1, 4.1, 0.1)
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1_micro')
clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(freq_bag_test)
f1 = metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(freq_bag_train)
f1 = metrics.f1_score(np.array(yelp_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(freq_bag_valid)
f1 = metrics.f1_score(np.array(yelp_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

LinearSVC(C=0.10000000000000001, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Test

[ 0.43137255  0.25095057  0.14795918  0.428125    0.62099448]
0.375880355635

Training

[ 0.63716814  0.52057613  0.40836941  0.50374404  0.65550466]
0.545072476966

Validation

[ 0.62318841  0.60130719  0.43049327  0.50386399  0.62693683]
0.557157937215


In [99]:
# Decision tree for yelp binary bag of words
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

tuned_parameters = [
    {
#   'criterion': ['gini','entropy'],
    'criterion': ['entropy'],
#      'splitter':['best','random'],
        'splitter':['best'],
     'max_depth':[50],
#      'max_depth':[None,10,20,30,40,50,60,70,80,90],
     'min_samples_leaf':[40],
#      'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,20,30,40,50],
     'max_features': [None],  
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1_micro')
clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(freq_bag_test)
f1 = metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(freq_bag_train)
f1 = metrics.f1_score(np.array(yelp_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(freq_bag_valid)
f1 = metrics.f1_score(np.array(yelp_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=40, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Test

[ 0.192       0.15974441  0.18947368  0.43612903  0.45750708]
0.286970841513

Training

[ 0.36926361  0.25855513  0.27848101  0.53763441  0.58487195]
0.405761221837

Validation

[ 0.44444444  0.18055556  0.3         0.53506494  0.60843373]
0.413699734001


## IMDB

In [101]:
print("\nTest\n")
unif_imdb = uniform_classifier(imdb_classes, data_imdb_test[:,-1].flatten())

print(unif_imdb)
print(np.mean(unif_imdb))

print("\nTrain\n")
unif_imdb = uniform_classifier(imdb_classes, imdb_train_Y)

print(unif_imdb)
print(np.mean(unif_imdb))

print("\nValidation\n")
unif_imdb = uniform_classifier(imdb_classes, data_imdb_valid[:,-1].flatten())

print(unif_imdb)
print(np.mean(unif_imdb))


Test

[ 0.500618    0.49737148]
0.498994740647

Train

[ 0.50490716  0.49959786]
0.502252508516

Validation

[ 0.50019873  0.49376006]
0.496979396276


In [102]:
# IMDB binary bag of words

binary_bag_train = binary_bag_of_words(wr_imdb_train_file)
binary_bag_valid = binary_bag_of_words(wr_imdb_valid_file)
binary_bag_test = binary_bag_of_words(wr_imdb_test_file)

In [103]:
# Get data ready for cross gridsearchCV with validation data

imdb_train_valid_X = vstack([binary_bag_train, binary_bag_valid])
train_test_fold = np.full((binary_bag_train.shape[0]), 0)
valid_test_fold = np.full((binary_bag_valid.shape[0]), 1)
test_fold = np.append(train_test_fold, valid_test_fold, axis=0)

imdb_valid_Y = np.array(data_imdb_valid[:,-1], dtype=np.int32)

imdb_train_valid_Y = np.array(np.append(imdb_train_Y, imdb_valid_Y, axis=0), dtype=np.int32)

In [104]:
# Naive Bayes for IMDB Binary Bag of Words
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

clf = BernoulliNB()
tuned_parameters = [{'alpha': np.arange(0.01, 10.01, 0.01)}]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1')
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(binary_bag_test)
f1 = metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(binary_bag_train)
f1 = metrics.f1_score(np.array(imdb_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(binary_bag_valid)
f1 = metrics.f1_score(np.array(imdb_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

BernoulliNB(alpha=0.71000000000000008, binarize=0.0, class_prior=None,
      fit_prior=True)

Test

[ 0.83681572  0.83766208]
0.837238899735

Training

[ 0.85830014  0.86178969]
0.860044915158

Validation

[ 0.85910792  0.8622565 ]
0.860682210511


In [108]:
# Linear SVC/SVM
from sklearn.svm import LinearSVC
clf = LinearSVC()

# We have 10000 features and 25000 in training + valid; so solve primal (which can't be used with hinge loss)
tuned_parameters = [
    {'penalty': ['l2'], 
     'loss':['squared_hinge'], 
     'dual':[False], 
#     'C':np.arange(0.01, 4.01, 0.01),
     'C':np.arange(0.01, 0.21, 0.01),
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1')
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(binary_bag_test)
f1 = metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(binary_bag_train)
f1 = metrics.f1_score(np.array(imdb_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(binary_bag_valid)
f1 = metrics.f1_score(np.array(imdb_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

LinearSVC(C=0.01, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Test

[ 0.87187953  0.87354952]
0.872714522448

Training

[ 0.93123825  0.93222134]
0.931729794206

Validation

[ 0.93252891  0.93326703]
0.932897970164


In [119]:
# Decision tree for imdb binary bag of words
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

tuned_parameters = [
    {
  'criterion': ['gini','entropy'],
     'splitter':['best','random'],
     'max_depth':[None,10,20,30,40,50,60,70,80,90],
     'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,20,30,40,50],
     'max_features': [None],  
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1')
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(binary_bag_test)
f1 = metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(binary_bag_train)
f1 = metrics.f1_score(np.array(imdb_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(binary_bag_valid)
f1 = metrics.f1_score(np.array(imdb_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=20,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=40, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')

Test

[ 0.72195083  0.71794045]
0.719945642845

Training

[ 0.73056485  0.7216914 ]
0.726128125065

Validation

[ 0.72851081  0.72119185]
0.724851328952


In [112]:
# Yelp frequency bag of words
freq_bag_train = frequency_bag_of_words(wr_imdb_train_file)
freq_bag_valid = frequency_bag_of_words(wr_imdb_valid_file)
freq_bag_test = frequency_bag_of_words(wr_imdb_test_file)

In [113]:
# Get data ready for cross gridsearchCV with validation data

imdb_train_valid_X = vstack([freq_bag_train, freq_bag_valid])
train_test_fold = np.full((freq_bag_train.shape[0]), 0)
valid_test_fold = np.full((freq_bag_valid.shape[0]), 1)
test_fold = np.append(train_test_fold, valid_test_fold, axis=0)

imdb_valid_Y = np.array(data_imdb_valid[:,-1], dtype=np.int32)

imdb_train_valid_Y = np.array(np.append(imdb_train_Y, imdb_valid_Y, axis=0), dtype=np.int32)

In [114]:
# Naive Bayes for IMDB Binary Bag of Words
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
tuned_parameters = [{}]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1')
clf.fit(imdb_train_valid_X.todense(), imdb_train_valid_Y)

# y_pred = clf.predict(freq_bag_test.todense())

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(freq_bag_test.todense())
f1 = metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(freq_bag_train.todense())
f1 = metrics.f1_score(np.array(imdb_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(freq_bag_valid.todense())
f1 = metrics.f1_score(np.array(imdb_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

GaussianNB(priors=None)

Test

[ 0.65187433  0.66867047]
0.660272399802

Training

[ 0.7930265   0.81047254]
0.801749520405

Validation

[ 0.79233361  0.812262  ]
0.802297804792


In [118]:
# Linear SVC/SVM
from sklearn.svm import LinearSVC
clf = LinearSVC()

# We have 10000 features and 25000 in training + valid; so solve primal (can't use hinge loss with primal problem)
tuned_parameters = [
    {'penalty': ['l2'], 
     'loss':['squared_hinge'], 
     'dual':[False], 
    'C':np.arange(0.5, 10.5, 0.5),
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1')
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(freq_bag_test)
f1 = metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(freq_bag_train)
f1 = metrics.f1_score(np.array(imdb_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(freq_bag_valid)
f1 = metrics.f1_score(np.array(imdb_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

LinearSVC(C=10.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Test

[ 0.86714636  0.86844724]
0.867796799836

Training

[ 0.88369908  0.88522419]
0.884461633815

Validation

[ 0.88048411  0.88249876]
0.881491437756


In [120]:
# Decision tree for yelp binary bag of words
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

tuned_parameters = [
    {
  'criterion': ['gini','entropy'],
     'splitter':['best','random'],
     'max_depth':[None,10,20,30,40,50,60,70,80,90],
     'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,20,30,40,50],
     'max_features': [None],  
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True, scoring='f1')
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

print(clf.best_estimator_)

print("\nTest\n")

y_pred = clf.predict(freq_bag_test)
f1 = metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nTraining\n")

y_pred = clf.predict(freq_bag_train)
f1 = metrics.f1_score(np.array(imdb_train_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

print("\nValidation\n")

y_pred = clf.predict(freq_bag_valid)
f1 = metrics.f1_score(np.array(imdb_valid_Y, dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')

Test

[ 0.6904073   0.74741238]
0.718909842242

Training

[ 0.78822469  0.83097579]
0.809600238642

Validation

[ 0.78618679  0.82790362]
0.807045205524
