In [1]:
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter
from sklearn import linear_model

In [2]:
# training data: ./train.tsv
# test data:     ./test.tsv

# Load training and test data

In [3]:
dataframe = pd.read_csv('./train.tsv', sep = '\t')
print(dataframe)

       label                                             review
0          0  Leaks: Liss seems to be totally incompetent: m...
1          1  Replacement Peeler: Loved my old one. Loaned i...
2          0  Not what I was expecting: I chose to rate this...
3          1  Watch face is hard to read: Although I don't o...
4          0  Disappointing: I was eager to read this book s...
...      ...                                                ...
29991      1  Love EW: I must admit that I am a total TV afi...
29992      1  Easy to follow and delicious recipes!: I compl...
29993      1  The Beauty and Mystery of Veronique: Perhaps t...
29994      1  I love it.: Brilliant, hilarious, quick and ea...
29995      0  broken...: bad choice...2d film would not play...

[29996 rows x 2 columns]


In [4]:
train_ratio = 0.8 # 80% for training, 20% for validation
random_seed = 100

train_dataframe = dataframe.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = dataframe.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))

training set size: 23997
validation set size: 5999


In [5]:
test_dataframe = pd.read_csv('./test.tsv', sep = '\t')
print (test_dataframe)

        id                                             review
0        1  Human Hurricane!: Would you like to sleep in t...
1        2  A Mom: I bought this with all kinds of expecta...
2        3  Good Read: I judge all books that I read by a ...
3        4  It's awesome: DVD set is exactly what you'd bu...
4        5  Great Movie!!!: This definatly the best Godzil...
...    ...                                                ...
5995  5996  Beautiful and Spiritual: This is a very beauti...
5996  5997  Another Cash In: This cd is pure dreck and it'...
5997  5998  Concept drawings-very good: The concept drawin...
5998  5999  I hear i all the time is awsome: this is great...
5999  6000  Not so great Performance: This mouse is very s...

[6000 rows x 2 columns]


# Try the trivial baseline: predict the majority label of the training set

In [6]:
Counter(train_dataframe['label'])

Counter({0: 11965, 1: 12032})

In [7]:
# Looks like label 1 has slightly more counts than label 0 in training data
# So the 'majority guess' prediction is an array filled with 1s
majority_guess_pred = [1 for i in range(len(valid_dataframe))]
accuracy = accuracy_score(valid_dataframe['label'], majority_guess_pred)
print ('Majority guess accuracy:', accuracy)

Majority guess accuracy: 0.5099183197199533


In [8]:
# helper function: write out prediction values into a csv format file
# params:
#     df: dataframe, where each row is a test example, with column 'id' as data id
#     pred: a list or 1-d array of prediction values
#     filepath: the output file path
# return:
#     None

def write_test_prediction(df, pred, filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('id', 'label'))
        for index, row in df.iterrows():
            outfile.write('{},{}\n'.format(row['id'], pred[index]))
    print (len(df), 'predictions are written to', filepath)

In [9]:
majority_guess_pred_test = [1 for i in range(len(test_dataframe))]
write_test_prediction(test_dataframe, majority_guess_pred_test, './majority_guess.csv')

6000 predictions are written to ./majority_guess.csv


# Build feature extractor

## use all unigrams from training data as features

In [158]:
vectorizer = CountVectorizer(ngram_range=(1, 3))
vectorizer.fit(train_dataframe['review'])

CountVectorizer(ngram_range=(1, 3))

# Extract feature vectors for training, validation, and test data 

In [159]:
from sklearn import preprocessing
from sklearn.preprocessing import MaxAbsScaler
train_X = vectorizer.transform(train_dataframe['review'])
valid_X = vectorizer.transform(valid_dataframe['review'])
test_X = vectorizer.transform(test_dataframe['review'])

train_X_standard=MaxAbsScaler().fit(train_X).transform(train_X)
valid_X_standard=MaxAbsScaler().fit(valid_X).transform(valid_X)
test_X_standard=MaxAbsScaler().fit(test_X).transform(test_X)
#print (train_X.shape)
#print (valid_X.shape)
#print (test_X.shape)

print (train_X_standard.shape)
print (valid_X_standard.shape)
print (test_X_standard.shape)

(23997, 1949941)
(5999, 1949941)
(6000, 1949941)


# Regularized Classification

In [162]:
model_regularized = LogisticRegression(penalty='l2',C = 1.7, solver='liblinear')
model_regularized.fit(train_X_standard, train_Y)

LogisticRegression(C=1.7, solver='liblinear')

In [163]:
valid_Y_hat_regularized = model_regularized.predict(valid_X_standard)
valid_Y = valid_dataframe['label'].to_numpy()
accuracy = accuracy_score(valid_Y, valid_Y_hat_regularized)
print ('Regularized logistic regression, accuracy on validation set:', accuracy)

Regularized logistic regression, accuracy on validation set: 0.9051508584764127


In [164]:
for i in np.arange(0.1, 2, 0.1):
    model_reg = LogisticRegression(penalty='l2',C = i, solver='liblinear')
    model_reg.fit(train_X_standard, train_Y)
    valid_Y_hat_reg = model_reg.predict(valid_X_standard)
    accuracy = accuracy_score(valid_Y, valid_Y_hat_reg)
    print ('Regularized logistic regression, accuracy on validation set:', "i=",i,",", accuracy)

Regularized logistic regression, accuracy on validation set: i= 0.1 , 0.9001500250041674
Regularized logistic regression, accuracy on validation set: i= 0.2 , 0.9034839139856643
Regularized logistic regression, accuracy on validation set: i= 0.30000000000000004 , 0.903817302883814
Regularized logistic regression, accuracy on validation set: i= 0.4 , 0.9034839139856643
Regularized logistic regression, accuracy on validation set: i= 0.5 , 0.9039839973328888
Regularized logistic regression, accuracy on validation set: i= 0.6 , 0.9039839973328888
Regularized logistic regression, accuracy on validation set: i= 0.7000000000000001 , 0.9043173862310385
Regularized logistic regression, accuracy on validation set: i= 0.8 , 0.9043173862310385
Regularized logistic regression, accuracy on validation set: i= 0.9 , 0.9041506917819636
Regularized logistic regression, accuracy on validation set: i= 1.0 , 0.9043173862310385
Regularized logistic regression, accuracy on validation set: i= 1.1 , 0.90415069

In [165]:
all_train_Y = dataframe['label']
#all_train_X = vectorizer.transform(dataframe['review'])
all_train_X_standard = MaxAbsScaler().fit(valid_X).transform((vectorizer.transform(dataframe['review'])))
model_regularized.fit(all_train_X_standard, all_train_Y)
test_Y_hat_regularized = model_regularized.predict(test_X_standard)
write_test_prediction(test_dataframe, test_Y_hat_regularized, './logistic_reg_regression.csv')

6000 predictions are written to ./logistic_reg_regression.csv


# Naive Bayes

In [28]:
from sklearn.naive_bayes import BernoulliNB

In [166]:
for i in np.arange(1, 10, 0.5):
    bnb = BernoulliNB(alpha=i)
    bnb.fit(train_X_standard, train_Y)
    valid_Y_hat_nb = bnb.predict(valid_X_standard)
    accuracy = accuracy_score(valid_Y, valid_Y_hat_nb)
    print ('Naive Bayes, accuracy on validation set:', accuracy)

Naive Bayes, accuracy on validation set: 0.868144690781797
Naive Bayes, accuracy on validation set: 0.8584764127354559
Naive Bayes, accuracy on validation set: 0.8514752458743123
Naive Bayes, accuracy on validation set: 0.8428071345224204
Naive Bayes, accuracy on validation set: 0.8359726621103517
Naive Bayes, accuracy on validation set: 0.8271378563093849
Naive Bayes, accuracy on validation set: 0.8174695782630439
Naive Bayes, accuracy on validation set: 0.8104684114019003
Naive Bayes, accuracy on validation set: 0.8003000500083347
Naive Bayes, accuracy on validation set: 0.7931321886981163
Naive Bayes, accuracy on validation set: 0.786964494082347
Naive Bayes, accuracy on validation set: 0.7799633272212035
Naive Bayes, accuracy on validation set: 0.7711285214202367
Naive Bayes, accuracy on validation set: 0.7612935489248208
Naive Bayes, accuracy on validation set: 0.7526254375729288
Naive Bayes, accuracy on validation set: 0.7442907151191865
Naive Bayes, accuracy on validation set: 0

# KNN

In [30]:
from sklearn.neighbors import KNeighborsClassifier

In [167]:
for i in np.arange(1,9,1):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(train_X_standard, train_Y)
    valid_Y_hat_knn = knn.predict(valid_X_standard)
    accuracy = accuracy_score(valid_Y, valid_Y_hat_knn)
    print ('K-Nearest Neighbor, accuracy on validation set:',"n=",i,",", accuracy)

K-Nearest Neighbor, accuracy on validation set: n= 1 , 0.5117519586597766
K-Nearest Neighbor, accuracy on validation set: n= 2 , 0.49141523587264546
K-Nearest Neighbor, accuracy on validation set: n= 3 , 0.518919819969995
K-Nearest Neighbor, accuracy on validation set: n= 4 , 0.49874979163193867
K-Nearest Neighbor, accuracy on validation set: n= 5 , 0.5242540423403901
K-Nearest Neighbor, accuracy on validation set: n= 6 , 0.5522587097849642
K-Nearest Neighbor, accuracy on validation set: n= 7 , 0.5175862643773962
K-Nearest Neighbor, accuracy on validation set: n= 8 , 0.5327554592432072


# Support Vector Machine

In [32]:
from sklearn.svm import LinearSVC

In [172]:
for i in np.arange(0.01,0.1,0.002):
    SVC = LinearSVC(max_iter=6000, C=i)
    SVC.fit(train_X_standard, train_Y)
    valid_Y_hat_SVC = SVC.predict(valid_X_standard)
    accuracy = accuracy_score(valid_Y, valid_Y_hat_SVC)
    print ('SVC, accuracy on validation set:',"i=",i,",", accuracy)

SVC, accuracy on validation set: i= 0.01 , 0.9046507751291882
SVC, accuracy on validation set: i= 0.012 , 0.9051508584764127
SVC, accuracy on validation set: i= 0.014 , 0.9051508584764127
SVC, accuracy on validation set: i= 0.016 , 0.9053175529254875
SVC, accuracy on validation set: i= 0.018000000000000002 , 0.9051508584764127
SVC, accuracy on validation set: i= 0.02 , 0.9053175529254875
SVC, accuracy on validation set: i= 0.022 , 0.9053175529254875
SVC, accuracy on validation set: i= 0.024 , 0.9054842473745625
SVC, accuracy on validation set: i= 0.026000000000000002 , 0.9054842473745625
SVC, accuracy on validation set: i= 0.028000000000000004 , 0.9058176362727122
SVC, accuracy on validation set: i= 0.03 , 0.9061510251708618
SVC, accuracy on validation set: i= 0.032 , 0.9066511085180864
SVC, accuracy on validation set: i= 0.034 , 0.906984497416236
SVC, accuracy on validation set: i= 0.036000000000000004 , 0.9073178863143857
SVC, accuracy on validation set: i= 0.038 , 0.9073178863143857

In [169]:
for i in np.arange(0.05,0.2,0.01):
    SVC = LinearSVC(penalty='l1',loss='squared_hinge',max_iter=6000, C=i,dual=False)
    SVC.fit(train_X_standard, train_Y)
    valid_Y_hat_SVC = SVC.predict(valid_X_standard)
    accuracy = accuracy_score(valid_Y, valid_Y_hat_SVC)
    print ('SVC, accuracy on validation set:',"i=",i,",", accuracy)

SVC, accuracy on validation set: i= 0.05 , 0.8809801633605601
SVC, accuracy on validation set: i= 0.060000000000000005 , 0.8856476079346558
SVC, accuracy on validation set: i= 0.07 , 0.8879813302217037
SVC, accuracy on validation set: i= 0.08000000000000002 , 0.8909818303050508
SVC, accuracy on validation set: i= 0.09000000000000001 , 0.892982163693949
SVC, accuracy on validation set: i= 0.1 , 0.895149191531922
SVC, accuracy on validation set: i= 0.11000000000000001 , 0.8968161360226704
SVC, accuracy on validation set: i= 0.12000000000000001 , 0.8971495249208201
SVC, accuracy on validation set: i= 0.13 , 0.8971495249208201
SVC, accuracy on validation set: i= 0.14 , 0.8961493582263711
SVC, accuracy on validation set: i= 0.15000000000000002 , 0.8969828304717453
SVC, accuracy on validation set: i= 0.16000000000000003 , 0.8963160526754459
SVC, accuracy on validation set: i= 0.17000000000000004 , 0.8959826637772962
SVC, accuracy on validation set: i= 0.18000000000000005 , 0.8954825804300717

In [173]:
SVC = LinearSVC(max_iter=6000, C=0.042)
SVC.fit(train_X_standard, train_Y)
valid_Y_hat_SVC = SVC.predict(valid_X_standard)
accuracy = accuracy_score(valid_Y, valid_Y_hat_SVC)
print ('SVC, accuracy on validation set:', accuracy)

SVC, accuracy on validation set: 0.9074845807634606


In [174]:
SVC.fit(all_train_X_standard, all_train_Y)
test_Y_hat_svc = SVC.predict(test_X_standard)
write_test_prediction(test_dataframe, test_Y_hat_svc, './svc_regression.csv')

6000 predictions are written to ./svc_regression.csv
