In [1]:
from __future__ import division
import numpy as np 
import pickle
import scipy.io as sio
import scipy.sparse as ss 

from sklearn.preprocessing import binarize
from sklearn.svm import LinearSVC, NuSVC, SVC 
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

# Seed NumPy's global random number generator with a fixed value so that
# repeated runs of the notebook start from the same random state.
np.random.seed(12345)

In [2]:
# Training split, read from Matrix Market files under data/:
#   train_feats        -> BOW linguistic features
#   train_image_feats  -> BOC image categorical features
#   train_target       -> labels, 1 if FOILED else 0 (first row of the
#                         stored label matrix, as a plain Python list)
train_feats = sio.mmread('data/train_feats.mtx')
train_image_feats = sio.mmread('data/train_image_feats.mtx')
train_target = sio.mmread('data/train_y.mtx').toarray()[0].tolist()
#############
# Validation split: same layout as the training split.
# The testing data comes from the karpathy 5k validation set only, which is
# why the test_* files are loaded into val_* variables.
val_feats = sio.mmread('data/test_feats.mtx')
val_image_feats = sio.mmread('data/test_image_feats.mtx')
val_target = sio.mmread('data/test_y.mtx').toarray()[0].tolist()

In [31]:
# Data processing: choose which feature views the classifiers see.
# Change FEATURE_SET (instead of commenting lines in and out) to switch
# between the supported configurations:
#   'image'             -> BOC image categorical features only
#   'text'              -> BOW linguistic features only
#   'text+image'        -> linguistic and image features concatenated
#   'binary_text+image' -> binarized linguistic features + image features
FEATURE_SET = 'image'


def _build_features(text_feats, image_feats, feature_set):
    """Return the design matrix for the requested feature configuration.

    text_feats  -- sparse matrix of BOW linguistic features
    image_feats -- sparse matrix of BOC image categorical features
    feature_set -- one of the configuration names listed above

    Raises ValueError for an unknown configuration name so that a typo
    cannot silently fall back to another feature set.
    """
    if feature_set == 'image':
        return image_feats
    if feature_set == 'text':
        return text_feats
    if feature_set == 'text+image':
        return ss.hstack([text_feats, image_feats])
    if feature_set == 'binary_text+image':
        return ss.hstack([binarize(text_feats), image_feats])
    raise ValueError('unknown FEATURE_SET: %r' % (feature_set,))


# The same configuration must be applied to both splits so that the
# training and validation matrices have matching columns.
X_train = _build_features(train_feats, train_image_feats, FEATURE_SET)
X_val = _build_features(val_feats, val_image_feats, FEATURE_SET)

# Labels as NumPy arrays (1 = FOILED, 0 = not FOILED).
Y_train = np.array(train_target)
Y_test = np.array(val_target)

In [18]:
# Logistic Regression with 'l1' penalty
logistic = LogisticRegression(penalty='l2')
logistic.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, logistic.predict(X_val))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, logistic.predict(X_val), 
                                    target_names=target_names)

Accuracy =  0.751394829831
             precision    recall  f1-score   support

       REAL       0.76      0.73      0.75     75278
       FAKE       0.74      0.78      0.76     75278

avg / total       0.75      0.75      0.75    150556



In [7]:
# Linear Support Vector Classifier with l2 regularizer and hinge loss
linearsvc = LinearSVC(penalty='l2', loss='hinge', verbose=1)
linearsvc.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, linearsvc.predict(X_val))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, linearsvc.predict(X_val), 
                                    target_names=target_names)

[LibLinear]Accuracy =  0.75741916629
             precision    recall  f1-score   support

       REAL       0.79      0.70      0.74     75278
       FAKE       0.73      0.82      0.77     75278

avg / total       0.76      0.76      0.76    150556





In [8]:
# Decision tree classifier
decisiontree = DecisionTreeClassifier(random_state=0)
decisiontree.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, decisiontree.predict(X_val))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, decisiontree.predict(X_val), 
                                    target_names=target_names)

Accuracy =  0.960865060177
             precision    recall  f1-score   support

       REAL       0.96      0.96      0.96     75278
       FAKE       0.96      0.96      0.96     75278

avg / total       0.96      0.96      0.96    150556



In [9]:
# standard Gradient Boosting Classifier 
gb = GradientBoostingClassifier()
gb.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, gb.predict(X_val.toarray()))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, gb.predict(X_val.toarray()), 
                                    target_names=target_names)

Accuracy =  0.826456600866
             precision    recall  f1-score   support

       REAL       0.76      0.96      0.85     75278
       FAKE       0.95      0.69      0.80     75278

avg / total       0.85      0.83      0.82    150556



In [10]:
# extremely randomized tree classifier.
ert = ExtraTreeClassifier(splitter='best')
ert.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, ert.predict(X_val.toarray()))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, ert.predict(X_val), 
                                    target_names=target_names)

Accuracy =  0.76075347379
             precision    recall  f1-score   support

       REAL       0.76      0.77      0.76     75278
       FAKE       0.76      0.75      0.76     75278

avg / total       0.76      0.76      0.76    150556



In [11]:
# Persist the fitted decision tree to disk. The file is opened in binary
# mode, as recommended for pickle data, and inside a `with` block so the
# handle is flushed and closed even if pickling raises.
with open('decisiontreeclassifiermodel.pkl', 'wb') as model_file:
    pickle.dump(decisiontree, model_file)

In [32]:
# MLP Classifier as in the FOIL paper
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
mlp.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, mlp.predict(X_val.toarray()))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, mlp.predict(X_val), 
                                    target_names=target_names, digits=4)

Accuracy =  0.50014612503
             precision    recall  f1-score   support

       REAL     0.5001    0.5106    0.5053     75278
       FAKE     0.5001    0.4897    0.4949     75278

avg / total     0.5001    0.5001    0.5001    150556



In [30]:
cmat = metrics.confusion_matrix(Y_test, mlp.predict(X_val.todense()))
print Y_test, mlp.predict(X_val.todense())
print cmat 
print cmat.diagonal()/cmat.sum(axis=1)



[0 0 1 ..., 1 1 1] [0 0 1 ..., 1 1 1]
[[66554  8724]
 [ 7317 67961]]
[ 0.88410957  0.90280029]
