In [1]:
from __future__ import division

import os
import pickle
import shutil

import numpy as np
import scipy.io as sio
import scipy.sparse as ss
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import binarize
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from tqdm import tqdm

# Seed NumPy's global random number generator with a fixed value so that code
# drawing from it produces the same sequence on every fresh run.
np.random.seed(12345)

In [2]:
def _first_row_as_list(mtx_path):
    """Read a Matrix Market file and return its first row as a plain list."""
    return np.array(sio.mmread(mtx_path).todense()).tolist()[0]


# ---- Training split ---------------------------------------------------------
#   train_feats        BOW linguistic features
#   train_image_feats  BOC image categorical features
#   train_target       labels: 1 if the caption is FOILED, otherwise 0
train_feats = sio.mmread('data/train_feats.mtx')
train_image_feats = sio.mmread('data/train_image_feats_yolococo.mtx')
train_target = _first_row_as_list('data/train_y.mtx')

# ---- Validation split -------------------------------------------------------
# Same three pieces as the training split, read from the test_* files.
# The testing data comes from the karpathy 5k validation set only.
val_feats = sio.mmread('data/test_feats.mtx')
val_image_feats = sio.mmread('data/test_image_feats_yolococo.mtx')
val_target = _first_row_as_list('data/test_y.mtx')

In [3]:
# Data processing: for each split, concatenate the binarized linguistic
# features with the image features column-wise. (To feed the raw linguistic
# values instead, drop the binarize() call.)
X_train, X_val = [
    ss.hstack([binarize(text_feats), image_feats])
    for text_feats, image_feats in ((train_feats, train_image_feats),
                                    (val_feats, val_image_feats))
]

# Label vectors as NumPy arrays.
Y_train, Y_test = np.array(train_target), np.array(val_target)

In [26]:
def save_binarized_rows(dense_feats, out_dir):
    """Write every row of ``dense_feats`` to ``out_dir`` as a binarized .npy file.

    ``out_dir`` is deleted first (if it exists) and recreated, so files left by
    an earlier run never linger. Row ``r`` (0-based) is stored as
    ``<out_dir>/<r + 1>.npy`` and holds the binarized row as a flat vector.

    Parameters
    ----------
    dense_feats : dense 2-D matrix or array
        One sample per row.
    out_dir : str
        Directory to (re)create and fill.
    """
    # Pure-Python replacement for `!rm -fr` / `%mkdir`: works outside IPython
    # and on any platform.
    shutil.rmtree(out_dir, ignore_errors=True)
    os.makedirs(out_dir)

    # np.asarray drops the np.matrix wrapper so indexing a row yields a 1-D
    # vector.
    dense = np.asarray(dense_feats)
    # Binarize the whole matrix once rather than once per row. Skip the call
    # when there are no rows: nothing to write, and binarize rejects empty input.
    binary_rows = binarize(dense) if dense.shape[0] else dense

    for r in tqdm(range(len(binary_rows))):
        np.save(os.path.join(out_dir, str(r + 1) + '.npy'),
                binary_rows[r].tolist())


train_img_feats = train_image_feats.tocsr().todense()
test_img_feats = val_image_feats.tocsr().todense()

for path, img_feats in (('train_img_foil_yolo_real_bin', train_img_feats),
                        ('test_img_foil_yolo_real_bin', test_img_feats)):
    save_binarized_rows(img_feats, path)
    

100%|██████████| 306458/306458 [01:33<00:00, 3274.87it/s]
100%|██████████| 150556/150556 [00:46<00:00, 3238.14it/s]


In [30]:
# Scratch arithmetic: the mean of two hard-coded values.
# NOTE(review): the notebook does not say where 86.62 and 88.18 come from --
# presumably two percentage scores being averaged; confirm and record the source.
(86.62 + 88.18) / 2 

87.4

In [4]:
# Logistic Regression with 'l1' penalty
logistic = LogisticRegression(penalty='l1')
logistic.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, logistic.predict(X_val))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, logistic.predict(X_val), 
                                    target_names=target_names)

Accuracy =  0.751912909482
             precision    recall  f1-score   support

       REAL       0.77      0.73      0.75     75278
       FAKE       0.74      0.78      0.76     75278

avg / total       0.75      0.75      0.75    150556



In [5]:
# Linear Support Vector Classifier with l2 regularizer and hinge loss
linearsvc = LinearSVC(penalty='l2', loss='hinge', verbose=1)
linearsvc.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, linearsvc.predict(X_val))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, linearsvc.predict(X_val), 
                                    target_names=target_names)

[LibLinear]Accuracy =  0.756655330907
             precision    recall  f1-score   support

       REAL       0.79      0.70      0.74     75278
       FAKE       0.73      0.81      0.77     75278

avg / total       0.76      0.76      0.76    150556





In [4]:
# Decision tree classifier
decisiontree = DecisionTreeClassifier(random_state=0)
decisiontree.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, decisiontree.predict(X_val))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, decisiontree.predict(X_val), 
                                    target_names=target_names)

Accuracy =  0.945076914902
             precision    recall  f1-score   support

       REAL       0.95      0.94      0.94     75278
       FAKE       0.94      0.95      0.95     75278

avg / total       0.95      0.95      0.95    150556



In [7]:
# standard Gradient Boosting Classifier 
gb = GradientBoostingClassifier()
gb.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, gb.predict(X_val.toarray()))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, gb.predict(X_val.toarray()), 
                                    target_names=target_names)

Accuracy =  0.826476527007
             precision    recall  f1-score   support

       REAL       0.76      0.96      0.85     75278
       FAKE       0.95      0.69      0.80     75278

avg / total       0.85      0.83      0.82    150556



In [8]:
# extremely randomized tree classifier.
ert = ExtraTreeClassifier(splitter='best')
ert.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, ert.predict(X_val.toarray()))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, ert.predict(X_val), 
                                    target_names=target_names)

Accuracy =  0.751275272988
             precision    recall  f1-score   support

       REAL       0.75      0.76      0.75     75278
       FAKE       0.76      0.74      0.75     75278

avg / total       0.75      0.75      0.75    150556



In [None]:
# Persist the fitted decision tree.
# Pickle data is binary, so the file is opened in 'wb' mode (text mode 'w' can
# corrupt it on Windows and fails outright on Python 3), and the `with` block
# guarantees the handle is flushed and closed.
with open('decisiontreeclassifiermodel.pkl', 'wb') as model_file:
    pickle.dump(decisiontree, model_file)

In [7]:
# MLP Classifier as in the FOIL paper
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
mlp.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, mlp.predict(X_val.toarray()))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, mlp.predict(X_val), 
                                    target_names=target_names)


Accuracy =  0.948444432636
             precision    recall  f1-score   support

       REAL       0.96      0.94      0.95     75278
       FAKE       0.94      0.96      0.95     75278

avg / total       0.95      0.95      0.95    150556



In [10]:
cmat = metrics.confusion_matrix(Y_test, mlp.predict(X_val.toarray()))
print zip(target_names, cmat.diagonal()/cmat.sum(axis=1))

[('REAL', 0.94007545365179734), ('FAKE', 0.95681341162092515)]


In [6]:
print metrics.classification_report(Y_test, mlp.predict(X_val), 
                                    target_names=target_names, digits=4)

             precision    recall  f1-score   support

       REAL     0.9568    0.9414    0.9490     75278
       FAKE     0.9423    0.9575    0.9499     75278

avg / total     0.9496    0.9495    0.9495    150556

