In [1]:
from __future__ import division
import numpy as np 
import pickle
import scipy.io as sio
import scipy.sparse as ss 

from sklearn.preprocessing import binarize
from sklearn.svm import LinearSVC, NuSVC, SVC 
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

# Seed NumPy's global random number generator with a fixed value so that
# repeated runs of the notebook draw the same pseudo-random numbers.
np.random.seed(12345)

In [2]:
# ---- Training split -------------------------------------------------------
#   train_feats       : BOW linguistic features
#   train_image_feats : BOC image categorical features
#   train_target      : labels as a plain list (1 = FOILED, 0 = NOT FOILED)
train_feats = sio.mmread('data_new/train_feats.mtx')
train_image_feats = sio.mmread('data_new/train_image_feats.mtx')
# The label file holds a single row; take that row and turn it into a list.
train_target = sio.mmread('data_new/train_y.mtx').toarray()[0].tolist()

# ---- Validation split (stored on disk under the test_* file names) ---------
# Same layout as the training split. The testing data comes from the
# karpathy 5k validation set only.
val_feats = sio.mmread('data_new/test_feats.mtx')
val_image_feats = sio.mmread('data_new/test_image_feats.mtx')
val_target = sio.mmread('data_new/test_y.mtx').toarray()[0].tolist()

In [3]:
# Build the design matrices by concatenating, column-wise, the linguistic
# features with the image features of the same split.
# Alternatives that were tried and are currently disabled: binarizing the
# linguistic features before stacking, and using the linguistic features alone.
train_blocks = [train_feats, train_image_feats]
val_blocks = [val_feats, val_image_feats]

X_train = ss.hstack(train_blocks)
X_val = ss.hstack(val_blocks)

# Label vectors. Note the naming: Y_test holds the labels that go with X_val.
Y_train = np.array(train_target)
Y_test = np.array(val_target)

In [26]:
# Logistic regression with an L1 penalty.
# The solver is named explicitly: 'liblinear' supports the L1 penalty, whereas
# newer scikit-learn releases default to 'lbfgs', which rejects penalty='l1'.
logistic = LogisticRegression(penalty='l1', solver='liblinear')
logistic.fit(X_train, Y_train)

# Predict once and reuse the result for every metric below.
logistic_pred = logistic.predict(X_val)

# A single parenthesised argument prints identically under the Python 2 print
# statement and the Python 3 print function.
print('Accuracy =  %s' % metrics.accuracy_score(Y_test, logistic_pred))
target_names = ['REAL', 'FAKE']  # display names for labels 0 and 1, in that order
print(metrics.classification_report(Y_test, logistic_pred,
                                    target_names=target_names))

Accuracy =  0.741440741101
             precision    recall  f1-score   support

       REAL       0.68      0.88      0.77     30263
       FAKE       0.84      0.60      0.70     31483

avg / total       0.76      0.74      0.74     61746



In [27]:
# Confusion matrix of the logistic-regression predictions
# (rows = true label, columns = predicted label).
cmat = metrics.confusion_matrix(Y_test, logistic.predict(X_val))
print(cmat)

# Correctly classified count per true class divided by that class's row total.
# True division is guaranteed by `from __future__ import division` in the
# imports cell.
per_class_rate = cmat.diagonal() / cmat.sum(axis=1)
print(per_class_rate)

[[26750  3513]
 [12452 19031]]
[ 0.88391766  0.60448496]


In [7]:
# Linear support vector classifier with an L2 regularizer and hinge loss.
# verbose=1 makes the underlying solver emit progress output while fitting.
linearsvc = LinearSVC(penalty='l2', loss='hinge', verbose=1)
linearsvc.fit(X_train, Y_train)

# Predict once and reuse the result for every metric below.
linearsvc_pred = linearsvc.predict(X_val)

# A single parenthesised argument prints identically under the Python 2 print
# statement and the Python 3 print function.
print('Accuracy =  %s' % metrics.accuracy_score(Y_test, linearsvc_pred))
target_names = ['REAL', 'FAKE']  # display names for labels 0 and 1, in that order
print(metrics.classification_report(Y_test, linearsvc_pred,
                                    target_names=target_names))

[LibLinear]Accuracy =  0.75741916629
             precision    recall  f1-score   support

       REAL       0.79      0.70      0.74     75278
       FAKE       0.73      0.82      0.77     75278

avg / total       0.76      0.76      0.76    150556





In [9]:
# Decision tree classifier.
# random_state=0 fixes the tree's own randomness; class_weight='balanced'
# asks scikit-learn to reweight the classes instead of treating them uniformly.
decisiontree = DecisionTreeClassifier(random_state=0, class_weight='balanced')
decisiontree.fit(X_train, Y_train)

# Predict once and reuse the result for every metric below.
decisiontree_pred = decisiontree.predict(X_val)

# A single parenthesised argument prints identically under the Python 2 print
# statement and the Python 3 print function.
print('Accuracy =  %s' % metrics.accuracy_score(Y_test, decisiontree_pred))
target_names = ['REAL', 'FAKE']  # display names for labels 0 and 1, in that order
print(metrics.classification_report(Y_test, decisiontree_pred,
                                    target_names=target_names))


Accuracy =  0.738476986363
             precision    recall  f1-score   support

       REAL       0.68      0.87      0.77     30263
       FAKE       0.83      0.61      0.70     31483

avg / total       0.76      0.74      0.73     61746



In [8]:
# Exploration cell: prints the DecisionTreeClassifier documentation so its
# constructor parameters can be inspected. It does not affect any model state.
# NOTE(review): leftover from development; consider removing from the final notebook.
#print metrics.confusion_matrix(Y_test, decisiontree.predict(X_val))
help(DecisionTreeClassifier)

Help on class DecisionTreeClassifier in module sklearn.tree.tree:

class DecisionTreeClassifier(BaseDecisionTree, sklearn.base.ClassifierMixin)
 |  A decision tree classifier.
 |  
 |  Read more in the :ref:`User Guide <tree>`.
 |  
 |  Parameters
 |  ----------
 |  criterion : string, optional (default="gini")
 |      The function to measure the quality of a split. Supported criteria are
 |      "gini" for the Gini impurity and "entropy" for the information gain.
 |  
 |  splitter : string, optional (default="best")
 |      The strategy used to choose the split at each node. Supported
 |      strategies are "best" to choose the best split and "random" to choose
 |      the best random split.
 |  
 |  max_features : int, float, string or None, optional (default=None)
 |      The number of features to consider when looking for the best split:
 |  
 |          - If int, then consider `max_features` features at each split.
 |          - If float, then `max_features` is a percentage and
 |

In [9]:
# Gradient boosting classifier with default hyper-parameters.
gb = GradientBoostingClassifier()
gb.fit(X_train, Y_train)

# Prediction is done on a dense copy of the validation matrix, as in the
# original cell. Densify and predict exactly once: the dense copy is large,
# and the same predictions feed both metrics below.
gb_pred = gb.predict(X_val.toarray())

# A single parenthesised argument prints identically under the Python 2 print
# statement and the Python 3 print function.
print('Accuracy =  %s' % metrics.accuracy_score(Y_test, gb_pred))
target_names = ['REAL', 'FAKE']  # display names for labels 0 and 1, in that order
print(metrics.classification_report(Y_test, gb_pred,
                                    target_names=target_names))

Accuracy =  0.826456600866
             precision    recall  f1-score   support

       REAL       0.76      0.96      0.85     75278
       FAKE       0.95      0.69      0.80     75278

avg / total       0.85      0.83      0.82    150556



In [10]:
# Extremely randomized tree classifier (single tree, 'best' splitter).
ert = ExtraTreeClassifier(splitter='best')
ert.fit(X_train, Y_train)

# Predict once, directly on the sparse validation matrix, and use the same
# predictions for both metrics. Previously the accuracy was computed from a
# dense copy and the report from the sparse matrix, running predict twice and
# allocating an unnecessary dense array.
ert_pred = ert.predict(X_val)

# A single parenthesised argument prints identically under the Python 2 print
# statement and the Python 3 print function.
print('Accuracy =  %s' % metrics.accuracy_score(Y_test, ert_pred))
target_names = ['REAL', 'FAKE']  # display names for labels 0 and 1, in that order
print(metrics.classification_report(Y_test, ert_pred,
                                    target_names=target_names))

Accuracy =  0.76075347379
             precision    recall  f1-score   support

       REAL       0.76      0.77      0.76     75278
       FAKE       0.76      0.75      0.76     75278

avg / total       0.76      0.76      0.76    150556



In [11]:
# Persist the fitted decision tree to disk.
# Pickle output is a byte stream, so the file is opened in binary mode ('wb');
# the with-block guarantees the handle is flushed and closed even if dump() raises.
with open('decisiontreeclassifiermodel.pkl', 'wb') as model_file:
    pickle.dump(decisiontree, model_file)

In [5]:
# MLP classifier as in the FOIL paper (adam solver, alpha=1e-5, fixed seed).
mlp = MLPClassifier(solver='adam', alpha=1e-5, random_state=1)
mlp.fit(X_train, Y_train)

# Predict once, directly on the sparse validation matrix, and use the same
# predictions for both metrics. Previously the accuracy was computed from a
# dense copy and the report from the sparse matrix, running predict twice and
# allocating an unnecessary dense array.
mlp_pred = mlp.predict(X_val)

# A single parenthesised argument prints identically under the Python 2 print
# statement and the Python 3 print function.
print('Accuracy =  %s' % metrics.accuracy_score(Y_test, mlp_pred))
target_names = ['REAL', 'FAKE']  # display names for labels 0 and 1, in that order
print(metrics.classification_report(Y_test, mlp_pred,
                                    target_names=target_names, digits=4))

Accuracy =  0.753020438571
             precision    recall  f1-score   support

       REAL     0.6923    0.8930    0.7799     30263
       FAKE     0.8574    0.6185    0.7186     31483

avg / total     0.7765    0.7530    0.7487     61746



Help on function classification_report in module sklearn.metrics.classification:

classification_report(y_true, y_pred, labels=None, target_names=None, sample_weight=None, digits=2)
    Build a text report showing the main classification metrics
    
    Read more in the :ref:`User Guide <classification_report>`.
    
    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    
    labels : array, shape = [n_labels]
        Optional list of label indices to include in the report.
    
    target_names : list of strings
        Optional display names matching the labels (same order).
    
    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.
    
    digits : int
        Number of digits for formatting output floating point values
    
   