In [1]:
from __future__ import division
import json
import scipy.sparse as ss
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy.io as sio

from collections import defaultdict, Counter

from sklearn.preprocessing import binarize
from sklearn.svm import LinearSVC, NuSVC, SVC 
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from tqdm import tqdm
from operator import itemgetter
np.random.seed(123)

In [2]:
# loading the newdata file 
newdata = [w.strip().split('\t') for w in open('newdata/mcic-coco/data/mc_mscoco_train_foilformat.txt')]

# loading gold image features 
ourimagetrainfeats = [((w.strip().split()[0]), np.array(map(float, w.strip().split()[1:]))) 
                      for w in open('data/mscoco_boc_gt_train2014.txt')]

# converting into a dictionary as image feats are constant across 5 different captions / image
ourimagetrainfeats = dict(ourimagetrainfeats)

# getting images
newdataimages = [ourimagetrainfeats[l[0]] for l in newdata]

# Retaining Captions
training_annotations = [l[1] for l in newdata]

training_labels = [l[2] for l in newdata]

# Loading MSCOCO object category names
categories = [w.strip().split('\t')[1] for w in open('data/categories.txt')]

# Printing dataset samples 
print len(training_annotations), len(newdata)


1998645 1998645


In [3]:
training_annotations[0]

'two girls and one boy playing video games .'

In [4]:
# Bag-of-words caption features. CountVectorizer produces raw term counts;
# captions are lowercased and the vocabulary size is left uncapped.
tf_vectorizer = CountVectorizer(lowercase=True, max_features=None)

# Learn the vocabulary from the training captions and keep the fitted model,
# so that the same vocabulary can be applied to other captions.
tf_model = tf_vectorizer.fit(training_annotations)

# Sparse caption-by-term count matrix for the training set
training_feats = tf_model.transform(training_annotations)

# Binary targets: 0 for 'REAL' captions, 1 for anything else
training_y = [int(w != 'REAL') for w in training_labels]

# One image feature vector per caption, in caption order
training_image_feats = newdataimages

# Sparse version of the image features
training_image_feats_sparse = ss.csr_matrix(np.asarray(training_image_feats))

In [5]:
# Printing the exact bag of words active for the first sample
print np.array(tf_model.get_feature_names())[([np.array(training_feats[0].todense()) > 0][0][0]).tolist()].tolist()

# printing the original sentence
print 'Original Sentence: ',   training_annotations[0]

# checking if Y is correct -> Original = 0 and Fake = 1 
print 'VALUE: ', training_y[0], training_labels[0]

[u'and', u'boy', u'games', u'girls', u'one', u'playing', u'two', u'video']
Original Sentence:  two girls and one boy playing video games .
VALUE:  1 FAKE


In [6]:
# Load the test split: one tab-separated record per line,
# read below as (image id, caption, label).
with open('newdata/mcic-coco/data/mc_mscoco_test_foilformat.txt') as test_file:
    newtestdata = [line.strip().split('\t') for line in test_file]

# Load the gold image features, keyed by image id.
# NOTE(review): this is the same train2014 feature file that the training cell
# reads -- confirm that it is the intended source for the test images.
with open('data/mscoco_boc_gt_train2014.txt') as feats_file:
    ourimagetestfeats = dict(
        (fields[0], np.array([float(v) for v in fields[1:]]))
        for fields in (line.strip().split() for line in feats_file))

# Captions (column 1)
testing_annotations = [record[1] for record in newtestdata]

# Labels (column 2)
testing_labels = [record[2] for record in newtestdata]

# Apply the vectorizer that was fitted on the training captions to the test captions
testing_feats = tf_model.transform(testing_annotations)

# Binary targets: 0 for 'REAL' captions, 1 for anything else
testing_y = [0 if label == 'REAL' else 1 for label in testing_labels]

# Image feature vector for every test record (column 0 is the image id).
# The lookup uses the dictionary loaded in this cell instead of the one left
# behind by the training cell, so the cell no longer depends on that state.
testing_image_feats = [ourimagetestfeats[record[0]] for record in newtestdata]

# Sparse version of the image features
testing_image_feats_sparse = ss.csr_matrix(np.array(testing_image_feats))

In [7]:
# Language-only experiment: caption bag-of-words as inputs
X_train, X_test = training_feats, testing_feats

# Targets as numpy arrays
Y_train, Y_test = np.array(training_y), np.array(testing_y)

In [33]:
testing_labels[:5], sum(Y_train), len(Y_train), sum(Y_test), len(Y_test)

(['FAKE', 'FAKE', 'FAKE', 'FAKE', 'REAL'], 1598916, 1998645, 29328, 36660)

In [34]:
# using a MultiLayerPerceptron model - default settings

mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)

# fitting over training data
mlp.fit(X_train, Y_train)

# printing overall accuracy 
print 'Accuracy = ', metrics.accuracy_score(Y_test, mlp.predict(X_test.toarray()))

# printing the precision and recall over each class 
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, mlp.predict(X_test.toarray()), 
                                    target_names=target_names, digits=4)

# printing accuracy over each class
cmat = metrics.confusion_matrix(Y_test, mlp.predict(X_test.toarray()))
print zip(target_names, cmat.diagonal()/cmat.sum(axis=1))

Accuracy =  0.8058101473
             precision    recall  f1-score   support

       REAL     0.5516    0.1552    0.2423      7332
       FAKE     0.8210    0.9685    0.8886     29328

avg / total     0.7671    0.8058    0.7594     36660

[('REAL', 0.15521003818876158), ('FAKE', 0.96846017457719591)]


In [2]:
(0.82 + 0.55) / 2

0.685

In [8]:
# using a DecisionTreeClassifier 
dtc = DecisionTreeClassifier(class_weight='balanced')

# fitting over training data
dtc.fit(X_train, Y_train)


# printing overall accuracy 
print 'Accuracy = ', metrics.accuracy_score(Y_test, dtc.predict(X_test.toarray()))

# printing the precision and recall over each class 
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, dtc.predict(X_test.toarray()), 
                                    target_names=target_names, digits=4)

# printing accuracy over each class
cmat = metrics.confusion_matrix(Y_test, dtc.predict(X_test.toarray()))
print zip(target_names, cmat.diagonal()/cmat.sum(axis=1))


Accuracy =  0.589934533552
             precision    recall  f1-score   support

       REAL     0.1523    0.2300    0.1832      7332
       FAKE     0.7793    0.6799    0.7262     29328

avg / total     0.6539    0.5899    0.6176     36660

[('REAL', 0.22995090016366612), ('FAKE', 0.67993044189852703)]


In [9]:
# Image-only experiment: per-caption image feature vectors as inputs
X_train, X_test = training_image_feats, testing_image_feats

# Targets as numpy arrays
Y_train, Y_test = np.array(training_y), np.array(testing_y)

In [10]:
# using a MultiLayerPerceptron model - default settings

mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
mlp.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, mlp.predict(X_test))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, mlp.predict(X_test), 
                                    target_names=target_names, digits=4)
cmat = metrics.confusion_matrix(Y_test, mlp.predict(X_test))
print zip(target_names, cmat.diagonal()/cmat.sum(axis=1))

Accuracy =  0.8
             precision    recall  f1-score   support

       REAL     0.0000    0.0000    0.0000      7332
       FAKE     0.8000    1.0000    0.8889     29328

avg / total     0.6400    0.8000    0.7111     36660

[('REAL', 0.0), ('FAKE', 1.0)]


  'precision', 'predicted', average, warn_for)


In [37]:
# Joint experiment: caption bag-of-words columns followed by the image-feature columns
X_train = ss.hstack([training_feats, training_image_feats_sparse])
X_test = ss.hstack([testing_feats, testing_image_feats_sparse])

# Targets as numpy arrays
Y_train, Y_test = np.array(training_y), np.array(testing_y)

In [38]:
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
mlp.fit(X_train, Y_train)
print 'Accuracy = ', metrics.accuracy_score(Y_test, mlp.predict(X_test.toarray()))
target_names = ['REAL', 'FAKE']
print metrics.classification_report(Y_test, mlp.predict(X_test.toarray()), 
                                    target_names=target_names, digits=4)
cmat = metrics.confusion_matrix(Y_test, mlp.predict(X_test.toarray()))
print zip(target_names, cmat.diagonal()/cmat.sum(axis=1))

Accuracy =  0.813502454992
             precision    recall  f1-score   support

       REAL     0.6118    0.1847    0.2837      7332
       FAKE     0.8265    0.9707    0.8928     29328

avg / total     0.7835    0.8135    0.7710     36660

[('REAL', 0.18466993998908893), ('FAKE', 0.97071058374249863)]


In [18]:
# Dense version of the test matrix. The inspection cells below read `testmat`,
# so it has to be defined here for the notebook to run on a fresh kernel.
# Note: this allocates a full rows x columns array.
testmat = X_test.toarray()

# Row numbers of the test samples whose label is positive
carr = np.arange(len(Y_test))
carr[Y_test > 0]

array([     2,      3,      7, ..., 150553, 150554, 150555])

In [29]:
# Column numbers whose value is greater than zero in test row 2
rarr = np.arange(testmat.shape[1])
rarr[testmat[2] > 0]

array([ 1196, 10109, 11385, 11455, 13985, 17267, 17917])

In [28]:
mlp.predict_proba(testmat[2].reshape(-1,1).T).argmax() 

1

In [37]:
# Binds a second name to the same object; no copy is made here.
# NOTE(review): if testmat is a numpy array, in-place writes through `tmat`
# are also visible through `testmat` -- use testmat.copy() if that is not intended.
tmat = testmat 

In [60]:
tmat[2, 17267] 

1

In [82]:
# Overwrite five columns of row 2 with the value 1, in place.
# NOTE(review): `tmat` was bound with `tmat = testmat`, so this presumably
# modifies `testmat` as well; cells that read either name afterwards see the
# edited row. Confirm that writing 1 (rather than 0) is the intended probe.
tmat[2, 1196] = 1
tmat[2, 10109] = 1
tmat[2, 11385] = 1
tmat[2, 11455] = 1
tmat[2, 17917] = 1

In [85]:
mlp.predict_proba(tmat[2].reshape(-1,1).T)[0][1] - mlp.predict_proba(tmat[2].reshape(-1,1).T)[0][0]

0.85962384428013072

In [72]:
np.array(tf_model.get_feature_names())[([np.array(testing_feats[2].todense()) > 0][0][0]).tolist()].tolist()

[u'back', u'man', u'of', u'on', u'riding', u'the', u'truck']

In [77]:
tf_model.get_feature_names()[17917]

u'truck'

In [20]:
def detect_foil(X, Y, model, show_progress=True):
    """Guess, for every positively labelled sample, which feature is the foil.

    Each active (> 0) feature of a sample is zeroed in turn and the sample is
    re-scored by the model. The feature whose removal yields the lowest margin
    ``P(class 1) - P(class 0)`` is reported for that sample.

    Parameters
    ----------
    X : sparse matrix or array-like, shape (n_samples, n_features)
        Feature rows. Objects exposing ``toarray()`` are densified; anything
        else is copied into a numpy array. The caller's data is never modified.
    Y : array-like, shape (n_samples,)
        Labels aligned with the rows of ``X``; only rows with ``Y > 0`` are analysed.
    model : object
        Fitted classifier whose ``predict_proba`` returns one row per input
        row, with the class-0 and class-1 probabilities in columns 0 and 1.
    show_progress : bool, optional
        Wrap the sample loop in ``tqdm`` (default). Pass ``False`` to run silently.

    Returns
    -------
    list
        One feature (column) index per analysed sample, in row order. A sample
        without any active feature contributes ``-1``. Ties are resolved in
        favour of the lowest column index.
    """
    dense = X.toarray() if hasattr(X, 'toarray') else np.array(X)

    # Select the rows from the labels that were passed in, not from a notebook global.
    foil_rows = np.flatnonzero(np.asarray(Y) > 0)

    retvals = []
    for s in (tqdm(foil_rows) if show_progress else foil_rows):
        row = dense[s]
        active = np.flatnonzero(row > 0)
        if active.size == 0:
            # Nothing to ablate, so there is no candidate feature for this sample.
            retvals.append(-1)
            continue

        # One copy of the row per active feature, each with that single feature
        # zeroed, scored with a single predict_proba call.
        ablated = np.tile(row, (active.size, 1))
        ablated[np.arange(active.size), active] = 0
        proba = model.predict_proba(ablated)
        margins = proba[:, 1] - proba[:, 0]

        # The feature whose removal lowers the margin the most is the guess.
        retvals.append(active[np.argmin(margins)])

    return retvals


    
    
    
    

In [21]:
retvals = detect_foil(X_test, Y_test, mlp)

100%|██████████| 75278/75278 [11:36<00:00, 108.07it/s]


In [97]:
len(carr[Y_test > 0]), len(retvals)

(75278, 75278)

In [16]:
retvals

[17917, 17729, 800, 2857, 7806, 18122, 1225, 18036, 17522, 15257]

In [22]:
# Column index of the annotated foil word for every positively labelled test sample.
# NOTE(review): `foil_test` is not defined in any cell shown in this notebook --
# the FOIL test annotations have to be loaded before this cell can run.
featnames = tf_model.get_feature_names()

# vocabulary_ maps each term to its column index, so every lookup is a dict
# access instead of a linear scan of the feature-name list.
word_to_column = tf_model.vocabulary_
farr = [word_to_column[foil_test['annotations'][i]['foil_word']]
        for i in carr[Y_test > 0]]

In [23]:
len(retvals), len(farr)

(75278, 75278)

In [25]:
metrics.accuracy_score(retvals, farr)

0.953532240495231