In [27]:
%matplotlib inline
%load_ext autoreload

import os
import subprocess

import ujson
import gensim
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

from fnc_score import report_score

from preprocessor import FNCDataPreProcessor

sns.set(color_codes=True)

Using TensorFlow backend.


In [13]:
# Root folders of the project checkout and of the compiled fastText sources.
# Both default to the layout of the original machine and can be overridden
# with the FNR_HOME / FASTTEXT_HOME environment variables.
# os.path.join(..., '') guarantees a trailing path separator, because later
# cells build file names by plain string concatenation onto these values.
path_fnr = os.path.join(os.environ.get('FNR_HOME', '/home/ubuntu/FakeNewsRecognition/'), '')
path_fasttext = os.path.join(os.environ.get('FASTTEXT_HOME', '/home/ubuntu/fastText/'), '')

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Data-preprocessing" data-toc-modified-id="Data-preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data preprocessing</a></span></li><li><span><a href="#Train-Simple-CNN" data-toc-modified-id="Train-Simple-CNN-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Train Simple CNN</a></span><ul class="toc-item"><li><span><a href="#Evaluate" data-toc-modified-id="Evaluate-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Evaluate</a></span></li></ul></li><li><span><a href="#Training-FastText-classifier" data-toc-modified-id="Training-FastText-classifier-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Training FastText classifier</a></span><ul class="toc-item"><li><span><a href="#Prepare-FNC-training-data" data-toc-modified-id="Prepare-FNC-training-data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Prepare FNC training data</a></span></li><li><span><a href="#Training-classifier" data-toc-modified-id="Training-classifier-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Training classifier</a></span></li><li><span><a href="#Results" data-toc-modified-id="Results-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Results</a></span></li></ul></li></ul></div>

# Data preprocessing

In [7]:
# FNC-1 dataset folder inside the project checkout; the trailing slash is kept
# so that file names can be appended directly.
path_data = path_fnr + 'data/8_fnc-1/'

In [8]:
# Labelled FNC-1 training data: article bodies and headline/stance pairs.
path_train_bodies = path_data + 'train_bodies.csv'
path_train_stances = path_data + 'train_stances.csv'

# Competition test set, stored in the same two-file layout.
path_competition_test_bodies = path_data + 'competition_test_bodies.csv'
path_competition_test_stances = path_data + 'competition_test_stances.csv'

In [None]:
# One preprocessor per dataset, each built from a bodies CSV and a stances CSV:
# the labelled training data and the competition test data.
preprocessor_train = FNCDataPreProcessor(path_train_bodies, path_train_stances)
preprocessor_competition = FNCDataPreProcessor(path_competition_test_bodies, path_competition_test_stances)

In [None]:
# Features and targets for the CNN, split into a train and a test part by the
# project's preprocessor.
# NOTE(review): feature_vec_body_size=250 presumably caps the number of body
# tokens per sample — confirm in preprocessor.py.
X_train, X_test, y_train, y_test = preprocessor_train.training_data(feature_vec_body_size=250)

In [None]:
# Same extraction for the competition data.
# NOTE(review): training_data() returns a train/test pair here as well, but
# only X_comp_train / y_comp_train are used in the evaluation cells — confirm
# that this does not leave part of the competition set unscored.
X_comp_train, X_comp_test, y_comp_train, y_comp_test = preprocessor_competition.training_data(feature_vec_body_size=250)

In [None]:
# Distinct stance labels present in the training data. Sorted so that the
# displayed order is the same on every run (set iteration order is not
# guaranteed to be stable).
sorted(set(preprocessor_train.stances_raw['Stance']))

In [None]:
# Sanity check: shapes of the feature and target arrays of both splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Train Simple CNN

In [None]:
def cnn_simple_model(input_shape, filters=250, kernel_size=3, hidden_dims=250, num_classes=4):
    """Build and compile a single-convolution classifier for one-hot stance targets.

    Architecture: Conv1D -> global max pooling -> dense hidden layer with
    dropout and ReLU -> ``num_classes``-way softmax.

    Args:
        input_shape: shape of one sample as ``(steps, channels)``; only the
            first two entries are read.
        filters: number of convolution filters.
        kernel_size: length of the 1-D convolution window.
        hidden_dims: number of units in the hidden dense layer.
        num_classes: number of mutually exclusive output classes
            (4 by default, as in the original model).

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    model.add(Conv1D(filters, kernel_size, input_shape=(input_shape[0], input_shape[1]),
                     padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPooling1D())
    # Vanilla hidden layer.
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    # One output unit per class. The targets are one-hot and the loss is
    # categorical cross-entropy, which expects the outputs to form a probability
    # distribution over the classes: that requires softmax. Independent sigmoids
    # do not sum to 1 and distort both the loss and the predicted scores.
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Shape of one sample handed to the first Conv1D layer: (steps, channels).
# NOTE(review): presumably 271 tokens x 300-dimensional word vectors; it has
# to equal X_train.shape[1:] — confirm against the shape check above.
input_shape = 271, 300

# Network size (same values as the defaults of cnn_simple_model).
filters = 250
kernel_size = 3
hidden_dims = 250

# Training schedule.
batch_size = 32
epochs = 3

In [None]:
# Build and fit the CNN on the first GPU; the held-out split is only used for
# the per-epoch validation metrics.
with tf.device('/gpu:0'):
    cnn_model = cnn_simple_model(
        input_shape,
        filters=filters,
        kernel_size=kernel_size,
        hidden_dims=hidden_dims
    )
    cnn_model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(X_test, y_test)
    )

In [None]:
# Persist the trained network (path is relative to the notebook's working directory).
# NOTE(review): the evaluation section loads 'data/fnc-1.model', not this
# file — confirm which checkpoint is meant to be evaluated.
cnn_model.save('data/fnc-1-categorical-3-epochs.model')

## Evaluate 

In [None]:
# Replace the in-memory model with a checkpoint from disk.
# NOTE(review): this is not the file written by the save cell above
# ('data/fnc-1-categorical-3-epochs.model'); on a fresh run it must already
# exist — confirm that evaluating this checkpoint is intended.
cnn_model = load_model('data/fnc-1.model')

In [None]:
# Per-class scores for the FNC-1 training features and for the competition features.
# NOTE(review): X_train is the split passed to fit() above, so the first score
# reported below measures fit rather than generalisation; X_test is never
# predicted in this notebook — confirm this is intended.
y_pred = cnn_model.predict(X_train, verbose=1)
y_comp_pred = cnn_model.predict(X_comp_train, verbose=1)

In [None]:
# Index -> stance name lookup used to decode one-hot rows.
# Iteration order of a set is not guaranteed to be stable, so list(set(...))
# can pair the class indices with different names from one kernel session to
# the next, silently corrupting the decoded labels and the FNC score.
# Sorting makes the mapping deterministic.
# NOTE(review): sorted order is also the class order produced by scikit-learn's
# LabelEncoder — confirm that the preprocessor encodes stances that way.
labels = sorted(set(preprocessor_train.stances_raw['Stance']))
def to_labels(m, labels):
    """Decode a 2-D score matrix into label names.

    For every row of ``m`` the column holding the largest value is looked up
    in ``labels``; the resulting names are returned as a list, one per row.
    """
    winners = np.argmax(m, axis=1)
    return [labels[col] for col in winners]

In [None]:
# Turn one-hot targets and predicted scores back into stance names:
# first for the FNC-1 training split, then for the competition data.
y_train_labeled, y_pred_labeled = (
    to_labels(scores, labels) for scores in (y_train, y_pred)
)
y_comp_train_labeled, y_comp_pred_labeled = (
    to_labels(scores, labels) for scores in (y_comp_train, y_comp_pred)
)

In [None]:
# Spot-check the decoded competition targets. Only the first entries are
# displayed: echoing the complete list would put one output line per sample
# into the notebook.
y_comp_train_labeled = to_labels(y_comp_train, labels)
y_comp_train_labeled[:10]

In [None]:
# Score the decoded predictions against the decoded targets with the project's
# fnc_score helper: first the training split, then the competition data.
report_score(y_train_labeled,      y_pred_labeled)
report_score(y_comp_train_labeled, y_comp_pred_labeled)

#  Training FastText classifier

https://github.com/facebookresearch/fastText

Mostly from tutorial: https://github.com/facebookresearch/fastText/blob/master/tutorials/supervised-learning.md

The fastText checkout, containing the compiled `./fasttext` binary, is expected in the home directory (`~/fastText`, see `path_fasttext` above).

Change log:
1. The first attempt used the raw text without any cleaning, but the predictions looked essentially random.

## Prepare FNC training data

FastText supervised algorithm requires data in the following format: 
```
__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?
...
```

In [9]:
# Files exchanged with the fastText command-line tool live in a 'fasttext/'
# sub-folder of the FNC-1 data directory.
# NOTE(review): no visible cell creates this folder; it presumably has to
# exist before the export cells run.
path_data_fasttext = path_data + 'fasttext/'
# Examples exported from the labelled training data.
path_data_fasttext_supervised_train = path_data_fasttext + 'supervised.train'
# Examples exported from the competition data.
path_data_fasttext_supervised_competition = path_data_fasttext + 'supervised.competition_train'

In [None]:
def to_fasttext_classifier_simple(pp, path):
    """Write raw headline/body pairs to ``path`` in fastText supervised format.

    One line is written per row of ``pp.stances_raw``::

        __label__<stance> <headline> -- <body>

    Args:
        pp: preprocessor exposing ``stances_raw`` (rows unpack to index,
            headline, body id, stance) and ``bodies_raw`` (columns
            ``'Body ID'`` and ``articleBody``).
        path: output file; it is overwritten if it already exists.

    Raises:
        KeyError: if a stance row refers to a body id missing from ``bodies_raw``.
    """
    # Resolve every Body ID once instead of scanning bodies_raw for each
    # stance row; keeping the first occurrence mirrors the former `.values[0]`.
    unique_bodies = pp.bodies_raw.drop_duplicates(subset='Body ID')
    body_by_id = dict(zip(unique_bodies['Body ID'], unique_bodies['articleBody']))

    with open(path, 'w') as _out:
        # itertuples() has no length, so the total is passed explicitly to get
        # a real progress bar.
        rows = tqdm(pp.stances_raw.itertuples(), total=len(pp.stances_raw))
        for _, headline, body_id, stance in rows:
            # fastText expects one example per line with whitespace-separated
            # tokens. Line breaks are turned into single spaces: deleting them
            # would glue the last word of a line to the first word of the
            # next, and a line break inside the text would split the example.
            headline = ' '.join(headline.split())
            body = ' '.join(body_by_id[body_id].split())
            _out.write('__label__%s %s -- %s\n' % (stance, headline, body))
            
def to_fasttext_classifier_tagged(pp, path):
    """Write tagged headline/body pairs to ``path`` in fastText supervised format.

    One line is written per row of ``pp.stances_raw``::

        __label__<stance> <tagged headline tokens> -- <tagged body tokens>

    Args:
        pp: preprocessor exposing ``stances_raw`` and ``bodies_raw`` plus
            ``stances_tagged`` (indexed by the position of the stance row) and
            ``bodies_tagged`` (indexed by the ``bodies_raw`` index label of the
            body); every entry is a sequence of string tokens.
        path: output file; it is overwritten if it already exists.

    Raises:
        KeyError: if a stance row refers to a body id missing from ``bodies_raw``.
    """
    # Map every Body ID to its bodies_raw index label once, instead of
    # filtering the whole frame for each stance row; keeping the first
    # occurrence mirrors the former `.index.values[0]`.
    unique_bodies = pp.bodies_raw.drop_duplicates(subset='Body ID')
    body_idx_by_id = dict(zip(unique_bodies['Body ID'], unique_bodies.index))

    with open(path, 'w') as _out:
        # Wrapping the iterator lets tqdm advance and close the bar itself;
        # the manually updated bar was never closed.
        rows = tqdm(pp.stances_raw.itertuples(), total=len(pp.stances_raw))
        for stance_idx, (_, _headline, body_id, stance) in enumerate(rows):
            tagged_headline = ' '.join(pp.stances_tagged[stance_idx])
            tagged_body = ' '.join(pp.bodies_tagged[body_idx_by_id[body_id]])
            _out.write('__label__%s %s -- %s\n' % (stance, tagged_headline, tagged_body))

In [None]:
# Export the labelled training data in fastText format.
to_fasttext_classifier_tagged(preprocessor_train, path_data_fasttext_supervised_train)

In [None]:
# Export the competition data in the same format.
to_fasttext_classifier_tagged(preprocessor_competition, path_data_fasttext_supervised_competition)

## Training classifier

In [28]:
# Output prefix handed to `fasttext supervised`; the test cells below load
# the file "<prefix>.bin".
path_data_fasttext_model_train = path_data_fasttext + 'model_train'

In [32]:
# Train the supervised fastText model (25 epochs, word n-grams up to 5).
# The command is passed as an argument list and started from the fastText
# folder via `cwd`, so no shell is involved and paths containing spaces or
# shell metacharacters are passed through unchanged. check_call returns 0 on
# success and raises CalledProcessError on a non-zero exit status instead of
# silently returning it.
subprocess.check_call(
    ['./fasttext', 'supervised',
     '-epoch', '25',
     '-wordNgrams', '5',
     '-input', path_data_fasttext_supervised_train,
     '-output', path_data_fasttext_model_train],
    cwd=path_fasttext)

0

In [33]:
# Evaluate the trained model on the file it was trained on.
# Argument list + cwd instead of a shell string: no quoting issues, and
# check_call raises CalledProcessError if fastText exits with an error
# (it returns 0 on success).
subprocess.check_call(
    ['./fasttext', 'test',
     path_data_fasttext_model_train + '.bin',
     path_data_fasttext_supervised_train],
    cwd=path_fasttext)

0

In [34]:
# Evaluate the trained model on the exported competition data.
# Argument list + cwd instead of a shell string: no quoting issues, and
# check_call raises CalledProcessError if fastText exits with an error
# (it returns 0 on success).
subprocess.check_call(
    ['./fasttext', 'test',
     path_data_fasttext_model_train + '.bin',
     path_data_fasttext_supervised_competition],
    cwd=path_fasttext)

0

## Results

Precision and recall on the training data (the same file the model was trained on):

```
N       49972
P@1     0.863
R@1     0.863
Number of examples: 49972
```

Precision and recall for the competition data:

```
N       25413
P@1     0.593
R@1     0.593
Number of examples: 25413
```

**Not much worse than CNN above!**