In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os

import ujson
import gensim
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

from cleaner import Cleaner
from tagger import Tagger

sns.set(color_codes=True)

Using TensorFlow backend.


## Data Cleaning

In [3]:
data_path = 'data/8_fnc-1/'
train_bodies = pd.read_csv(data_path + 'competition_test_bodies.csv')
train_stances = pd.read_csv(data_path + 'competition_test_stances.csv')

In [4]:
def cached(path, fn):
    if not os.path.exists(path):
        with open(path, 'w') as _out:
            result = fn()
            ujson.dump(result, _out)
            return result
    
    with open(path, 'r') as _in:
         return ujson.load(_in)

In [5]:
train_bodies_tagged = cached(data_path + 'ctrain_bodies_tagged.json',
                             lambda: Tagger.batch_perform(Cleaner.batch_perform(train_bodies['articleBody']), 4, 100))

In [6]:
train_stances_tagged = cached(data_path + 'ctrain_stances_tagged.json',
                              lambda: Tagger.batch_perform(Cleaner.batch_perform(train_stances['Headline']), 4, 100))

## Load word2vec
https://drive.google.com/file/u/1/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing

In [7]:
word2vec = KeyedVectors.load_word2vec_format('/home/ubuntu/GoogleNews-vectors-negative300.bin', 
                                            binary=True)

In [8]:
def tagged_to_w2v(tagged, model):
    w2v = []
    missed = 0
    for words in tagged:
        words_w2v = [model[w] for w in words if w in model]
        w2v.append(words_w2v)
        
        missed += len(words) - len(words_w2v)

    return w2v, missed

In [9]:
train_bodies_w2v, train_bodies_missed_words = tagged_to_w2v(train_bodies_tagged, word2vec)
print('Missed words', train_bodies_missed_words, '/', sum(len(ws) for ws in train_bodies_tagged))

Missed words 11002 / 163025


In [10]:
train_stances_w2v, train_stances_missed_words = tagged_to_w2v(train_stances_tagged, word2vec)
print('Missed words', train_stances_missed_words, '/', sum(len(ws) for ws in train_stances_tagged))

Missed words 14728 / 194455


## Construct training vectors

In [11]:
body_max_len = max(len(ws) for ws in train_bodies_w2v)
stance_max_len = max(len(ws) for ws in train_stances_w2v)

In [12]:
body_max_len, stance_max_len

(1727, 20)

In [13]:
(len(train_stances_w2v), body_max_len, stance_max_len, 300)

(25413, 1727, 20, 300)

In [14]:
x_train = np.zeros((len(train_stances_w2v), 500 + stance_max_len + 1, 300), dtype=np.float32)
x_train.shape

(25413, 521, 300)

In [15]:
for stance_id, headline, body_id, stance in tqdm(train_stances.itertuples()):
    headline = np.array(train_stances_w2v[stance_id]) 
    if headline.shape[0] == 0:
        headline = np.zeros((0, 300))

    x_train_row = np.concatenate(
        (headline, np.zeros((0, 300)), 
         (train_bodies_w2v[train_bodies.loc[train_bodies['Body ID'] == body_id].index.values[0]])[:500]),
        axis=0)
    x_train[stance_id][:x_train_row.shape[0]] = x_train_row

25413it [00:19, 1286.92it/s]


In [16]:
y_train = np.zeros((len(train_stances), len(set(train_stances['Stance']))))

In [17]:
possible_stances = list(set(train_stances['Stance']))
for idx, stance in enumerate(train_stances['Stance']):
    y_train[idx][possible_stances.index(stance)] = 1

## Distribute Data

In [18]:
x_test = x_train[:100]
y_test = y_train[:100]
x_train = x_train[:-100]
y_train = y_train[:-100]

## Train CNN

In [27]:
filters = 250
kernel_size = 3
hidden_dims = 250

batch_size = 32
epochs = 10

In [20]:
cnn = Sequential()

In [21]:
cnn.add(Conv1D(filters, kernel_size, input_shape=(x_train.shape[1], x_train.shape[2]), padding='valid', 
               activation='relu', strides=1))

In [22]:
cnn.add(GlobalMaxPooling1D())

In [23]:
# We add a vanilla hidden layer:
cnn.add(Dense(hidden_dims))
cnn.add(Dropout(0.2))
cnn.add(Activation('relu'))

In [24]:
# We project onto a single unit output layer, and squash it with a sigmoid:
cnn.add(Dense(4))
cnn.add(Activation('sigmoid'))

In [25]:
cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
cnn.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.05)

Train on 24047 samples, validate on 1266 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

In [None]:
cnn.save('data/fnc-1.model')

## Test

In [28]:
cnn = load_model('data/fnc-1.model')
scores = cnn.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

ValueError: Error when checking input: expected conv1d_1_input to have shape (None, 523, 300) but got array with shape (100, 521, 300)