In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os

import ujson
import gensim
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

from cleaner import Cleaner
from tagger import Tagger

# Apply seaborn's default plot theme; color_codes=True lets matplotlib shorthand
# colour codes such as 'b' and 'g' resolve to seaborn palette colours.
sns.set(color_codes=True)

## Data Cleaning

In [None]:
# Directory holding the dataset CSVs. The trailing slash matters: other cells build
# cache file paths from it by plain string concatenation.
data_path = 'data/8_fnc-1/'

# train_bodies holds the article texts, train_stances the headline/stance rows that
# refer to those articles through the 'Body ID' column.
train_bodies, train_stances = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train_bodies.csv', 'train_stances.csv')
)

In [None]:
def cached(path, fn):
    """Return the JSON-cached result of ``fn``, computing and storing it on a miss.

    Args:
        path: Location of the JSON cache file.
        fn: Zero-argument callable that produces a JSON-serialisable value.

    Returns:
        The value loaded from ``path`` when that file exists. Otherwise the object
        returned by ``fn()`` itself (returned as-is, not round-tripped through JSON).

    Raises:
        Whatever ``fn`` raises; in that case no cache file is created.
    """
    if os.path.exists(path):
        with open(path, 'r') as _in:
            return ujson.load(_in)

    # Compute before touching the filesystem, so a failing ``fn`` cannot leave an
    # empty cache file behind that would break every later load.
    result = fn()

    # Write to a sibling temp file and rename it into place, so an interrupted dump
    # never leaves a truncated file at ``path``.
    tmp_path = path + '.tmp'
    try:
        with open(tmp_path, 'w') as _out:
            ujson.dump(result, _out)  # dump takes the open file handle, not the path
        os.replace(tmp_path, path)
    finally:
        # Only still present if the dump or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return result

In [None]:
def _tag_train_bodies():
    """Clean every article body, then run the tagger over the cleaned texts."""
    cleaned_bodies = Cleaner.batch_perform(train_bodies['articleBody'])
    # The trailing arguments (2, 100) are handed to Tagger.batch_perform unchanged;
    # their meaning is defined in tagger.py, which is not shown here.
    return Tagger.batch_perform(cleaned_bodies, 2, 100)


# Tagging only runs when the JSON cache file does not exist yet.
train_bodies_tagged = cached(data_path + 'train_bodies_tagged.json', _tag_train_bodies)

In [None]:
def _tag_train_headlines():
    """Clean every headline, then run the tagger over the cleaned texts."""
    cleaned_headlines = Cleaner.batch_perform(train_stances['Headline'])
    # The trailing arguments (2, 100) are handed to Tagger.batch_perform unchanged;
    # their meaning is defined in tagger.py, which is not shown here.
    return Tagger.batch_perform(cleaned_headlines, 2, 100)


# Tagging only runs when the JSON cache file does not exist yet.
train_stances_tagged = cached(data_path + 'train_stances_tagged.json', _tag_train_headlines)

## Load word2vec
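The pretrained GoogleNews word2vec vectors (`GoogleNews-vectors-negative300.bin`, loaded below) can be downloaded from: https://drive.google.com/file/u/1/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing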

In [None]:
# Location of the pretrained binary word2vec file. Set the WORD2VEC_PATH environment
# variable to point at your own copy; the previous hard-coded location remains the
# fallback so existing setups keep working.
word2vec_path = os.environ.get('WORD2VEC_PATH',
                               '/Users/several27/GoogleNews-vectors-negative300.bin')

# binary=True because the file is in the binary word2vec format, not the text format.
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
def tagged_to_w2v(tagged, model):
    """Look up an embedding for every in-vocabulary token of every document.

    Args:
        tagged: Sequence of documents, each a sequence of tokens.
        model: Mapping-like embedding lookup supporting ``token in model`` and
            ``model[token]``.

    Returns:
        A ``(vectors, missed)`` pair. ``vectors`` has one list per document with the
        embeddings of the tokens found in ``model``, in their original order.
        ``missed`` is the total number of tokens, over all documents, that were not
        found and were therefore dropped.
    """
    vectors_per_doc = []
    skipped_total = 0

    for tokens in tagged:
        known_vectors = []
        for token in tokens:
            if token in model:
                known_vectors.append(model[token])

        # Tokens without an embedding are dropped silently; only their count is kept.
        skipped_total += len(tokens) - len(known_vectors)
        vectors_per_doc.append(known_vectors)

    return vectors_per_doc, skipped_total

In [None]:
# Embed the tagged article bodies and report how many tokens had no word2vec entry.
train_bodies_w2v, train_bodies_missed_words = tagged_to_w2v(train_bodies_tagged, word2vec)
train_bodies_total_words = sum(map(len, train_bodies_tagged))
print('Missed words', train_bodies_missed_words, '/', train_bodies_total_words)

In [None]:
# Embed the tagged headlines and report how many tokens had no word2vec entry.
train_stances_w2v, train_stances_missed_words = tagged_to_w2v(train_stances_tagged, word2vec)
train_stances_total_words = sum(map(len, train_stances_tagged))
print('Missed words', train_stances_missed_words, '/', train_stances_total_words)

## Construct training vectors

In [None]:
# Longest embedded body / headline, counted in word vectors; together they size the
# padded input tensor.
body_max_len = max(map(len, train_bodies_w2v))
stance_max_len = max(map(len, train_stances_w2v))

In [None]:
# Display both maxima (counted in embedded word vectors) as a quick sanity check.
body_max_len, stance_max_len

In [None]:
# One zero-padded row per headline/body pair, laid out as
# [headline vectors | 1 separator slot | body vectors], each vector 300-dimensional.
# float32 instead of NumPy's float64 default: gensim loads word2vec vectors as float32
# by default, so nothing is lost and this (very large) tensor needs half the memory.
x_train = np.zeros(
    (len(train_stances_w2v), body_max_len + stance_max_len + 1, 300),
    dtype=np.float32,
)
x_train.shape

In [None]:
# Map every 'Body ID' to its row position once, instead of scanning the whole
# train_bodies frame for each headline. setdefault keeps the first occurrence, which
# matches the previous "first matching row" lookup.
body_row_by_id = {}
for body_row, known_body_id in enumerate(train_bodies['Body ID']):
    body_row_by_id.setdefault(known_body_id, body_row)

embedding_dim = x_train.shape[2]

# A single all-zero row between headline and body. x_train reserves exactly one extra
# time step for it (the "+ 1" in its second dimension); the earlier zero-row
# placeholder left that slot unused.
separator = np.zeros((1, embedding_dim))

# stance_id is the frame's index label and is used as a position into
# train_stances_w2v / x_train, which assumes the default 0..n-1 index from read_csv.
for stance_id, headline, body_id, stance in tqdm(train_stances.itertuples(),
                                                 total=len(train_stances)):
    headline_seq = train_stances_w2v[stance_id]
    body_seq = train_bodies_w2v[body_row_by_id[body_id]]

    # Force an explicit (n_words, embedding_dim) shape: a text with no in-vocabulary
    # words would otherwise become a 1-D empty array and break np.concatenate.
    headline_vecs = np.asarray(headline_seq).reshape(len(headline_seq), embedding_dim)
    body_vecs = np.asarray(body_seq).reshape(len(body_seq), embedding_dim)

    x_train_row = np.concatenate((headline_vecs, separator, body_vecs), axis=0)
    # Positions past the end of this pair keep their zero padding.
    x_train[stance_id][:x_train_row.shape[0]] = x_train_row

In [None]:
# One row per headline/body pair and one column per distinct stance label; filled
# with zeros here and turned into one-hot rows afterwards.
n_samples = len(train_stances)
n_classes = len(set(train_stances['Stance']))
y_train = np.zeros((n_samples, n_classes))

In [None]:
# Sort the labels so that every stance maps to the same one-hot column on each run;
# the iteration order of a set of strings can differ between interpreter sessions,
# which would silently reshuffle the output columns.
possible_stances = sorted(set(train_stances['Stance']))
stance_to_column = {label: column for column, label in enumerate(possible_stances)}

# Set the column of each row's stance to 1, turning y_train into one-hot targets.
for idx, stance in enumerate(train_stances['Stance']):
    y_train[idx][stance_to_column[stance]] = 1

## Train CNN

In [None]:
# Model hyperparameters.
filters = 250  # number of output filters of the Conv1D layer
kernel_size = 3  # width of the convolution window, in time steps (word vectors)
hidden_dims = 250  # number of units in the hidden Dense layer

# Training settings passed to cnn.fit.
batch_size = 32
epochs = 2

In [None]:
# Start from an empty linear stack; the following cells append its layers one by one.
cnn = Sequential()

In [None]:
# First layer: a 1-D convolution sliding along the time-step axis of each sample.
# The input shape is everything but the batch axis of x_train.
conv_layer = Conv1D(
    filters,
    kernel_size,
    strides=1,
    padding='valid',
    activation='relu',
    input_shape=x_train.shape[1:],
)
cnn.add(conv_layer)

In [None]:
# Keep only the maximum response of each filter over all time steps, which gives one
# fixed-size feature vector per sample.
cnn.add(GlobalMaxPooling1D())

In [None]:
# Fully connected hidden block, appended in this order: linear projection,
# dropout at rate 0.2, then the ReLU non-linearity.
hidden_block = (
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),
)
for hidden_layer in hidden_block:
    cnn.add(hidden_layer)

In [None]:
# Output layer: one unit per stance class (the width is taken from the one-hot
# targets rather than hard-coded). Each sample has exactly one stance, so softmax is
# used to produce a probability distribution over the mutually exclusive classes;
# independent per-class sigmoids would not sum to one.
cnn.add(Dense(y_train.shape[1]))
cnn.add(Activation('softmax'))

In [None]:
# The targets are one-hot rows with exactly one class per sample, so the matching
# loss is categorical cross-entropy. With it, the 'accuracy' metric measures how
# often the top-scoring class is the true one, instead of a per-output binary
# accuracy that looks inflated on sparse one-hot targets.
cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the full training tensor. No validation data is passed here; the original
# cell carried a disabled `validation_data=(x_test, y_test)` argument.
fit_options = dict(batch_size=batch_size, epochs=epochs, verbose=1)
cnn.fit(x_train, y_train, **fit_options)