In [3]:
import gensim.downloader as api
# Load the pretrained word vectors by name through gensim's downloader API.
W2V_MODEL_NAME = 'word2vec-google-news-300'
wv = api.load(W2V_MODEL_NAME)

In [2]:
import numpy as np

import pandas as pd
from utils.dataset import DataSet
from utils.generate_test_splits import generate_hold_out_split, read_ids

In [3]:
# Load the stance/body dataset and (re)generate the hold-out split files,
# then read both id lists back from the "splits" directory as sets so that
# membership checks on a stance's 'Body ID' are constant-time.
d = DataSet()
generate_hold_out_split(d)
trainID, valID = (
    set(read_ids(ids_file, "splits"))
    for ids_file in ("training_ids.txt", "hold_out_ids.txt")
)

Reading dataset
Total stances: 49972
Total bodies: 1683


# Hyperparameters

In [4]:
# Sentences kept per article body (second axis of the encoded article tensor).
MAX_SENT_PER_ART = 20
# Word positions kept per sentence (third axis of the encoded article tensor).
MAX_SENT_LEN = 30
# Vocabulary cap handed to the Keras Tokenizer as `num_words`.
MAX_VOCAB = 20_000

# Set up training and validation data

In [5]:
def _collect_split(body_ids):
    """Gather the stances whose 'Body ID' is in `body_ids`.

    Walks `d.stances` once, in order, and returns four parallel lists:
    the matching stance records, their headlines, their stance labels,
    and the article body looked up in `d.articles` for each stance.
    """
    stances, headlines, labels, bodies = [], [], [], []
    for stance in d.stances:
        if stance['Body ID'] not in body_ids:
            continue
        stances.append(stance)
        headlines.append(stance['Headline'])
        labels.append(stance['Stance'])
        bodies.append(d.articles[stance['Body ID']])
    return stances, headlines, labels, bodies


train_stances, train_headlines, train_labels, train_body = _collect_split(trainID)
val_stances, val_headlines, val_labels, val_body = _collect_split(valID)

# Vectorization and Tokenization

In [17]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from nltk import tokenize

# Build the vocabulary from training text only: bodies first, then headlines
# (successive fit_on_texts calls accumulate into the same tokenizer).
tokenizer = Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(train_body)
tokenizer.fit_on_texts(train_headlines)

# Split every article body into a list of sentence strings.
sent_tok_art = [tokenize.sent_tokenize(article) for article in train_body]
vsent_tok_art = [tokenize.sent_tokenize(article) for article in val_body]


def _encode_articles(articles, word_index,
                     max_sents=MAX_SENT_PER_ART,
                     max_len=MAX_SENT_LEN,
                     max_vocab=MAX_VOCAB):
    """Encode sentence-split articles as an int32 tensor of word ids.

    Parameters
    ----------
    articles : list of list of str
        One list of sentence strings per article.
    word_index : dict
        Mapping from word to integer id (e.g. ``tokenizer.word_index``).
    max_sents, max_len : int
        Sentences kept per article / word positions kept per sentence;
        anything beyond is truncated, anything shorter stays zero-padded.
    max_vocab : int
        Ids greater than or equal to this value are written as 0.

    Returns
    -------
    numpy.ndarray of shape (len(articles), max_sents, max_len), dtype int32.
    """
    encoded = np.zeros((len(articles), max_sents, max_len), dtype='int32')
    for i, sentences in enumerate(articles):
        for j, sentence in enumerate(sentences[:max_sents]):
            words = text_to_word_sequence(sentence)
            for k, word in enumerate(words[:max_len]):
                # `word_index` holds the *whole* fitted vocabulary; the
                # `num_words` cap is not applied to it, so enforce the cap
                # here. Unknown words fall back to 0 instead of raising
                # KeyError. Word positions are kept (a 0 is left in place).
                idx = word_index.get(word, 0)
                if idx < max_vocab:
                    encoded[i, j, k] = idx
    return encoded


X_train = _encode_articles(sent_tok_art, tokenizer.word_index)

In [19]:
# Show the encoded (sentence x word) id matrix of the first training article.
X_train[0]

array([[ 1018,     4,  8323,    35,  6624,    22,    53,  1649,   423,
          479,    34,   570,   443,  1057,     3,   291,     4,  4218,
          156,     1,   138,  7268,     1,  2350,  2217,     5,     1,
         9844,     4,     3],
       [    1,  2350,   619,     4,  2591,    12,     5,     3,   190,
            7,  1183,  1198,   825,     6,  3765,    22,     1,   619,
            4,   224,  1517,    30,  6624,    50,   103,  2868,  2118,
           22,   289,  3039],
       [ 2350,    21,  3158,  7268,     5,   320,   212,  1665,     3,
          720,  3738,     7,   569,  8184,  2725,     6, 19864,  4352,
         8149,   181,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0],
       [    1,  3738,  5777,   768,     6,   741, 19865,    11,  4631,
            4,   765,   162,   171,     4,     1,  4675,  7634,   144,
         8323,   188,   828,    19, 12176,   886,    74,  1631,     0,
            0,     0,     0],
       [    1,   873,     8,