In [1]:
import numpy as np
import pandas as pd
from utils.dataset import DataSet
from utils.generate_test_splits import generate_hold_out_split, read_ids

In [2]:
d = DataSet()
generate_hold_out_split(d, training=0.8)
trainID = set(read_ids("training_ids.txt", "splits"))
valID = set(read_ids("hold_out_ids.txt", "splits"))

Reading dataset
Total stances: 49972
Total bodies: 1683


# Hyperparams

In [3]:
MAX_SENT_PER_ART = 5
MAX_SENT_LEN = 20
MAX_VOCAB = 50000
VECTOR_SIZE = 100

import gensim.downloader as api
wv = api.load(f'glove-wiki-gigaword-{VECTOR_SIZE}')

# Set up training and validation data

In [4]:
train_stances = [stance for stance in d.stances if stance['Body ID'] in trainID]
train_headlines = [stance['Headline'] for stance in train_stances]
train_labels = [stance['Stance'] for stance in train_stances]
train_body = [d.articles[stance['Body ID']] for stance in train_stances]

val_stances = [stance for stance in d.stances if stance['Body ID'] in valID]
val_headlines = [stance['Headline'] for stance in val_stances]
val_labels = [stance['Stance'] for stance in val_stances]
val_body = [d.articles[stance['Body ID']] for stance in val_stances]

# Vectorization and Tokenization

In [5]:
import nltk
from nltk import tokenize
from keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.layers import TextVectorization
nltk.download('punkt')

vectorizer = TextVectorization(max_tokens=MAX_VOCAB, output_sequence_length=MAX_SENT_LEN)
vectorizer.adapt(train_body + train_headlines + val_body + val_headlines)

voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

[nltk_data] Downloading package punkt to /home/sw26wong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
sent_tok_art = []
for article in train_body:
    sent_tok_art.append(tokenize.sent_tokenize(article))

vsent_tok_art = []
for article in val_body:
    vsent_tok_art.append(tokenize.sent_tokenize(article))

In [7]:
X_train_body = np.zeros((len(train_stances), MAX_SENT_PER_ART, MAX_SENT_LEN), dtype='int32')

for i, article in enumerate(sent_tok_art):
    for j, sentence in enumerate(article[:MAX_SENT_PER_ART]):
        words = text_to_word_sequence(sentence)
        for k, word in enumerate(words[:MAX_SENT_LEN]):
            X_train_body[i][j][k] = word_index.get(word, 1) # get else UNK

X_train_head = np.zeros((len(train_stances), MAX_SENT_LEN), dtype='int32')

for i, headline in enumerate(train_headlines):
    words = text_to_word_sequence(headline)
    for j, word in enumerate(words[:MAX_SENT_LEN]):
        X_train_head[i][j] = word_index.get(word, 1)

X_val_body = np.zeros((len(val_stances), MAX_SENT_PER_ART, MAX_SENT_LEN), dtype='int32')

for i, article in enumerate(vsent_tok_art):
    for j, sentence in enumerate(article[:MAX_SENT_PER_ART]):
        words = text_to_word_sequence(sentence)
        for k, word in enumerate(words[:MAX_SENT_LEN]):
            X_val_body[i][j][k] = word_index.get(word, 1)

X_val_head = np.zeros((len(val_stances), MAX_SENT_LEN), dtype='int32')

for i, headline in enumerate(val_headlines):
    words = text_to_word_sequence(headline)
    for j, word in enumerate(words[:MAX_SENT_LEN]):
        X_val_head[i][j] = word_index.get(word, 1)

In [8]:
targets = pd.Series(train_labels)
one_hot = pd.get_dummies(targets,sparse = True)
one_hot_labels = np.asarray(one_hot)
y_train = one_hot_labels

In [9]:
targets = pd.Series(val_labels)
one_hot = pd.get_dummies(targets,sparse = True)
one_hot_labels = np.asarray(one_hot)
y_val = one_hot_labels

In [10]:
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']

# Create Embedding Matrix from Glove embeddings

In [11]:
vocab_size = len(word_index)
embedding_matrix = np.zeros((vocab_size+1, VECTOR_SIZE))

for word, i in word_index.items():
    try:
        v = wv[word]
        embedding_matrix[i] = v
    except KeyError:
        pass

# Reduce memory usage

In [12]:
import gc
del wv
gc.collect()

88

# Model

In [13]:
from keras.models import Sequential
from keras.layers import Dense,LSTM, TimeDistributed, Activation
from keras.layers import Flatten, Permute, merge, Input
from keras.layers import Embedding
from keras.models import Model
from keras.layers import Input,Dense,multiply,concatenate,Dropout
from keras.layers import GRU, Bidirectional

from tensorflow.keras.optimizers import Adam


In [14]:
hidden_size = VECTOR_SIZE
trainable = True

sentence_input = Input(shape=(MAX_SENT_LEN,),dtype='int32')
sentence_embedding = Embedding(output_dim=hidden_size, 
                               input_dim=vocab_size+1, 
                               input_length=MAX_SENT_LEN,
                               weights=[embedding_matrix],
                               trainable=trainable,
                               mask_zero=False,)(sentence_input)

sentence_LSTM = Bidirectional(LSTM(hidden_size, return_sequences=True))(sentence_embedding)
sentence_dense = TimeDistributed(Dense(hidden_size))(sentence_LSTM)
sentence_dense = Flatten()(sentence_dense)
sentence_encoder = Model(sentence_input,sentence_dense)

body_input = Input(shape=(MAX_SENT_PER_ART,MAX_SENT_LEN,),dtype = 'int32')
body_encoder = TimeDistributed(sentence_encoder)(body_input)
body_LSTM = Bidirectional(LSTM(hidden_size,return_sequences=True))(body_encoder)
body_dense = TimeDistributed(Dense(hidden_size))(body_LSTM)
body_dense = Flatten()(body_dense)

heading_input = Input(shape=(MAX_SENT_LEN,), dtype = 'int32')
heading_embedding = Embedding(output_dim=hidden_size, input_dim=vocab_size+1, 
                                      input_length=MAX_SENT_LEN, 
                                      weights=[embedding_matrix],
                                      trainable=trainable,
                                      mask_zero=False,)(heading_input)
heading_dense = Dense(hidden_size,activation='relu')(heading_embedding)
heading_flatten = Flatten()(heading_dense)

concatenated_input = concatenate([body_dense,heading_flatten],name='article')
hidden_dense = Dense(hidden_size,activation='relu')(concatenated_input)
prediction = Dense(4 ,activation='softmax')(hidden_dense)
model = Model([body_input,heading_input],[prediction])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

In [15]:
model.fit([X_train_body,X_train_head],[y_train], validation_data=([X_val_body,X_val_head],[y_val]), epochs=10 , batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6df4479a90>

In [16]:
cd = DataSet("competition_test")


test_stances = cd.stances
test_headlines = [stance['Headline'] for stance in test_stances]
test_labels = [stance['Stance'] for stance in test_stances]
test_body = [cd.articles[stance['Body ID']] for stance in test_stances]

X_test_body = np.zeros((len(cd.stances), MAX_SENT_PER_ART, MAX_SENT_LEN), dtype='int32')
sent_tok_test = []
for article in test_body:
    sent_tok_test.append(tokenize.sent_tokenize(article))

for i, article in enumerate(sent_tok_test):
    for j, sentence in enumerate(article[:MAX_SENT_PER_ART]):
        words = text_to_word_sequence(sentence)
        for k, word in enumerate(words[:MAX_SENT_LEN]):
            X_test_body[i][j][k] = word_index.get(word, 1)

X_test_head = np.zeros((len(test_stances), MAX_SENT_LEN), dtype='int32')

for i, headline in enumerate(test_headlines):
    words = text_to_word_sequence(headline)
    for j, word in enumerate(words[:MAX_SENT_LEN]):
        X_test_head[i][j] = word_index.get(word, 1)


predictions = model.predict([X_test_body,X_test_head])

predicted_label = [LABELS[max([0, 1, 2, 3], key=lambda x: p[x])] for p in predictions]

Reading dataset
Total stances: 25413
Total bodies: 904


In [17]:
score = sum([pl == a for pl, a in zip(predicted_label, test_labels)])/len(test_labels)
score

0.7029866603706765

In [18]:
# model.save(f"glove{VECTOR_SIZE}")