In [3]:
from keras.models import load_model
from sklearn.metrics import classification_report
from collections import Counter
from utils.dataset import DataSet
import numpy as np
# Number of sentences kept from each article body when it is encoded.
MAX_SENT_PER_ART = 5
# Number of word ids kept per sentence and per headline; also passed to
# TextVectorization as output_sequence_length.
MAX_SENT_LEN = 20
# Vocabulary size cap passed to TextVectorization as max_tokens.
MAX_VOCAB = 50000
# NOTE(review): not referenced in the visible cells; presumably the embedding
# width of the GloVe model -- confirm.
VECTOR_SIZE = 100
# Stance names; a predicted class index i is reported as LABELS[i].
# NOTE(review): the order presumably has to match the label encoding used when
# the models were trained -- confirm.
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']

In [4]:
import nltk
from nltk import tokenize
from keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.layers import TextVectorization
from utils.generate_test_splits import generate_hold_out_split, read_ids
# Fetch the punkt resource for nltk's sentence tokenizer.
nltk.download('punkt')


def _stance_fields(stances, articles):
    """Return (headlines, labels, bodies) lists aligned with `stances`.

    Each stance is a mapping with 'Headline', 'Stance' and 'Body ID' keys;
    the body text is looked up in `articles` by 'Body ID'.
    """
    headlines = [row['Headline'] for row in stances]
    labels = [row['Stance'] for row in stances]
    bodies = [articles[row['Body ID']] for row in stances]
    return headlines, labels, bodies


d = DataSet()
# NOTE(review): presumably writes the id files under "splits" that read_ids
# loads right below -- confirm against utils.generate_test_splits.
generate_hold_out_split(d, training=0.9)
trainID = set(read_ids("training_ids.txt", "splits"))
valID = set(read_ids("hold_out_ids.txt", "splits"))

# Stances are assigned to a split according to the body they refer to.
train_stances = [row for row in d.stances if row['Body ID'] in trainID]
train_headlines, train_labels, train_body = _stance_fields(train_stances, d.articles)

val_stances = [row for row in d.stances if row['Body ID'] in valID]
val_headlines, val_labels, val_body = _stance_fields(val_stances, d.articles)

# The vocabulary is fitted on every training and validation body and headline.
vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    output_sequence_length=MAX_SENT_LEN,
)
vectorizer.adapt(train_body + train_headlines + val_body + val_headlines)

# Map each vocabulary token to its position in the vectorizer's vocabulary.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

[nltk_data] Downloading package punkt to /home/sw26wong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Reading dataset
Total stances: 49972
Total bodies: 1683


In [5]:
# Load the competition test set and line up headline, label and body per stance.
cd = DataSet("competition_test")

test_stances = cd.stances
test_headlines = [stance['Headline'] for stance in test_stances]
test_labels = [stance['Stance'] for stance in test_stances]
test_body = [cd.articles[stance['Body ID']] for stance in test_stances]

# Index written for words that are missing from word_index.
# NOTE(review): presumably the unknown-token slot of the TextVectorization
# vocabulary -- confirm it matches the encoding used at training time.
OOV_INDEX = 1


def encode_words(text, max_len=MAX_SENT_LEN):
    """Encode `text` as a fixed-length vector of word ids.

    Args:
        text: a sentence or headline string.
        max_len: number of positions in the result; extra words are dropped.

    Returns:
        An int32 array of shape (max_len,), zero-padded on the right. Words
        not found in word_index are written as OOV_INDEX.
    """
    row = np.zeros(max_len, dtype='int32')
    for position, word in enumerate(text_to_word_sequence(text)[:max_len]):
        row[position] = word_index.get(word, OOV_INDEX)
    return row


# Many stances point at the same body (25413 stances vs 904 bodies in the
# output below), so split each distinct body into sentences only once.
sentences_by_body = {}
for stance in test_stances:
    body_id = stance['Body ID']
    if body_id not in sentences_by_body:
        sentences_by_body[body_id] = tokenize.sent_tokenize(cd.articles[body_id])

# Kept for compatibility: one sentence list per stance, in stance order.
sent_tok_test = [sentences_by_body[stance['Body ID']] for stance in test_stances]

# Encode each distinct body once: the first MAX_SENT_PER_ART sentences, each
# truncated / zero-padded to MAX_SENT_LEN word ids.
encoded_by_body = {}
for body_id, sentences in sentences_by_body.items():
    encoded = np.zeros((MAX_SENT_PER_ART, MAX_SENT_LEN), dtype='int32')
    for row, sentence in enumerate(sentences[:MAX_SENT_PER_ART]):
        encoded[row] = encode_words(sentence)
    encoded_by_body[body_id] = encoded

X_test_body = np.zeros((len(test_stances), MAX_SENT_PER_ART, MAX_SENT_LEN), dtype='int32')
for i, stance in enumerate(test_stances):
    X_test_body[i] = encoded_by_body[stance['Body ID']]

X_test_head = np.zeros((len(test_stances), MAX_SENT_LEN), dtype='int32')
for i, headline in enumerate(test_headlines):
    X_test_head[i] = encode_words(headline)

Reading dataset
Total stances: 25413
Total bodies: 904


# GloVe 100 (trainable embeddings)

In [6]:
# Restore the saved model from the "glove100-embeddings-trainable" path.
GLOVE100_MODEL_PATH = "glove100-embeddings-trainable"
model = load_model(GLOVE100_MODEL_PATH)

In [7]:
# Score every test (body, headline) pair with the model.
predictions = model.predict([X_test_body, X_test_head])

# Take the highest-scoring class of each row and report it by name.
# np.argmax works for however many columns the model emits, so there is no
# hand-written index list to keep in sync with LABELS; ties resolve to the
# lowest index, as before.
# assumes predictions is a 2-D (n_samples, n_classes) array -- TODO confirm
predicted_label = [LABELS[class_id] for class_id in np.argmax(predictions, axis=1)]

In [8]:
# Accuracy: share of test stances whose predicted label equals the true label.
score = sum(
    1 for guess, truth in zip(predicted_label, test_labels) if guess == truth
) / len(test_labels)
score

0.7398969031598001

In [9]:
# Percentage of predictions per label, most frequent first.
[
    (label, count * 100 / len(predicted_label))
    for label, count in Counter(predicted_label).most_common()
]

[('unrelated', 79.44359186243261),
 ('discuss', 14.453232597489475),
 ('agree', 5.973320741352851),
 ('disagree', 0.12985479872506198)]

In [10]:
# Percentage of each label across the training and validation stances combined,
# most frequent first.
[
    (label, count * 100 / (len(train_labels) + len(val_labels)))
    for label, count in Counter(train_labels + val_labels).most_common()
]

[('unrelated', 73.13095333386697),
 ('discuss', 17.82798367085568),
 ('agree', 7.3601216681341555),
 ('disagree', 1.6809413271432)]

In [11]:
# Percentage of each true label in the test set, most frequent first.
[
    (label, count * 100 / len(test_labels))
    for label, count in Counter(test_labels).most_common()
]

[('unrelated', 72.20320308503521),
 ('discuss', 17.565812772990203),
 ('agree', 7.4882933931452405),
 ('disagree', 2.7426907488293395)]

In [12]:
# Per-label precision / recall / F1 of the predictions against the true labels.
print(
    classification_report(
        y_true=test_labels,
        y_pred=predicted_label,
    )
)

              precision    recall  f1-score   support

       agree       0.29      0.23      0.26      1903
    disagree       0.24      0.01      0.02       697
     discuss       0.53      0.44      0.48      4464
   unrelated       0.81      0.89      0.85     18349

    accuracy                           0.74     25413
   macro avg       0.47      0.39      0.40     25413
weighted avg       0.71      0.74      0.72     25413



# Google 300 (non-trainable embeddings)

In [23]:
# Restore the saved model from the "google300-embeddings-nontrainable" path.
GOOGLE300_MODEL_PATH = "google300-embeddings-nontrainable"
gmodel = load_model(GOOGLE300_MODEL_PATH)

In [None]:
# Score every test (body, headline) pair with gmodel.
# Note: `predictions`, `predicted_label` and `score` are reassigned here, so the
# values computed for the earlier model are no longer available afterwards.
predictions = gmodel.predict([X_test_body, X_test_head])

# Take the highest-scoring class of each row and report it by name.
# np.argmax works for however many columns the model emits, so there is no
# hand-written index list to keep in sync with LABELS; ties resolve to the
# lowest index, as before.
# assumes predictions is a 2-D (n_samples, n_classes) array -- TODO confirm
predicted_label = [LABELS[class_id] for class_id in np.argmax(predictions, axis=1)]

# Accuracy: share of test stances whose predicted label equals the true label.
score = sum([pl == a for pl, a in zip(predicted_label, test_labels)])/len(test_labels)
score

In [34]:
# Display the accuracy currently held in `score`.
# NOTE(review): the execution counts in this section are out of order, so
# confirm on a fresh Restart & Run All that this value belongs to gmodel.
score

0.7313186164561445

In [31]:
# Percentage of predictions per label, most frequent first.
[
    (label, count * 100 / len(predicted_label))
    for label, count in Counter(predicted_label).most_common()
]

[('unrelated', 82.71357179396372),
 ('discuss', 11.749891787667728),
 ('agree', 4.568527918781726),
 ('disagree', 0.9680084995868257)]

In [32]:
# Percentage of each label across the training and validation stances combined,
# most frequent first.
[
    (label, count * 100 / (len(train_labels) + len(val_labels)))
    for label, count in Counter(train_labels + val_labels).most_common()
]

[('unrelated', 73.13095333386697),
 ('discuss', 17.82798367085568),
 ('agree', 7.3601216681341555),
 ('disagree', 1.6809413271432)]

In [33]:
# This report covers the predictions of the Google 300 model (gmodel), so it is
# titled accordingly. The title is printed on its own line so that the report's
# column header stays aligned with the rows beneath it.
print('Google 300')
print(classification_report(test_labels, predicted_label))

Glove 100               precision    recall  f1-score   support

       agree       0.27      0.16      0.20      1903
    disagree       0.12      0.04      0.06       697
     discuss       0.53      0.36      0.43      4464
   unrelated       0.79      0.91      0.85     18349

    accuracy                           0.73     25413
   macro avg       0.43      0.37      0.39     25413
weighted avg       0.69      0.73      0.70     25413

