In [1]:
import sys
import numpy as np
import torch 
import transformers as ppb
import time
from scipy import spatial

import os
import re
import json

from sklearn.ensemble import GradientBoostingClassifier
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission

from tqdm import tqdm

from utils.system import parse_params, check_version

In [2]:
MAX_LENGTH = 60

In [3]:
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()

In [4]:
def gen_or_load_feats(feat_fn, headlines, bodies, feature_file, tokenizer, model):
    if not os.path.isfile(feature_file):
        feats = feat_fn(headlines, bodies, tokenizer, model)
        np.save(feature_file, feats)
    return np.load(feature_file)

In [5]:
def findMaxSentenceLength(d):
    maxLength = 0
    maxSent = -1
    articleNum = -1
    count = 0
    for articles in d.articles:
        article = d.articles[articles]
        split_list = article.split(".")
        clean_list = [clean(i) for i in split_list]
        for i in clean_list:
            words = i.split(" ")
            if len(words) > maxLength:
                maxLength = len(words) 
                maxSent = words
                articleNum = articles
            if len(i) > 512:
                count+=1
        
    print(maxLength)
    print(maxSent)
    print(articleNum)
    print(count)

In [6]:
def pad(embeddings):     
    max_len = MAX_LENGTH
    for i in embeddings:
        if len(i) > max_len:
            max_len = len(i)
    padded = np.array([i + [0]*(max_len-len(i)) for i in embeddings])
    return padded

In [7]:
def bert_features(headlines, bodies, tokenizer, model):
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        start_time = time.time()
        split_headline = headline.split(".")
        split_body = body.split(".")
        # Splitting Headline and Body into seperate sentences
        headline_sentences = []
        body_sentences     = []
        for i in range(len(split_headline)):
            clean_headline = clean(split_headline[i])
            words = clean_headline.split(" ")
            words = words[0:MAX_LENGTH]
            headline_sentences.append(words)
        for i in range(len(split_body)):
            clean_body = clean(split_body[i])
            words = clean_body.split(" ")
            if len(words) > MAX_LENGTH:
                words = words[0:MAX_LENGTH]
            body_sentences.append(words)
            
        # Creating Embeds for each sentences
        headline_tokens = []
        body_tokens     = [] 
        for i in headline_sentences:
            encoded = tokenizer.encode(i, add_special_tokens = True)
            headline_tokens.append(encoded)
        for i in body_sentences:
            encoded = tokenizer.encode(i, add_special_tokens = True)
            body_tokens.append(encoded)
       
        # Padding Tokens
        padded_headline_embeds = pad(headline_tokens)
        padded_body_embeds     = pad(body_tokens)
        
        # np.array checks:
        # print("Shape of Headline Padded: ", np.array(padded_headline_embeds).shape)
        # print("Shape of Body Padded: ", np.array(padded_body_embeds).shape)

        # Preparing for input into Models
        headline_input = torch.LongTensor(np.array(padded_headline_embeds))
        headline_attention_mask = np.where(padded_headline_embeds != 0, 1, 0)
        headline_attention_mask = torch.tensor(headline_attention_mask)
        
        body_input     = torch.LongTensor(np.array(padded_body_embeds))
        body_attention_mask = np.where(padded_body_embeds != 0, 1, 0)
        body_attention_mask = torch.tensor(body_attention_mask)
        
        # Applying to BERT Model to get Embeddings
        with torch.no_grad():
            last_headline_hidden_states = model(headline_input, attention_mask=headline_attention_mask)
            last_body_hidden_states     = model(body_input, attention_mask=body_attention_mask)
        
        headline_features = last_headline_hidden_states[0][:,0,:].numpy()
        body_features = last_body_hidden_states[0][:,0,:].numpy()

        # Average embeddings for all the sentences
        h_average_features = np.expand_dims(np.average(headline_features, axis=0),axis=0)
        b_average_features = np.expand_dims(np.average(body_features, axis=0), axis=0)
        
        # Cosine similarity and Maximum similarities
        cosine_sim = np.array(spatial.distance.cosine(h_average_features, b_average_features), ndmin=2)
        bert_feature = np.concatenate((h_average_features, b_average_features), axis=1)
        bert_feature = np.concatenate((bert_feature, cosine_sim), axis = 1)
        # Creating Bert Features
        # print("Bert Features Shape: ", bert_feature.shape)
        X.append(bert_feature)     
        print("Data Loaded in {} second".format(time.time() - start_time))
    return X

In [8]:
def generate_features_with_tokenizer(stances,dataset,name, tokenizer, model):
    h, b, y = [],[],[]

    for stance in stances:
        y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])
    X_bert    = gen_or_load_feats(bert_features, h, b, "features/bert_features."+name+".npy", tokenizer, model)
    X = np.c_[X_bert]
    return X,y

In [9]:
d = DataSet()
with open('holdout.json', 'r') as read_file:
    hold_out_stances = json.loads(read_file.read())
    print(len(hold_out_stances))
fold_stances=[]
for i in range(10):
    with open('fold'+str(i)+'.json', 'r') as read_file:
        fold = json.loads(read_file.read())
        fold_stances.append(fold)
        print(len(fold))

Reading dataset
Total stances: 49972
Total bodies: 1683
9622
4663
4039
3644
4273
3944
3388
4124
3783
4644
3848


In [10]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

In [11]:
bertTokenizer = tokenizer_class.from_pretrained(pretrained_weights)
bert_model = model_class.from_pretrained(pretrained_weights)

In [13]:
fold=6
Xs[fold], ys[fold] = generate_features_with_tokenizer(fold_stances[fold],d,str(fold), bertTokenizer, bert_model)
Xs[fold].shape

(4124, 1537)

In [14]:
d = DataSet()
folds,hold_out = kfold_split(d,n_folds=10)
fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

Xs = dict()
ys = dict()

# Load/Precompute all features now
X_holdout,y_holdout = generate_features_with_tokenizer(hold_out_stances,d,"holdout", bertTokenizer, bert_model)
for fold in fold_stances:
    print(fold)
    Xs[fold],ys[fold] = generate_features_with_tokenizer(fold_stances[fold],d,str(fold), bertTokenizer, bert_model)


Reading dataset
Total stances: 49972
Total bodies: 1683
6
0
7
5
2
8
9
3
1
4


In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [40]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.hidden = nn.Linear(1537, 600)
        self.hidden2 = nn.Linear(600, 600)
        self.hidden3 = nn.Linear(600, 600)
        self.output = nn.Linear(600, 4)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.hidden(x)
        x = self.hidden2(F.relu(x))
        x = self.hidden3(F.relu(x))
        x = self.output(F.relu(x))
        return x

In [45]:
print(y_train)

[3 3 3 ... 3 3 3]


In [41]:
net = Net()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=.5)

In [42]:
#test variable batch size
# batch_size = 50
epochs = 100
track_loss = []
for fold in fold_stances:
    ids = list(range(len(folds)))
    del ids[fold]
        
    X_train = np.vstack(tuple([Xs[i] for i in ids]))
    y_train = np.hstack(tuple([ys[i] for i in ids]))
    
    X_test = Xs[fold]
    y_test = ys[fold]

    inputs = torch.FloatTensor(X_train)
    labels = torch.LongTensor(y_train)
    
    for epoch in range(epochs):  # loop over the dataset multiple times
        # get the inputs; data is a list of [inputs, labels]
        running_loss = 0.0

        # print(labels)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # print(out.size())
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        track_loss.append(running_loss)
        print('[Epoch: %d] loss: %.3f' % (epoch + 1, running_loss))
    
    outputs = net(torch.FloatTensor(X_test))
    _, o_predicted = torch.max(outputs, 1)
    o_predicted = o_predicted.tolist()
    predicted = [LABELS[int(a)] for a in o_predicted]
    actual = [LABELS[int(a)] for a in y_test]
    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)

    score = fold_score/max_fold_score

    print("Score for fold "+ str(fold) + " was - " + str(score))
    if score > best_score:
        best_score = score
        best_fold = clf

print('Finished Training')

[Epoch: 1] loss: 1.384
[Epoch: 2] loss: 1.383
[Epoch: 3] loss: 1.381
[Epoch: 4] loss: 1.379
[Epoch: 5] loss: 1.376
[Epoch: 6] loss: 1.374
[Epoch: 7] loss: 1.372
[Epoch: 8] loss: 1.369
[Epoch: 9] loss: 1.367
[Epoch: 10] loss: 1.364
[Epoch: 11] loss: 1.362
[Epoch: 12] loss: 1.359
[Epoch: 13] loss: 1.357
[Epoch: 14] loss: 1.355
[Epoch: 15] loss: 1.352
[Epoch: 16] loss: 1.350
[Epoch: 17] loss: 1.348
[Epoch: 18] loss: 1.345
[Epoch: 19] loss: 1.343
[Epoch: 20] loss: 1.341
[Epoch: 21] loss: 1.338
[Epoch: 22] loss: 1.336
[Epoch: 23] loss: 1.334
[Epoch: 24] loss: 1.331
[Epoch: 25] loss: 1.329
[Epoch: 26] loss: 1.327
[Epoch: 27] loss: 1.325
[Epoch: 28] loss: 1.322
[Epoch: 29] loss: 1.320
[Epoch: 30] loss: 1.318
[Epoch: 31] loss: 1.315
[Epoch: 32] loss: 1.313
[Epoch: 33] loss: 1.311
[Epoch: 34] loss: 1.308
[Epoch: 35] loss: 1.306
[Epoch: 36] loss: 1.304
[Epoch: 37] loss: 1.302
[Epoch: 38] loss: 1.299
[Epoch: 39] loss: 1.297
[Epoch: 40] loss: 1.295
[Epoch: 41] loss: 1.292
[Epoch: 42] loss: 1.290
[

NameError: name 'best_score' is not defined

In [43]:
outputs = net(torch.FloatTensor(X_holdout))
_, o_predicted = torch.max(outputs, 1)
o_predicted = o_predicted.tolist()
predicted = [LABELS[int(a)] for a in o_predicted]
actual = [LABELS[int(a)] for a in y_holdout]
print("Scores on the dev set")
report_score(actual,predicted)

Scores on the dev set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     0     |     0     |     0     |    762    |
-------------------------------------------------------------
| disagree  |     0     |     0     |     0     |    162    |
-------------------------------------------------------------
|  discuss  |     0     |     0     |     0     |   1800    |
-------------------------------------------------------------
| unrelated |     0     |     0     |     0     |   6898    |
-------------------------------------------------------------
Score: 1724.5 out of 4448.5	(38.765876138024055%)


38.765876138024055

In [38]:
_, o_predicted = torch.max(outputs, 1)
print(o_predicted)

tensor([3, 3, 3,  ..., 3, 3, 3])
