In [None]:
import csv
import pickle as pkl
import random

import pandas as pd
import torch as th
import torch.functional as F
import torch.nn as nn
from sklearn.utils import gen_batches

import preprocess
import utils

# Loading data

In [None]:
# Toggle: True -> reuse cached preprocessed files, False -> rebuild from the raw CSVs.
load_preprocessed_data = True

if not load_preprocessed_data:
    # Rebuild everything from the raw training CSVs.
    train_stances_path = 'data/train_stances.csv'
    train_bodies_path = 'data/train_bodies.csv'

    stances_data = pd.read_csv(train_stances_path)
    bodies_data = pd.read_csv(train_bodies_path)

    data = preprocess.extract_data(stances_data, bodies_data)
    # In this path the vocabulary travels inside the extracted data.
    vocab_dict = data['dict']
else:
    # Reuse the cached splits and vocabulary.
    path_to_data = 'data.pkl'
    path_to_vocab_dict = 'vocab_dict.pkl'

    data = utils.load_preprocessed_data(path_to_data)
    vocab_dict = utils.load_vocab_dict(path_to_vocab_dict)

# Both paths expose the splits as (headlines, stances, bodies) triples.
headlines_train, stances_train, bodies_train = data['train']
headlines_dev, stances_dev, bodies_dev = data['dev']

# Training the model

In [None]:
# Set to True to run on the first CUDA device instead of the CPU.
is_cuda = False

device = th.device('cuda:0' if is_cuda else 'cpu')

In [None]:
class CBOW_classifier(nn.Module):
    """Continuous-bag-of-words classifier over a (headline, body) pair.

    Each input is embedded with a shared embedding table and summed over the
    token dimension. The two summed vectors are concatenated and passed through
    an optional MLP followed by a linear output layer.

    Args:
        vocab_dict: Mapping from token to index. Must contain the key
            ``'<pad>'``; its index is used as the embedding ``padding_idx``.
        embedding_dim: Size of each token embedding.
        num_layers: Number of hidden (Linear + ReLU) layers. ``0`` (or less)
            gives a single linear layer on top of the concatenated embeddings.
        hidden_dim: Width of every hidden layer (ignored when there are none).
        dropout: Dropout probability applied to the concatenated embeddings
            and, when hidden layers exist, again before the output layer.
        output_dim: Number of output classes. Defaults to 4, the value that
            was previously hard-coded.
    """

    def __init__(self, vocab_dict, embedding_dim, num_layers=0, hidden_dim=50, dropout=0.5, output_dim=4):
        super().__init__()
        self.embedding = nn.Embedding(len(vocab_dict), embedding_dim, padding_idx=vocab_dict['<pad>'])
        # NOTE: the nesting and ordering of the sub-modules below determine the
        # state-dict keys, so they are kept exactly as before to stay compatible
        # with previously saved weights.
        if num_layers > 0:
            first_layer = nn.Sequential(nn.Linear(2 * embedding_dim, hidden_dim), nn.ReLU())
            hidden_layers = [
                nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())
                for _ in range(num_layers - 1)
            ]
            self.out = nn.Sequential(
                nn.Dropout(dropout),
                first_layer,
                *hidden_layers,
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, output_dim),
            )
        else:
            self.out = nn.Sequential(nn.Dropout(dropout), nn.Linear(2 * embedding_dim, output_dim))

    def forward(self, headlines, bodies):
        """Return unnormalised class scores for a batch.

        Args:
            headlines: Integer index tensor of shape (batch, headline_len).
            bodies: Integer index tensor of shape (batch, body_len).

        Returns:
            Tensor of shape (batch, output_dim) with raw scores (logits).
        """
        # Sum token embeddings over the sequence dimension (bag of words).
        headlines_embedded = th.sum(self.embedding(headlines), dim=1)
        bodies_embedded = th.sum(self.embedding(bodies), dim=1)
        embeds = th.cat((headlines_embedded, bodies_embedded), dim=1)
        return self.out(embeds)

In [None]:
# Hyper-parameters
embedding_dim = 1000
lr = 0.001

# One hidden layer, light dropout; the model is moved to the selected device.
model = CBOW_classifier(vocab_dict, embedding_dim, num_layers=1, dropout=0.1)
model = model.to(device)

# Multi-class objective and optimiser over all model parameters.
loss_function = nn.CrossEntropyLoss()
optimizer = th.optim.Adam(model.parameters(), lr=lr)

In [None]:
num_epochs = 1
batch_size = 10
num_samples = len(headlines_train)

# Pre-compute the batch slices once; they are reused on every epoch.
slices = list(gen_batches(num_samples, batch_size))
dev_slices = list(gen_batches(len(headlines_dev), batch_size))

for epoch in range(num_epochs):
    # ---- training pass ----
    epoch_loss = 0
    model.train()
    for s in slices:
        headlines_batch = headlines_train[s].to(device)
        stances_batch = stances_train[s].to(device)
        bodies_batch = bodies_train[s].to(device)

        optimizer.zero_grad()
        pred_labels = model(headlines_batch, bodies_batch)
        loss = loss_function(pred_labels, stances_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # ---- evaluation pass ----
    # Switch to eval mode so dropout is disabled, and skip autograd bookkeeping.
    # model.train() at the top of the loop restores training mode next epoch;
    # after the final epoch the model is left in eval mode.
    model.eval()
    with th.no_grad():
        accs = []
        for s in dev_slices:
            acc = utils.compute_accuracy(
                model,
                headlines_dev[s].to(device),
                stances_dev[s].to(device),
                bodies_dev[s].to(device)
            )
            accs.append(acc)
        # Unweighted mean over batches (a shorter final batch counts as much as a full one).
        acc = sum(accs) / len(accs)

        # Cheap training-accuracy estimate from a single randomly chosen batch.
        s = random.choice(slices)
        trainacc = utils.compute_accuracy(
            model,
            headlines_train[s].to(device),
            stances_train[s].to(device),
            bodies_train[s].to(device)
        )

    print('Epoch:', epoch, "Accuracy: %f" % acc, "Train accuracy: %f" % trainacc)
    print('\tLoss:', epoch_loss / len(slices))

# Saving data

In [None]:
# Flip to True to persist the vocabulary, model weights and preprocessed splits.
save_data = False

if save_data:
    vocab_dict_path, model_weights_path, data_path = (
        'vocab_dict_new.pkl',
        'cbow_0xxxx.pth',
        'data_new.pkl',
    )

    utils.save_vocab_dict(vocab_dict_path, vocab_dict)
    utils.save_model_weights(model_weights_path, model)
    # Splits are stored as (headlines, stances, bodies) triples keyed by split name.
    utils.save_preprocessed_data(
        data_path,
        {
            'train': (headlines_train, stances_train, bodies_train),
            'dev': (headlines_dev, stances_dev, bodies_dev),
        },
    )

# Loading model

The following code loads the model. There is no need to run this part if the model was trained in this notebook.

In [None]:
# Flip to True to restore a previously trained model instead of training one above.
load_model = False

if load_model:
    path_to_vocab_dict = 'vocab_dict.pkl'
    path_to_model_weights = 'cbow_09502.pth'

    # Load the vocabulary BEFORE building the model: the embedding table is
    # sized from len(vocab_dict), so it must be the vocabulary that belongs to
    # the saved weights rather than whatever an earlier cell left in scope.
    vocab_dict = utils.load_vocab_dict(path_to_vocab_dict)

    # NOTE(review): these hyper-parameters presumably have to match the ones
    # used when the checkpoint was written — confirm against the training run.
    embedding_dim = 1000
    model = CBOW_classifier(vocab_dict, embedding_dim, num_layers=1, dropout=0.1)
    model = utils.load_model_weights(model, path_to_model_weights)

# Evaluating the model

The following code loads the test data, runs it through the model, and saves the predictions to a CSV file.

In [None]:
# Competition test set: article bodies and the unlabeled headline/body pairs.
bodies_data, stances_data = (
    pd.read_csv('data/competition_test_bodies.csv'),
    pd.read_csv('data/competition_test_stances_unlabeled.csv'),
)

In [None]:
# Convert the raw test frames into model inputs using the vocabulary in scope.
# Only the first and third returned values are kept; the middle one is discarded
# (presumably the stance labels, which the unlabeled test set lacks — TODO confirm).
headlines, _, bodies = preprocess.transform_data(stances_data, bodies_data, vocab_dict)

In [None]:
# Inference must run in eval mode, otherwise the dropout layers stay active and
# predictions become random and non-reproducible.
model.eval()
# Feed every batch on the device the model's parameters live on, so this cell
# works whether the model was trained above (possibly on CUDA) or loaded on CPU.
model_device = next(model.parameters()).device

slices = list(gen_batches(len(headlines), 200))
predictions = []
with th.no_grad():
    for s in slices:
        scores = model(headlines[s].to(model_device), bodies[s].to(model_device))
        # Highest-scoring class index per sample, collected as plain Python ints.
        predictions += scores.argmax(dim=1).tolist()

In [None]:
# Map the predicted class indices back to stance labels for the output file
# (presumably the inverse of the label encoding used in preprocessing — TODO confirm).
predictions_words = preprocess.transform_back_stances(predictions)

In [None]:
# One (headline, body id, predicted stance) row per test pair, in input order.
prediction_data = [
    (headline, body_id, stance)
    for headline, body_id, stance in zip(
        stances_data['Headline'].values.tolist(),
        stances_data['Body ID'].values.tolist(),
        predictions_words,
    )
]

In [None]:
# newline='' lets the csv module control line endings itself (without it, blank
# rows appear between records on Windows). An explicit UTF-8 encoding keeps
# non-ASCII headlines from failing on platforms with a narrower default codec.
with open('predictions.csv', 'w', newline='', encoding='utf-8') as pred_file:
    writer = csv.writer(pred_file)
    writer.writerow(['Headline', 'Body ID', 'Stance'])
    writer.writerows(prediction_data)