In [3]:
from matplotlib import pyplot as plt
import numpy as np
import csv
import pandas as pd
import sklearn
import torch as th
import torch.functional as F
import torch.nn as nn
import re
from sklearn.utils import gen_batches

In [4]:
# Load the two training CSVs. Paths are relative to the notebook's working
# directory. Other cells read the columns 'Body ID' and 'articleBody' from the
# bodies table and 'Headline', 'Body ID' and 'Stance' from the stances table.
train_bodies_data = pd.read_csv('data/train_bodies.csv')
train_stances_data =  pd.read_csv('data/train_stances.csv')

In [5]:
def get_article(stances, bodies, id=None, stance='unrelated'):
    """Return one headline together with the article body it points to.

    Args:
        stances: DataFrame with 'Headline', 'Body ID' and 'Stance' columns.
        bodies: DataFrame with 'Body ID' and 'articleBody' columns.
        id: Body ID to look up. When given, the first stance row with that
            Body ID is used and ``stance`` is ignored.
        stance: Stance label to sample a random row from when ``id`` is None.

    Returns:
        dict with keys 'headline', 'id', 'body' and 'stance'.
    """
    if id is not None:
        candidates = stances[stances['Body ID'] == id]
    else:
        # One random row among those carrying the requested stance label.
        candidates = stances[stances['Stance'] == stance].sample()
    row = candidates.iloc[0]

    body_id = row['Body ID']
    matching_bodies = bodies[bodies['Body ID'] == body_id]
    article = matching_bodies.iloc[0]

    result = {}
    result['headline'] = row['Headline']
    result['id'] = body_id
    result['body'] = article['articleBody']
    result['stance'] = row['Stance']
    return result

In [6]:
def clean_str(string, tolower=True):
    """
    Normalise raw text into space-separated tokens.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Characters outside letters, digits and a small punctuation set are replaced
    by spaces, English clitics and punctuation marks are split off with
    surrounding spaces, and runs of whitespace are collapsed. The result is
    optionally lower-cased and always stripped.
    """
    # Applied strictly in this order; later rules rely on earlier ones.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.lower() if tolower else cleaned
    return cleaned.strip()

In [7]:
def get_max_sent_length(data):
    """Return the length of the longest element of ``data``.

    Raises:
        ValueError: if ``data`` is empty.
    """
    return max([len(sent) for sent in data])

In [8]:
def fill_data(data):
    """Right-pad every token list in ``data`` with '<pad>' to a common length.

    The lists are padded in place and the same ``data`` object is returned.
    An empty ``data`` is returned unchanged instead of raising.

    Args:
        data: list of token lists.

    Returns:
        ``data``, with every inner list extended to the longest list's length.
    """
    # default=0 keeps an empty batch from raising ValueError inside max().
    max_length = max((len(sent) for sent in data), default=0)
    for sent in data:
        # A non-positive difference multiplies to an empty list, i.e. a no-op.
        sent.extend(['<pad>'] * (max_length - len(sent)))
    return data

In [9]:
def join_data(stances_data, bodies_data):
    """Pair every stance row with the text of the body it refers to.

    Args:
        stances_data: DataFrame with 'Headline', 'Body ID' and 'Stance' columns.
        bodies_data: DataFrame with 'Body ID' and 'articleBody' columns.

    Returns:
        Tuple ``(headlines, stances, bodies)`` of equally long lists, in the
        row order of ``stances_data``. Note the order: stances come second.

    Raises:
        IndexError: if a stance row references a Body ID that is absent from
            ``bodies_data``.
    """
    # Build the Body ID -> text lookup once instead of masking the whole
    # bodies frame for every stance row. setdefault keeps the first
    # occurrence of a duplicated ID.
    body_by_id = {}
    for body_id, text in zip(bodies_data['Body ID'], bodies_data['articleBody']):
        body_by_id.setdefault(body_id, text)

    headlines = []
    bodies = []
    stances = []
    rows = zip(stances_data['Headline'], stances_data['Body ID'], stances_data['Stance'])
    for headline, body_id, stance in rows:
        if body_id not in body_by_id:
            # IndexError (not KeyError) on purpose: an unmatched ID has always
            # surfaced as IndexError from this function.
            raise IndexError('no article body with Body ID %r' % (body_id,))
        headlines.append(headline)
        bodies.append(body_by_id[body_id])
        stances.append(stance)

    return headlines, stances, bodies

In [10]:
def clean_data(data):
    """Apply ``clean_str`` (with its defaults) to every string in ``data``.

    Returns:
        A new list of cleaned strings, in the same order as ``data``.
    """
    cleaner = clean_str
    return [cleaner(text) for text in data]

In [11]:
# Integer class index for each stance label.
stance_dict = {
    label: index
    for index, label in enumerate(['agree', 'disagree', 'discuss', 'unrelated'])
}

def transform_stances(stances):
    """Map stance labels to their integer class indices.

    Raises:
        KeyError: if a label is not present in ``stance_dict``.
    """
    encoded = []
    for label in stances:
        encoded.append(stance_dict[label])
    return encoded

In [12]:
# Flatten the two tables into three parallel lists (one entry per stance row).
# join_data returns them in the order (headlines, stances, bodies).
train_headlines, train_stances, train_bodies = join_data(train_stances_data, train_bodies_data)

In [13]:
# Text is normalised with clean_str; stance labels become integer class ids
# via stance_dict. The three lists stay index-aligned.
train_headlines_clean = clean_data(train_headlines)
train_stances_clean = transform_stances(train_stances)
train_bodies_clean = clean_data(train_bodies)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples as a dev set, stratified on the stance label.
# random_state is pinned so that Restart & Run All reproduces the same split
# (and therefore the same vocabulary and metrics) every time.
headlines_train, headlines_dev, stances_train, stances_dev, bodies_train, bodies_dev = train_test_split(
    train_headlines_clean,
    train_stances_clean,
    train_bodies_clean,
    test_size=0.2,
    stratify=train_stances_clean,
    random_state=42,
)

In [15]:
def create_vocabulary(data):
    """Build a token -> index mapping from ``data``.

    Index 0 is reserved for '<pad>'; every new token gets the next free index
    in order of first appearance; '<unk>' is added last unless the data
    already contained it.

    Args:
        data: iterable of sentences. Each sentence is either a string, which
            is split on whitespace, or an already tokenised sequence of
            tokens.

    Returns:
        dict mapping each token to a unique index in ``range(len(dict))``.
    """
    vocab_dict = {'<pad>': 0}

    for sent in data:
        # Iterating a raw string would yield single characters, which never
        # match the whitespace tokens looked up later, so split strings first.
        tokens = sent.split() if isinstance(sent, str) else sent
        for word in tokens:
            if word not in vocab_dict:
                # Indices are dense, so the current size is the next free index.
                vocab_dict[word] = len(vocab_dict)

    # setdefault: re-assigning an existing '<unk>' would leave a gap and push
    # its index out of range(len(vocab_dict)).
    vocab_dict.setdefault('<unk>', len(vocab_dict))

    return vocab_dict

In [16]:
# The training texts are plain strings at this point. Split them into
# whitespace tokens (the same tokenisation data_to_num_tensor uses) so the
# vocabulary is built over words rather than over single characters.
vocab_dict = create_vocabulary([sent.split() for sent in headlines_train + bodies_train])

In [17]:
def data_to_num_tensor(data):
    """Convert a list of strings into a padded tensor of vocabulary indices.

    Each string is split on whitespace, all token lists are right-padded with
    '<pad>' to the longest one in ``data``, and every token is replaced by its
    index in the module-level ``vocab_dict`` (unknown tokens use '<unk>').

    Returns:
        ``th.long`` tensor with one row per input string.
    """
    padded = fill_data([sent.split() for sent in data])
    indexed = [
        [vocab_dict[token] if token in vocab_dict else vocab_dict['<unk>'] for token in tokens]
        for tokens in padded
    ]
    return th.tensor(indexed, dtype=th.long)

In [18]:
# Replace the lists of cleaned strings with padded index tensors.
# Each call pads to the longest entry of its own input, so the four tensors
# generally have different widths.
# NOTE(review): the names are rebound from list-of-str to tensor, so this cell
# (and the vocabulary cell, which expects strings) cannot be re-run without
# re-running the train/dev split first.
headlines_train = data_to_num_tensor(headlines_train)
bodies_train = data_to_num_tensor(bodies_train)
headlines_dev = data_to_num_tensor(headlines_dev)
bodies_dev = data_to_num_tensor(bodies_dev)

In [19]:
# Class-index labels as int64 tensors; they are passed to nn.CrossEntropyLoss
# as targets in the training loop.
stances_train = th.tensor(stances_train, dtype=th.int64)
stances_dev = th.tensor(stances_dev, dtype=th.int64)

In [23]:
class CBOW_classifier(nn.Module):
    """Bag-of-words stance classifier over a headline and a body.

    Both inputs share one embedding table. Each input's token embeddings are
    summed along dimension 1, the two sums are concatenated, and the result is
    fed through ``self.out`` to produce 4 class scores.

    Args:
        vocab_dict: token -> index mapping; must contain '<pad>', whose index
            is used as the embedding's ``padding_idx``.
        embedding_dim: size of each token embedding.
        num_layers: number of hidden Linear+ReLU layers; 0 gives a single
            linear layer on the concatenated sums.
        hidden_dim: width of the hidden layers (unused when num_layers is 0).
        dropout: dropout probability applied to the input of ``self.out`` and,
            when hidden layers exist, again before the final linear layer.
    """

    def __init__(self, vocab_dict, embedding_dim, num_layers=0, hidden_dim=50, dropout=0.5):
        super().__init__()
        output_dim = 4
        input_dim = 2 * embedding_dim  # headline sum and body sum, concatenated
        self.embedding = nn.Embedding(
            len(vocab_dict),
            embedding_dim,
            padding_idx=vocab_dict['<pad>'],
        )
        if num_layers > 0:
            # Layers are created in the order they appear in the stack.
            stack = [
                nn.Dropout(dropout),
                nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU()),
            ]
            for _ in range(num_layers - 1):
                stack.append(nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU()))
            stack.append(nn.Dropout(dropout))
            stack.append(nn.Linear(hidden_dim, output_dim))
            self.out = nn.Sequential(*stack)
        else:
            self.out = nn.Sequential(nn.Dropout(dropout), nn.Linear(input_dim, output_dim))

    def forward(self, headlines, bodies):
        """Return class scores for index tensors ``headlines`` and ``bodies``.

        Both arguments are integer index tensors whose dimension 1 is the
        token axis; the result has one row of 4 scores per example.
        """
        pooled = [th.sum(self.embedding(tokens), dim=1) for tokens in (headlines, bodies)]
        return self.out(th.cat(pooled, 1))

In [27]:
def compute_accuracy(model, headlines, stances, bodies, batch_size=256):
    """Return the fraction of examples whose arg-max prediction equals the label.

    The model is switched to evaluation mode for the duration of the call (so
    dropout does not perturb the predictions) and its previous mode is
    restored afterwards. Gradients are not tracked.

    Args:
        model: ``nn.Module`` called as ``model(headlines, bodies)`` and
            returning one row of class scores per example.
        headlines: index tensor, one row per example.
        stances: tensor of integer class labels, one per example.
        bodies: index tensor, one row per example.
        batch_size: number of examples per forward pass; ``None`` evaluates
            everything in a single pass. Batching only bounds memory use, the
            returned value does not depend on it.

    Returns:
        0-dim float tensor: number of correct predictions / ``stances.numel()``.
    """
    num_examples = headlines.shape[0]
    # max(..., 1) keeps range() valid when there are no examples at all.
    step = max(num_examples, 1) if batch_size is None else batch_size

    was_training = model.training
    model.eval()
    correct = th.zeros((), dtype=th.long, device=stances.device)
    try:
        with th.no_grad():
            for start in range(0, num_examples, step):
                stop = start + step
                # model(...) rather than model.forward(...) so hooks run.
                predictions = model(headlines[start:stop], bodies[start:stop]).argmax(dim=1)
                correct += (predictions == stances[start:stop]).sum()
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    return correct.float() / stances.numel()

In [25]:
# Model and optimisation setup.
# With num_layers=0 the classifier is a single linear layer on top of the
# concatenated embedding sums, and dropout=0 disables dropout.
embedding_dim = 100
model = CBOW_classifier(vocab_dict, embedding_dim, num_layers=0, dropout=0)
lr = 0.008
# CrossEntropyLoss is applied to the model's raw class scores and the integer labels.
loss_function = nn.CrossEntropyLoss()
optimizer = th.optim.Adam(model.parameters(), lr=lr)

In [None]:
num_epochs = 20
batch_size = 20
num_samples = len(headlines_train)

# Contiguous mini-batch slices, computed once: every epoch visits the batches
# in the same order.
slices = list(gen_batches(num_samples, batch_size))

for epoch in range(num_epochs):
    # Make training mode explicit so dropout is active during the updates even
    # if an evaluation step has switched the model to eval mode.
    model.train()
    epoch_loss = 0
    
    for s in slices: 
        headlines_batch = headlines_train[s]
        stances_batch = stances_train[s]
        bodies_batch = bodies_train[s]
        
        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()  
        pred_labels = model(headlines_batch, bodies_batch)
        loss = loss_function(pred_labels, stances_batch)
        loss.backward()
        optimizer.step()
        # Sum of per-batch losses over the epoch (not an average).
        epoch_loss += loss.item()
        
    # Dev and train accuracy are both measured on the full respective sets.
    acc = compute_accuracy(model, headlines_dev, stances_dev, bodies_dev)
    trainacc = compute_accuracy(model, headlines_train, stances_train, bodies_train)
    print('Epoch:', epoch, "Accuracy: %f" % acc, "Train accuracy: %f" % trainacc)
    print('\tLoss:', epoch_loss)