In [1]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import DataLoader
import time

In [2]:
ls

Train Transformer 1.ipynb  dataloader.py
[1m[34m__pycache__[m[m/               models.py


# First, prepare the data

In [3]:
from dataloader import *

In [4]:
# load every dataset manually and create torch objects
comment_df = pd.read_csv("../data/attack_annotated_comments.tsv", sep ='\t')
body_df = pd.read_csv("../data/fake_news_bodies.csv")
stance_df = pd.read_csv("../data/fake_news_stances.csv")
vocab = Vocabulary([comment_df["comment"], body_df["articleBody"], stance_df["Headline"]])
annotation_df = pd.read_csv("../data/attack_annotations.tsv",  sep='\t')

wiki_dataset = WikiDataset(comment_df, annotation_df, vocab)
fake_news_dataset = FakeNewsDataset(body_df, stance_df, vocab)

In [None]:
wiki_dataset[0][0]

array([  2,   1, 239, ...,   0,   0,   0])

In [None]:
comment_df.head(2)

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train


In [None]:
body_df.head(2)

Unnamed: 0,Body ID,articleBody,sentence_as_idx
0,0,A small meteorite crashed into a wooded area i...,"[2, 126, 2627, 7088, 36713, 1541, 126, 64723, ..."
1,4,Last week we hinted at what was to come as Ebo...,"[2, 978, 9959, 301, 27551, 181, 184, 243, 21, ..."


In [None]:
stance_df.head(2)

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree


In [None]:
# check label correspondence
stance_df['Stance'].unique()

array(['unrelated', 'agree', 'disagree', 'discuss'], dtype=object)

In [None]:
annotation_df.head(2)

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,1362,0.0,0.0,0.0,0.0,0.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0


In [None]:
# check label bias
fake_news_dataset.y.mean(axis=0)

array([0.73130953, 0.07360122, 0.01680941, 0.17827984])

In [None]:
# check label bias
wiki_dataset.y.mean(axis=0)

array([0.88270731, 0.11729269])

# Once the data is loaded, prepare the model

In [None]:
from models import *

In [None]:
# simple transformer model, cant be used for classification
vocab_size = len(vocab)
embedding_dim = 64
nhead = 1
hidden_dim = 32
num_layers = 1
feedforward_dim = 64
def model_from_dataset(dataset):
    labels = len(dataset[0][1])
    model = TransformerClassifier(vocab_size, labels, embedding_dim, nhead, feedforward_dim, num_layers)
    return model

In [None]:
model = model_from_dataset([(1, np.array([0, 1, 0]))]) # fake dataset with 3 labels
nparr = np.random.randint(0, vocab_size, size=(5, 25))
x = torch.from_numpy(nparr)
print(x)
print(model(x))

tensor([[386972, 350739, 165344,  33602, 326965, 244760, 292190, 136231, 220246,
         412089,  18403, 381491,  22481, 230070,  90619, 397489, 415246, 422328,
         426964, 408006, 226114, 117256, 149884, 349256, 284531],
        [471688, 360183,  16286, 181230, 285180, 356766,  57675, 281774, 411546,
         111558, 429550, 269565, 110322, 236726, 431342, 230469, 449076,   2853,
         263082, 173514, 456615, 132677, 350083, 154398, 281975],
        [100763, 103934,  30005, 304052, 438678, 428912, 417915, 195973,  49225,
         114742, 303194, 139428,  93952, 213455, 247979, 200828, 337121, 279884,
         136175, 424495, 422201, 367188,  39378, 321086, 173925],
        [ 72771, 455781, 239238, 107118,   8934,  98826, 305918, 316325,  14903,
         216543, 228661,  85869, 138506, 310766, 470411, 471079, 364817, 278129,
         254200, 376921,  44928, 136969, 370958, 439094, 335791],
        [ 61388,  54067,  58478, 314517,  21440, 449031,   1118, 133647, 437064,
       

## Once the models are working, we can implement our train method

In [None]:
# method for training given model on given dataset
def train(model, dataset, batch_size=32, epochs=10, lr=0.1):
    # TEMPORARY CODE: make x's equal length
    max_len = 0
    for (x,y) in dataset:
        if len(x) > max_len:
            max_len = len(x)
    print("start")
    t = time.time()
    clean_dataset = []
    for (x,y) in dataset: clean_dataset.append((list(x) + (max_len - len(x))*[0], list(y)))
    print("stop", time.time() - t)
    print(clean_dataset)
    return
    # END OF TEMPORARY CODE
    dataloader = DataLoader(clean_dataset, batch_size=batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    model.train() # Turn on the train mode
    for epoch in range(epochs):
        total_loss = 0.
        start_time = time.time()
        for data in dataloader: # different shuffle each time
            x_batch, y_batch = data
            optimizer.zero_grad()
            output = model(x_batch)
            loss = criterion(output.view(-1, ntokens), y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            total_loss += loss.item()
        elapsed = time.time() - start_time
        print('| epoch {:3d} | {:5d}/{:5d} samples | '
              'lr {:02.2f} | time {:5.2f} | loss {:5.2f}'.format(epoch, i, 
                len(train_data), scheduler.get_lr()[0], elapsed, total_loss))

In [None]:
train(model, wiki_dataset)

start


In [None]:
wiki_dataset[0:2][1]