In [1]:
import torch
import torch.nn as nn
import pandas as pd

In [2]:
ls

Train Transformer 1.ipynb  dataloader.py
[1m[34m__pycache__[m[m/               models.py


# First, prepare the data

In [3]:
from dataloader import *

In [4]:
# Read each raw table by hand, then wrap the frames in the dataset objects.
comment_df = pd.read_csv("../data/attack_annotated_comments.tsv", sep="\t")
annotation_df = pd.read_csv("../data/attack_annotations.tsv", sep="\t")
body_df = pd.read_csv("../data/fake_news_bodies.csv")
stance_df = pd.read_csv("../data/fake_news_stances.csv")

# One vocabulary is built over the text columns of both corpora and is then
# handed to both dataset constructors below.
vocab = Vocabulary([
    comment_df["comment"],
    body_df["articleBody"],
    stance_df["Headline"],
])

wiki_dataset = WikiDataset(comment_df, annotation_df, vocab)
fake_news_dataset = FakeNewsDataset(body_df, stance_df, vocab)

In [5]:
# Preview the first two rows of the comment table to see its columns.
comment_df.head(2)

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train


In [6]:
# Preview the first two rows of the article-body table.
# NOTE(review): the displayed frame carries a `sentence_as_idx` column;
# presumably it was added in place while building the vocabulary/datasets
# rather than read from the CSV — confirm in dataloader.py.
body_df.head(2)

Unnamed: 0,Body ID,articleBody,sentence_as_idx
0,0,A small meteorite crashed into a wooded area i...,"[2, 126, 2627, 7088, 36713, 1541, 126, 64723, ..."
1,4,Last week we hinted at what was to come as Ebo...,"[2, 978, 9959, 301, 27551, 181, 184, 243, 21, ..."


In [7]:
# Preview the first two rows of the headline/stance table.
stance_df.head(2)

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree


In [8]:
# Check label correspondence: list the distinct stance strings present in the
# data so they can be compared with the label columns of the dataset.
stance_df['Stance'].unique()

array(['unrelated', 'agree', 'disagree', 'discuss'], dtype=object)

In [9]:
# Preview the first two rows of the per-worker attack annotations.
annotation_df.head(2)

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,1362,0.0,0.0,0.0,0.0,0.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0


In [10]:
# Check label bias: column-wise mean of the fake-news label matrix.
# Presumably `y` is one-hot per example, so each entry is the share of
# examples carrying that label — TODO confirm against FakeNewsDataset.
fake_news_dataset.y.mean(axis=0)

array([0.73130953, 0.07360122, 0.01680941, 0.17827984])

In [11]:
# Check label bias: column-wise mean of the wiki-attack label matrix.
# Presumably `y` is one-hot per example, so each entry is the share of
# examples carrying that label — TODO confirm against WikiDataset.
wiki_dataset.y.mean(axis=0)

array([0.88270731, 0.11729269])

# Once the data is loaded, prepare the model

In [None]:
from models import *

In [23]:
# simple transformer model, cant be used for classification
def model_from_dataset(dataset, embedding_dim=64, nhead=1, feedforward_dim=64, num_layers=1):
    """Build a small TransformerClassifier whose output size matches `dataset`.

    The vocabulary size is taken from the notebook-level `vocab` object, and
    the number of labels from the length of the label entry of the first
    example (`dataset[0][1]`), so `dataset` must be non-empty and indexable.

    Args:
        dataset: Indexable collection whose items are `(input, label_vector)`
            pairs; only `len(dataset[0][1])` is read.
        embedding_dim: Forwarded as the third positional argument of
            `TransformerClassifier` (default 64).
        nhead: Forwarded as the fourth positional argument (default 1).
        feedforward_dim: Forwarded as the fifth positional argument (default 64).
        num_layers: Forwarded as the sixth positional argument (default 1).

    Returns:
        The object returned by
        `TransformerClassifier(vocab_size, labels, embedding_dim, nhead,
        feedforward_dim, num_layers)`.

    NOTE(review): the meaning of each hyperparameter is inferred from its
    name; confirm against the TransformerClassifier signature in models.py.
    """
    vocab_size = len(vocab)
    # The label vector of the first example fixes the number of output classes.
    labels = len(dataset[0][1])
    return TransformerClassifier(
        vocab_size, labels, embedding_dim, nhead, feedforward_dim, num_layers
    )

In [24]:
# Smoke test: push a batch of random token ids through a freshly built model.
model = model_from_dataset([(1, np.array([0, 1, 0]))]) # fake dataset with 3 labels
# `vocab_size` is only a local variable inside model_from_dataset, so it has to
# be derived here; otherwise this cell fails with NameError on a fresh kernel.
vocab_size = len(vocab)
# 5 sequences of 25 ids each, drawn from the half-open range [0, vocab_size).
nparr = np.random.randint(0, vocab_size, size=(5, 25))
x = torch.from_numpy(nparr)
print(x)
print(model(x))

tensor([[189338, 349340, 130379,  70347,  26030, 144609, 409111,  79360, 396789,
         223325, 105338, 149599,  35169, 169713, 262245, 307636,  19871, 151277,
         335384, 220001, 369510, 233462, 396713,  97482, 268379],
        [380314,  38802,   6311, 444920, 162214,  11475, 395371, 346487, 453857,
         306023, 319806, 143118, 333188, 472798, 432039, 371149, 335180, 264578,
         254151, 148867,   4780, 164126,  15791, 440552, 325971],
        [467580, 240134,  49322, 316158, 398978, 323567,  75178, 344533, 456183,
         206595,  56484, 339986, 243255, 412491,   7284,  92012, 245453, 274358,
          56209, 428782, 431512, 363485,  74657, 443381, 123941],
        [ 58800, 289419, 299973, 229196, 211726, 274413,    858, 187201,  72117,
         126222, 299324, 230165, 400354,  20908, 446991, 250222, 337316, 411058,
         223158, 425012, 370616, 445619, 397941,  59422, 277661],
        [ 85436,  95419, 464094, 373483, 259173, 328874, 309528, 337219,  59183,
       

## Once the models are working, we can implement our train method

In [16]:
def train_model(encoder, dataset, batch_size=32):
    print("training encoder")
    n = len(dataset)
    batch_order = np.arange(n)
    np.random.shuffle(batch_order)
    for i in range(0, n, batch_size):
        curr_batch = dataset[batch_order[i:min(i+batch_size, n)]]