In [33]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import DataLoader
import time

In [2]:
ls

Train Transformer 1.ipynb  dataloader.py
[1m[34m__pycache__[m[m/               models.py


# First, prepare the data

In [3]:
from dataloader import *

In [4]:
# load every dataset manually and create torch objects
comment_df = pd.read_csv("../data/attack_annotated_comments.tsv", sep ='\t')
body_df = pd.read_csv("../data/fake_news_bodies.csv")
stance_df = pd.read_csv("../data/fake_news_stances.csv")
vocab = Vocabulary([comment_df["comment"], body_df["articleBody"], stance_df["Headline"]])
annotation_df = pd.read_csv("../data/attack_annotations.tsv",  sep='\t')

wiki_dataset = WikiDataset(comment_df, annotation_df, vocab)
fake_news_dataset = FakeNewsDataset(body_df, stance_df, vocab)

In [5]:
comment_df.head(2)

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train


In [6]:
body_df.head(2)

Unnamed: 0,Body ID,articleBody,sentence_as_idx
0,0,A small meteorite crashed into a wooded area i...,"[2, 126, 2627, 7088, 36713, 1541, 126, 64723, ..."
1,4,Last week we hinted at what was to come as Ebo...,"[2, 978, 9959, 301, 27551, 181, 184, 243, 21, ..."


In [7]:
stance_df.head(2)

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree


In [8]:
# check label correspondence
stance_df['Stance'].unique()

array(['unrelated', 'agree', 'disagree', 'discuss'], dtype=object)

In [9]:
annotation_df.head(2)

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,1362,0.0,0.0,0.0,0.0,0.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0


In [10]:
# check label bias
fake_news_dataset.y.mean(axis=0)

array([0.73130953, 0.07360122, 0.01680941, 0.17827984])

In [11]:
# check label bias
wiki_dataset.y.mean(axis=0)

array([0.88270731, 0.11729269])

# Once the data is loaded, prepare the model

In [12]:
from models import *

In [13]:
# simple transformer model, cant be used for classification
vocab_size = len(vocab)
embedding_dim = 64
nhead = 1
hidden_dim = 32
num_layers = 1
feedforward_dim = 64
def model_from_dataset(dataset):
    labels = len(dataset[0][1])
    model = TransformerClassifier(vocab_size, labels, embedding_dim, nhead, feedforward_dim, num_layers)
    return model

In [14]:
model = model_from_dataset([(1, np.array([0, 1, 0]))]) # fake dataset with 3 labels
nparr = np.random.randint(0, vocab_size, size=(5, 25))
x = torch.from_numpy(nparr)
print(x)
print(model(x))

tensor([[332357,  93304, 213199, 260793, 113526, 368237,  89275, 150544,   5370,
         320066, 187463, 462218, 172429,  91137,  53367, 225638, 425222, 397346,
         335718, 323157,  12196,  38956, 451840, 426789, 188464],
        [402335, 111952, 271401, 302040,   8375,  67705,  71914, 450786, 471491,
         211672, 467741, 227187,  80995, 281732, 420791, 358241, 375711,  83140,
         264232, 365294, 109667,  58405, 166355,  81902, 209890],
        [165047, 349506, 133902, 303338,  47538, 125836, 387747, 256494, 227820,
         246814, 302028, 370187, 183471, 307331, 102729, 321903,    737, 174726,
         457255,  61494,  14260, 336013, 448204, 422765, 459836],
        [124355, 165645, 392727, 309833, 200424, 305715, 117722, 414013, 261790,
         108170, 382767, 147655, 266789, 216114, 275488, 176817,  52150, 440249,
         223400, 293773, 440226,  65888, 395575, 117996, 176921],
        [139807,  99777, 233510,  51941, 445060, 123658, 300440, 398914,  84150,
       

## Once the models are working, we can implement our train method

In [45]:
# method for training given model on given dataset
def train(model, dataset, batch_size=32, epochs=10, lr=0.1):
    # TEMPORARY CODE: make x's equal length
    max_len = 0
    for (x,y) in dataset:
        if len(x) > max_len:
            max_len = len(x)
    print("start")
    t = time.time()
    clean_dataset = []
    for (x,y) in dataset: clean_dataset.append((list(x) + (max_len - len(x))*[0], list(y)))
    print("stop", time.time() - t)
    print(clean_dataset)
    return
    # END OF TEMPORARY CODE
    dataloader = DataLoader(clean_dataset, batch_size=batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    model.train() # Turn on the train mode
    for epoch in range(epochs):
        total_loss = 0.
        start_time = time.time()
        for data in dataloader: # different shuffle each time
            x_batch, y_batch = data
            optimizer.zero_grad()
            output = model(x_batch)
            loss = criterion(output.view(-1, ntokens), y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            total_loss += loss.item()
        elapsed = time.time() - start_time
        print('| epoch {:3d} | {:5d}/{:5d} samples | '
              'lr {:02.2f} | time {:5.2f} | loss {:5.2f}'.format(epoch, i, 
                len(train_data), scheduler.get_lr()[0], elapsed, total_loss))

In [46]:
train(model, wiki_dataset)

start


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/johnhallman/mlcourse/mlenv/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3291, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-46-33f1aa94e01e>", line 1, in <module>
    train(model, wiki_dataset)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/johnhallman/mlcourse/mlenv/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2033, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/johnhallman/mlcourse/mlenv/bin/../lib/python3.6/genericpath.py", line 19, in exists
    os.stat(path)
FileNotFoundError: [Errno 2] No such file or directory: '<ipython-input-46-33f1aa94e01e>'

Dur

KeyboardInterrupt: 

In [None]:
wiki_dataset[0:2][1]