In [6]:
import os

# This variable is used by helperbot to make the training deterministic
os.environ["SEED"] = "33223"

import logging
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

from helperbot import BaseBot, TriangularLR
from Utils import *

In [7]:
BERT_MODEL = 'bert-large-uncased'
CASED = False

In [8]:
df_train = pd.read_csv("gap-test.tsv", delimiter="\t")
df_val = pd.read_csv("gap-validation.tsv", delimiter="\t")
df_test = pd.read_csv("gap-development.tsv", delimiter="\t")

In [9]:
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL,
    do_lower_case=CASED,
    never_split = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", "[A]", "[B]", "[P]")
)
# These tokens are not actually used, so we can assign arbitrary values.
tokenizer.vocab["[A]"] = -1
tokenizer.vocab["[B]"] = -1
tokenizer.vocab["[P]"] = -1

In [10]:
train_ds = GAPDataset(df_train, tokenizer)
val_ds = GAPDataset(df_val, tokenizer)
test_ds = GAPDataset(df_test, tokenizer)
train_loader = DataLoader(
    train_ds,
    collate_fn = collate_examples,
    batch_size=20,
    num_workers=2,
    pin_memory=True,
    shuffle=True,
    drop_last=True
)
val_loader = DataLoader(
    val_ds,
    collate_fn = collate_examples,
    batch_size=128,
    num_workers=2,
    pin_memory=True,
    shuffle=False
)
test_loader = DataLoader(
    test_ds,
    collate_fn = collate_examples,
    batch_size=128,
    num_workers=2,
    pin_memory=True,
    shuffle=False
)

In [None]:
model = GAPModel(BERT_MODEL, torch.device("cuda:0"))
# You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True)
set_trainable(model.bert, False)
set_trainable(model.head, True)

100%|█████████████████████████████████████████████████████| 1248501532/1248501532 [06:45<00:00, 3080577.69B/s]


In [7]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
bot = GAPBot(
    model, train_loader, val_loader,
    optimizer=optimizer, echo=True,
    avg_window=25
)

[[03/31/2019 04:39:06 AM]] SEED: 33223
[[03/31/2019 04:39:06 AM]] # of paramters: 336,723,971
[[03/31/2019 04:39:06 AM]] # of trainable paramters: 1,582,083


In [8]:
steps_per_epoch = len(train_loader) 
n_steps = steps_per_epoch * 5
bot.train(
    n_steps,
    log_interval=steps_per_epoch // 4,
    snapshot_interval=steps_per_epoch,
    scheduler=TriangularLR(
        optimizer, 20, ratio=2, steps_per_cycle=n_steps)
)

[[03/31/2019 04:39:08 AM]] Optimizer Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 0.001
    lr: 5e-05
    weight_decay: 0
)
[[03/31/2019 04:39:08 AM]] Batches per epoch: 100
[[03/31/2019 04:39:14 AM]] Step 25: train 1.853466 lr: 1.816e-04
[[03/31/2019 04:39:20 AM]] Step 50: train 1.697854 lr: 3.247e-04
[[03/31/2019 04:39:25 AM]] Step 75: train 1.579227 lr: 4.678e-04
[[03/31/2019 04:39:30 AM]] Step 100: train 1.516384 lr: 6.108e-04
100%|██████████| 4/4 [00:05<00:00,  1.30s/it]
[[03/31/2019 04:39:36 AM]] Snapshot loss 0.894100
[[03/31/2019 04:39:48 AM]] Saving checkpoint cache/model_cache/best.pth...
[[03/31/2019 04:39:48 AM]] New low

[[03/31/2019 04:39:54 AM]] Step 125: train 1.439029 lr: 7.539e-04
[[03/31/2019 04:39:59 AM]] Step 150: train 1.362474 lr: 8.970e-04
[[03/31/2019 04:40:04 AM]] Step 175: train 1.353909 lr: 9.801e-04
[[03/31/2019 04:40:09 AM]] Step 200: train 1.297870 lr: 9.090e-04
100%|██████████| 4/4 [00:05<00:00,  1.28

In [9]:
bot.load_model("./cache/model_cache/best.pth")

In [10]:
#torch.save(bot.model.state_dict(), "./cache/model_cache/best.pth")

In [11]:
# Evaluate on the test dataset
bot.eval(test_loader)

100%|██████████| 16/16 [00:24<00:00,  1.21s/it]


0.550768973827362

In [12]:
# Extract predictions to the test dataset
preds = bot.predict(test_loader,return_y = True)

100%|██████████| 16/16 [00:24<00:00,  1.22s/it]


In [13]:
preds

(tensor([[ 0.9115, -0.1810, -1.3243],
         [ 4.3801, -4.1799, -2.0921],
         [-1.9990,  1.1068,  1.9490],
         ...,
         [ 2.2471,  1.4013,  0.7029],
         [ 6.9068, -4.2859, -1.9861],
         [-0.7334,  1.0151,  0.3522]]), tensor([0, 0, 1,  ..., 0, 0, 1]))

In [14]:
_, labels = torch.max(preds[0], 1)

In [15]:
total_count = 0
for i in range(1, 11):
    count = sum(labels[(i-1)*200:i*200] == preds[1][(i-1)*200:i*200])
    print(count.item())
    total_count += count.item()
print(total_count*1.0/2000)

157
155
162
166
155
167
151
165
160
154
0.796


In [19]:
df_inference = pd.read_csv("inference.tsv", delimiter="\t")
inference_ds = GAPDataset(df_inference, tokenizer)
inference_loader = DataLoader(
    inference_ds,
    collate_fn = collate_examples,
    batch_size=128,
    num_workers=2,
    pin_memory=True,
    shuffle=False
)

In [20]:
bot.eval(inference_loader)

100%|██████████| 1/1 [00:00<00:00,  6.92it/s]


0.36622440814971924

In [21]:
preds = bot.predict(inference_loader,return_y = True)

100%|██████████| 1/1 [00:00<00:00,  6.32it/s]


In [22]:
preds

(tensor([[ 0.9115, -0.1810, -1.3243]]), tensor([0]))

In [23]:
_, labels = torch.max(preds[0], 1)

In [24]:
labels, preds[1]

(tensor([0]), tensor([0]))