This is not my original work. It was forked from the PyTorch BERT baseline kernel, which scores 0.54.

I have experimented with the number of epochs, the weight decay, and adding another fully connected layer to the head.

In [1]:
# Download the three GAP coreference TSV files (development, test, validation); -q silences wget's output.
!wget https://github.com/google-research-datasets/gap-coreference/raw/master/gap-development.tsv -q
!wget https://github.com/google-research-datasets/gap-coreference/raw/master/gap-test.tsv -q
!wget https://github.com/google-research-datasets/gap-coreference/raw/master/gap-validation.tsv -q

"pytorch_helper_bot" is a thin abstraction of some common PyTorch training routines. It can easily be replaced, so you can mostly ignore it and focus on the preprocessing and model definition instead.

In [2]:
# Pin pytorch-pretrained-bert to the release this notebook was run with: the model code
# below relies on that release's BertModel API (e.g. the output_all_encoded_layers argument).
!pip install pytorch-pretrained-bert==0.6.1
# helperbot is already pinned through its 0.0.4 release archive.
!pip install https://github.com/ceshine/pytorch_helper_bot/archive/0.0.4.zip

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/5d/3c/d5fa084dd3a82ffc645aba78c417e6072ff48552e3301b1fa3bd711e03d4/pytorch_pretrained_bert-0.6.1-py3-none-any.whl (114kB)
[K    100% |████████████████████████████████| 122kB 3.9MB/s ta 0:00:01
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.1
Collecting https://github.com/ceshine/pytorch_helper_bot/archive/0.0.4.zip
  Downloading https://github.com/ceshine/pytorch_helper_bot/archive/0.0.4.zip
[K     | 112kB 7.5MB/s
Building wheels for collected packages: PyTorchHelperBot
  Building wheel for PyTorchHelperBot (setup.py) ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-xbh8e6yi/wheels/1f/01/01/da39a14e8e30666f3eec7106664e59059789c330a11b5fa357
Successfully built PyTorchHelperBot
Installing collected packages: PyTorchHelperBot
Successfully installed PyTorchHelperBot-0.0.4


In [3]:
import os

# This variable is used by helperbot to make the training deterministic
os.environ["SEED"] = "420"

import logging
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

from helperbot import BaseBot, TriangularLR, WeightDecayOptimizerWrapper


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [4]:
# Name of the pretrained BERT checkpoint to load. GAPModel only accepts the
# base/large, cased/uncased variants and raises ValueError for anything else.
BERT_MODEL = 'bert-large-uncased'
# Case-sensitivity flag for the checkpoint above; it is consumed when the tokenizer is built.
CASED = False

In [5]:
def insert_tag(row):
    """Return ``row["Text"]`` with " [A] ", " [B] " and " [P] " markers spliced in.

    The markers are placed at the character offsets stored in the row
    ("A-offset", "B-offset", "Pronoun-offset") so that the positions of A, B
    and the pronoun can be recovered after tokenization.
    """
    markers = [
        (row["A-offset"], " [A] "),
        (row["B-offset"], " [B] "),
        (row["Pronoun-offset"], " [P] "),
    ]
    # Work from the rightmost offset to the leftmost so that an insertion
    # never shifts an offset that is still waiting to be processed.
    markers.sort(key=lambda pair: pair[0], reverse=True)
    tagged = row["Text"]
    for position, marker in markers:
        tagged = "".join((tagged[:position], marker, tagged[position:]))
    return tagged

def tokenize(text, tokenizer):
    """Tokenize ``text`` and strip the [A]/[B]/[P] markers.

    Returns a pair ``(tokens, (a_pos, b_pos, p_pos))`` where each position is
    the index, in the marker-free token list, of the token that followed the
    corresponding marker. A missing marker raises ``KeyError``.
    """
    markers = ("[A]", "[B]", "[P]")
    positions = {}
    kept = []
    for piece in tokenizer.tokenize(text):
        if piece not in markers:
            kept.append(piece)
        else:
            # The marker itself is dropped; remember where the next real token lands.
            positions[piece] = len(kept)
    return kept, (positions["[A]"], positions["[B]"], positions["[P]"])

class GAPDataset(Dataset):
    """GAP examples pre-tokenized into BERT ids plus A/B/pronoun token positions.

    Each item is a triple ``(token_ids, offsets, label)``. ``label`` is a
    boolean vector over (A-coref, B-coref, Neither) when the dataset is
    labeled and ``None`` otherwise.
    """

    def __init__(self, df, tokenizer, labeled=True):
        self.labeled = labeled
        if labeled:
            targets = df[["A-coref", "B-coref"]].copy()
            # "Neither" is true exactly when both coreference flags are false.
            targets["Neither"] = ~(df["A-coref"] | df["B-coref"])
            self.y = targets.values.astype("bool")
        # Token ids and (A, B, pronoun) positions, one entry per dataframe row.
        self.offsets, self.tokens = [], []
        for _, row in df.iterrows():
            pieces, positions = tokenize(insert_tag(row), tokenizer)
            self.offsets.append(positions)
            wrapped = ["[CLS]", *pieces, "[SEP]"]
            self.tokens.append(tokenizer.convert_tokens_to_ids(wrapped))

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        label = self.y[idx] if self.labeled else None
        return self.tokens[idx], self.offsets[idx], label
    
def collate_examples(batch, truncate_len=500):
    """Collate dataset samples into padded batch tensors.

    1. Pad the sequences
    2. Transform the target.

    Args:
        batch: list of samples shaped like ``GAPDataset.__getitem__`` output:
            a list of token ids, a 3-tuple of (A, B, pronoun) token positions
            and a one-hot label vector or ``None``. Samples without a label
            slot at all (2-tuples) are accepted as well.
        truncate_len: maximum number of token ids kept per sequence.

    Returns:
        ``(token_tensor, offsets, labels)``. ``token_tensor`` is an int64
        tensor zero-padded to the longest (truncated) sequence, ``offsets``
        holds the positions shifted by one for the leading [CLS] token, and
        ``labels`` is the class index per sample, or ``None`` for unlabeled
        batches.
    """
    transposed = list(zip(*batch))
    max_len = min(
        max((len(x) for x in transposed[0])),
        truncate_len
    )
    tokens = np.zeros((len(batch), max_len), dtype=np.int64)
    for i, row in enumerate(transposed[0]):
        row = np.array(row[:truncate_len])
        tokens[i, :len(row)] = row
    token_tensor = torch.from_numpy(tokens)
    # Offsets
    # NOTE: offsets are not clamped or checked against truncate_len, so a
    # sequence that gets truncated can still carry positions past its end.
    offsets = torch.stack([
        torch.LongTensor(x) for x in transposed[1]
    ], dim=0) + 1 # Account for the [CLS] token
    # Labels
    # Unlabeled samples arrive as (tokens, offsets, None), so checking the
    # tuple length alone is not enough: the label slot exists but is None.
    if len(transposed) == 2 or any(x is None for x in transposed[2]):
        return token_tensor, offsets, None
    one_hot_labels = torch.stack([
        torch.from_numpy(x.astype("uint8")) for x in transposed[2]
    ], dim=0)
    # Index of the hot entry becomes the class id.
    _, labels = one_hot_labels.max(dim=1)
    return token_tensor, offsets, labels

class Head(nn.Module):
    """The MLP submodule.

    Takes the BERT hidden states at three token positions per example,
    concatenates them and maps them to three logits through a stack of
    Linear -> BatchNorm1d -> ReLU blocks followed by a final Linear layer.

    Args:
        bert_hidden_size: width of the BERT hidden states fed to ``forward``.
        head_hidden_size: width of every hidden layer of the MLP. The default
            is ``3 ** 6`` (729); it used to be written ``3^6``, which in
            Python is a bitwise XOR and evaluates to 5.
        num_blocks: number of extra hidden blocks after the input block.
    """
    def __init__(self, bert_hidden_size: int, head_hidden_size: int = 3 ** 6, num_blocks: int = 5):
        super().__init__()
        self.bert_hidden_size = bert_hidden_size
        # Input block: three concatenated BERT vectors -> hidden width.
        fca = ([
            nn.Linear(self.bert_hidden_size * 3, head_hidden_size),
            nn.BatchNorm1d(head_hidden_size),
            nn.ReLU(),
        ])
        for _ in range(num_blocks):
            fca.append(nn.Linear(head_hidden_size, head_hidden_size))
            fca.append(nn.BatchNorm1d(head_hidden_size))
            fca.append(nn.ReLU())
        # Output layer: one logit per class.
        fca.append(nn.Linear(head_hidden_size, 3))
        self.fc = nn.Sequential(*fca)
        for module in self.fc:
            if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                print("Initing batchnorm")
            elif isinstance(module, nn.Linear):
                if getattr(module, "weight_v", None) is not None:
                    # Weight-normalized layer: check the layer itself (the old
                    # code looked up an unrelated ``model`` name here).
                    assert module.weight_g is not None
                    nn.init.uniform_(module.weight_g, 0, 1)
                    nn.init.kaiming_normal_(module.weight_v)
                    print("Initing linear with weight normalization")
                else:
                    nn.init.kaiming_normal_(module.weight)
                    print("Initing linear")
                nn.init.constant_(module.bias, 0)

    def forward(self, bert_outputs, offsets):
        """Return logits of shape (batch, 3).

        ``bert_outputs`` is indexed as (batch, position, hidden) and its last
        dimension must equal ``bert_hidden_size``. ``offsets`` holds, per
        example, the positions whose hidden vectors are gathered; the first
        Linear layer expects three of them.
        """
        assert bert_outputs.size(2) == self.bert_hidden_size
        # Pick the hidden vector at every offset, then flatten them into one
        # feature vector per example.
        extracted_outputs = bert_outputs.gather(
            1, offsets.unsqueeze(2).expand(-1, -1, bert_outputs.size(2))
        ).view(bert_outputs.size(0), -1)
        return self.fc(extracted_outputs)

    
class GAPModel(nn.Module):
    """BERT encoder followed by the ``Head`` classifier."""

    def __init__(self, bert_model: str, device: torch.device):
        super().__init__()
        self.device = device
        # Hidden width of each supported checkpoint family.
        known_widths = (
            (("bert-base-uncased", "bert-base-cased"), 768),
            (("bert-large-uncased", "bert-large-cased"), 1024),
        )
        for names, width in known_widths:
            if bert_model in names:
                self.bert_hidden_size = width
                break
        else:
            raise ValueError("Unsupported BERT model.")
        encoder = BertModel.from_pretrained(bert_model)
        self.bert = encoder.to(device)
        classifier = Head(self.bert_hidden_size)
        self.head = classifier.to(device)

    def forward(self, token_tensor, offsets):
        """Encode the token ids with BERT and classify the positions in ``offsets``."""
        ids = token_tensor.to(self.device)
        # Ids greater than zero are attended to; zeros are masked out.
        mask = (ids > 0).long()
        encoded, _ = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=None,
            output_all_encoded_layers=False,
        )
        return self.head(encoded, offsets.to(self.device))

    
def children(m):
    """Return ``m`` itself if it is a list or tuple, else its direct child modules as a list."""
    if isinstance(m, (list, tuple)):
        return m
    return list(m.children())


def set_trainable_attr(m, b):
    """Record ``b`` as ``m.trainable`` and apply it to ``requires_grad`` of every parameter of ``m``."""
    setattr(m, "trainable", b)
    for param in m.parameters():
        param.requires_grad = b


def apply_leaf(m, f):
    """Call ``f`` on ``m`` (when it is an ``nn.Module``) and, recursively, on everything below it."""
    # Children are collected before f runs, matching the original ordering.
    descendants = children(m)
    if isinstance(m, nn.Module):
        f(m)
    for child in descendants:
        apply_leaf(child, f)

            
def set_trainable(l, b):
    """Set the trainable flag to ``b`` on ``l`` and on every module nested inside it."""
    def _flag(module):
        set_trainable_attr(module, b)

    apply_leaf(l, _flag)
    
    
class GAPBot(BaseBot):
    """Training bot for the GAP task: cross-entropy loss and a single-file checkpoint policy."""

    def __init__(self, model, train_loader, val_loader, *, optimizer, clip_grad=0,
        avg_window=100, log_dir="./cache/logs/", log_level=logging.INFO,
        checkpoint_dir="./cache/model_cache/", batch_idx=0, echo=False,
        device="cuda:0", use_tensorboard=False):
        # avg_window and log_level used to be accepted here but never handed to
        # BaseBot, so values chosen by the caller were silently ignored.
        # NOTE(review): assumes BaseBot.__init__ accepts both keywords, as the
        # mirrored signature suggests — confirm against helperbot.
        super().__init__(
            model, train_loader, val_loader,
            optimizer=optimizer, clip_grad=clip_grad,
            avg_window=avg_window,
            log_dir=log_dir, log_level=log_level,
            checkpoint_dir=checkpoint_dir,
            batch_idx=batch_idx, echo=echo,
            device=device, use_tensorboard=use_tensorboard
        )
        self.criterion = torch.nn.CrossEntropyLoss()
        self.loss_format = "%.6f"

    def extract_prediction(self, tensor):
        """Return the model output unchanged (the logits are used as-is)."""
        return tensor

    def snapshot(self):
        """Override the snapshot method because Kaggle kernel has limited local disk space.

        Evaluates on the validation loader, logs the loss, and overwrites the
        single ``best.pth`` checkpoint only when the loss beats the best one
        recorded so far. Returns the validation loss.
        """
        loss = self.eval(self.val_loader)
        loss_str = self.loss_format % loss
        self.logger.info("Snapshot loss %s", loss_str)
        self.logger.tb_scalars(
            "losses", {"val": loss},  self.step)
        # Only one checkpoint file is ever kept; a better model replaces it.
        target_path = (
            self.checkpoint_dir / "best.pth")
        if not self.best_performers or (self.best_performers[0][0] > loss):
            torch.save(self.model.state_dict(), target_path)
            self.best_performers = [(loss, target_path, self.step)]
            self.logger.info("Saving checkpoint %s...", target_path)
        else:
            new_loss_str = self.loss_format % self.best_performers[0][0]
            self.logger.info("This performance:%s is not as a good as our previously saved:%s", loss_str,new_loss_str )
        # A checkpoint must exist after every snapshot (this one or an earlier one).
        assert Path(target_path).exists()
        return loss

In [6]:
# Note the crossed names: training uses gap-test.tsv, while gap-development.tsv
# is the set that predictions are later written out for.
df_train = pd.read_csv("gap-test.tsv", delimiter="\t")
df_val = pd.read_csv("gap-validation.tsv", delimiter="\t")
df_test = pd.read_csv("gap-development.tsv", delimiter="\t")
sample_sub = pd.read_csv("../input/sample_submission_stage_1.csv")
# The sample submission must have exactly one row per example in df_test.
assert sample_sub.shape[0] == df_test.shape[0]

In [7]:
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL,
    # An uncased checkpoint needs lower-cased input, so lower-casing is the
    # opposite of CASED (passing CASED itself disabled it for the uncased model).
    do_lower_case=not CASED,
    # Keep the custom position markers intact so tokenize() can find them.
    never_split = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", "[A]", "[B]", "[P]")
)
# These tokens are not actually used, so we can assign arbitrary values.
tokenizer.vocab["[A]"] = -1
tokenizer.vocab["[B]"] = -1
tokenizer.vocab["[P]"] = -1

100%|██████████| 231508/231508 [00:00<00:00, 6190762.65B/s]


In [8]:
# Tokenize every split once up front.
train_ds, val_ds, test_ds = (
    GAPDataset(frame, tokenizer) for frame in (df_train, df_val, df_test)
)

# Settings shared by all three loaders.
common_loader_args = dict(
    collate_fn=collate_examples,
    num_workers=2,
    pin_memory=True,
)

# Training: small shuffled batches, dropping the incomplete last batch.
train_loader = DataLoader(
    train_ds, batch_size=20, shuffle=True, drop_last=True, **common_loader_args
)
# Evaluation: larger batches in dataset order.
val_loader = DataLoader(
    val_ds, batch_size=128, shuffle=False, **common_loader_args
)
test_loader = DataLoader(
    test_ds, batch_size=128, shuffle=False, **common_loader_args
)

In [9]:
# Build BERT plus the classification head on the first CUDA device.
model = GAPModel(BERT_MODEL, torch.device("cuda:0"))
# You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True)
# Freeze every BERT parameter and leave only the head trainable.
set_trainable(model.bert, False)
set_trainable(model.head, True)

100%|██████████| 1248501532/1248501532 [00:25<00:00, 48194149.42B/s]


Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear
Initing batchnorm
Initing linear


In [10]:
# Optimizer hyper-parameters.
lr=1e-4
weight_decay=5e-5
# All parameters are handed to Adam, including the BERT ones that were frozen
# (requires_grad=False) when the model was built.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# The bot wires the model, the data loaders and the optimizer into a training loop.
bot = GAPBot(
    model, train_loader, val_loader,
    optimizer=optimizer, echo=True,
    avg_window=25
)

[[04/01/2019 09:08:02 PM]] SEED: 420
[[04/01/2019 09:08:02 PM]] # of paramters: 336,906,071
[[04/01/2019 09:08:02 PM]] # of trainable paramters: 1,764,183


In [11]:
# One step is one training batch, so this is the number of batches per epoch.
steps_per_epoch = len(train_loader) 
# Train for 27 epochs in total.
n_steps = steps_per_epoch * 27
# Log four times per epoch and take a snapshot (validation + checkpoint) once per epoch.
# NOTE(review): the meaning of TriangularLR's arguments (20, ratio=3) is defined in
# helperbot; the captured run starts at lr/20 (5e-06) — confirm against its docs.
bot.train(
    n_steps,
    log_interval=steps_per_epoch // 4,
    snapshot_interval=steps_per_epoch,
    scheduler=TriangularLR(
        optimizer, 20, ratio=3, steps_per_cycle=n_steps)
)

[[04/01/2019 09:08:02 PM]] Optimizer Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 0.0001
    lr: 5e-06
    weight_decay: 5e-05
)
[[04/01/2019 09:08:02 PM]] Batches per epoch: 100
[[04/01/2019 09:08:12 PM]] Step 25: train 1.138716 lr: 1.471e-05
[[04/01/2019 09:08:22 PM]] Step 50: train 1.127204 lr: 2.527e-05
[[04/01/2019 09:08:31 PM]] Step 75: train 1.136880 lr: 3.582e-05
[[04/01/2019 09:08:39 PM]] Step 100: train 1.134579 lr: 4.638e-05
100%|██████████| 4/4 [00:08<00:00,  2.10s/it]
[[04/01/2019 09:08:48 PM]] Snapshot loss 1.112209
[[04/01/2019 09:08:50 PM]] Saving checkpoint cache/model_cache/best.pth...
[[04/01/2019 09:08:50 PM]] New low

[[04/01/2019 09:09:01 PM]] Step 125: train 1.130063 lr: 5.693e-05
[[04/01/2019 09:09:09 PM]] Step 150: train 1.123101 lr: 6.749e-05
[[04/01/2019 09:09:19 PM]] Step 175: train 1.121119 lr: 7.804e-05
[[04/01/2019 09:09:28 PM]] Step 200: train 1.114836 lr: 8.860e-05
100%|██████████| 4/4 [00:08<00:00, 

In [12]:
# Load the best checkpoint: best_performers[0] is (loss, path, step), so index 1 is the file path.
bot.load_model(bot.best_performers[0][1])

In [13]:
# Evaluate on the test dataset (gap-development.tsv); the returned value is the loss.
bot.eval(test_loader)

100%|██████████| 16/16 [00:40<00:00,  2.03s/it]


1.011976761817932

In [None]:
# Extract predictions for the test dataset; they are turned into probabilities with softmax when the submission is built.
preds = bot.predict(test_loader)

 44%|████▍     | 7/16 [00:19<00:23,  2.63s/it]

In [None]:

from torch.autograd import Variable
#make_dot(model(test_ds), params=dict(model.named_parameters()))

In [None]:
def exx(row):
    """Harden a probability row into a one-hot choice, in place, and return it.

    The class whose value is strictly greater than both others gets 1 and the
    rest get 0. Whenever neither A nor B wins strictly, NEITHER is selected.
    """
    if row.A > row.B and row.A > row.NEITHER:
        winner = "A"
    elif row.B > row.A and row.B > row.NEITHER:
        winner = "B"
    else:
        winner = "NEITHER"
    for field in ("A", "B", "NEITHER"):
        setattr(row, field, 1 if field == winner else 0)
    return row

In [None]:
# Create submission file
# Softmax over the last dimension turns the predictions into class probabilities, which
# are then clipped to [1e-3, 1 - 1e-3] so that no value is exactly 0 or 1
# (presumably to bound the log-loss penalty — confirm against the competition metric).
df_sub = pd.DataFrame(torch.softmax(preds, -1).cpu().numpy().clip(1e-3, 1-1e-3), columns=["A", "B", "NEITHER"])
# Attach the example ids from the test dataframe; rows are in the same order as df_test
# because the test loader does not shuffle.
df_sub["ID"] = df_test.ID
df_sub.to_csv("submission.csv", index=False)
# Preview the first rows.
df_sub.head()