In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!nvidia-smi

Sun Feb  9 20:40:36 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.48.02    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [3]:
import torch
torch.__version__

'1.4.0'

In [0]:
# Load the pretrained `bart.large` hub interface published in the pytorch/fairseq repo
# (torch.hub reuses the local cache when one exists).
bart = torch.hub.load('pytorch/fairseq', 'bart.large')
bart.eval()  # disable dropout (or leave in train mode to finetune)

Using cache found in /root/.cache/torch/hub/pytorch_fairseq_master


In [0]:
# Freeze BART: none of its weights should receive gradients.
for frozen_param in bart.parameters():
  frozen_param.requires_grad_(False)

## Convert the whole dataset to BART token ids

In [0]:
import pandas as pd

# Input data and output locations on the mounted Drive.
PATH = '/content/drive/My Drive/Kaggle/Google QUEST Q&A Labeling/data/base/'
OUTPUT_PATH = '/content/drive/My Drive/Kaggle/Google QUEST Q&A Labeling/output/2/'

# Read the three competition CSV files in one pass.
df_train, df_test, df_sub = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Columns from position 11 onwards are the targets; positions 1, 2 and 5 are the text inputs.
output_categories = df_train.columns[11:].tolist()
input_categories = [df_train.columns[position] for position in (1, 2, 5)]
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

In [0]:
df_train.head()

In [0]:
# Sanity check on the label range: mask the frame with "target value > 1.0" and sum what is left.
# NOTE(review): presumably every sum is expected to be 0 (no label above 1.0) -- confirm.
df_train[df_train[output_categories] > 1.0].sum()

In [0]:
# torch.cuda.empty_cache()

In [0]:
# Batch sizes for training and evaluation.
TRAIN_BATCH_SIZE, EVAL_BATCH_SIZE = 4, 4

# Token budget per text field (earlier, larger settings: title 64, question 512, answer 512).
MAX_TITLE_SIZE = 32
MAX_QUESTION_SIZE = 256
MAX_ANSWER_SIZE = 256
# Title and question body are concatenated into a single sequence.
MAX_TITLE_QUESTION_SIZE = MAX_TITLE_SIZE + MAX_QUESTION_SIZE

# Feature width passed to the model as `bart_embedding_size`.
BART_EMBEDDING_SIZE = 1024
# One output per target column.
LABELS_COUNT = len(output_categories)

In [0]:
from fairseq.data.data_utils import collate_tokens

def extract_features_for_batch(batch_snt, max_length, pad_idx=1):
  """Tokenize texts with BART and force every row to exactly ``max_length`` token ids.

  Despite the name, this returns token ids, not BART features.

  Args:
    batch_snt: iterable of strings to encode with ``bart.encode``.
    max_length: number of token ids per row; longer encodings are cut off at
      the end, shorter ones are right-padded.
    pad_idx: id used for padding (default 1, the value the original code
      hard-coded in two places).

  Returns:
    Tensor of shape ``(len(batch_snt), max_length)``; an empty ``(0, max_length)``
    int64 tensor when ``batch_snt`` is empty.
  """
  rows = []
  for s in batch_snt:
    encoded = bart.encode(s)[:max_length]
    missing = max_length - len(encoded)
    if missing > 0:
      # new_full keeps the dtype/device of the encoded ids.
      encoded = torch.cat((encoded, encoded.new_full((missing,), pad_idx)), 0)
    rows.append(encoded)
  if not rows:
    # Guard for an empty batch rather than handing an empty list to collate_tokens.
    return torch.zeros([0, max_length], dtype=torch.int64)
  return collate_tokens(rows, pad_idx=pad_idx)

In [0]:
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def prepare_dataset(inpput_df, batch_size):
  """Tokenize a labelled frame into fixed-size batches of token ids and targets.

  Rows are consumed in order, ``batch_size`` at a time. Title and question body
  are tokenized separately and concatenated along the sequence axis; the answer
  is tokenized on its own. Trailing rows that do not fill a whole batch are
  dropped (a partial batch cannot be stacked with the full ones).

  Args:
    inpput_df: DataFrame with the columns ``question_title``, ``question_body``,
      ``answer`` and every column listed in ``output_categories``.
    batch_size: number of rows per batch; must be at least 1.

  Returns:
    ``(data_q, data_a, target)`` moved to ``device`` with shapes
    ``(n_batches, batch_size, MAX_TITLE_QUESTION_SIZE)`` int64,
    ``(n_batches, batch_size, MAX_ANSWER_SIZE)`` int64 and
    ``(n_batches, batch_size, LABELS_COUNT)`` float32.

  Raises:
    ValueError: if ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError('batch_size must be at least 1, got {}'.format(batch_size))

  n_batches = len(inpput_df) // batch_size
  if n_batches == 0:
    # Not even one full batch: return correctly shaped empty tensors.
    data_q = torch.zeros([0, batch_size, MAX_TITLE_QUESTION_SIZE], dtype=torch.int64)
    data_a = torch.zeros([0, batch_size, MAX_ANSWER_SIZE], dtype=torch.int64)
    target = torch.zeros([0, batch_size, LABELS_COUNT], dtype=torch.float32)
  else:
    q_batches, a_batches, t_batches = [], [], []
    for b in tqdm(range(n_batches)):
      rows = inpput_df.iloc[b * batch_size:(b + 1) * batch_size]
      titles_f = extract_features_for_batch(rows['question_title'].tolist(), MAX_TITLE_SIZE)
      questions_f = extract_features_for_batch(rows['question_body'].tolist(), MAX_QUESTION_SIZE)
      answers_f = extract_features_for_batch(rows['answer'].tolist(), MAX_ANSWER_SIZE)
      q_batches.append(torch.cat((titles_f, questions_f), 1))
      a_batches.append(answers_f)
      # Explicit float32 conversion instead of relying on torch.Tensor(list of arrays).
      t_batches.append(torch.tensor(rows[output_categories].to_numpy(dtype='float32')))
    # Stack once at the end; growing a tensor with torch.cat per batch is quadratic.
    data_q = torch.stack(q_batches, 0)
    data_a = torch.stack(a_batches, 0)
    target = torch.stack(t_batches, 0)

  data_q = data_q.contiguous().to(device)
  data_a = data_a.contiguous().to(device)
  target = target.contiguous().to(device)
  print('data_q:', data_q.shape)
  print('data_a:', data_a.shape)
  print('target:', target.shape)
  return data_q, data_a, target

# Tokenize the training frame once, up front, into batched tensors.
train_q, train_a, train_target = prepare_dataset(df_train, TRAIN_BATCH_SIZE)
# NOTE(review): prepare_dataset reads the `output_categories` columns from its input;
# presumably df_test lacks them, which would explain why this call is disabled -- confirm.
# test_q, test_a, test_target = prepare_dataset(df_test, EVAL_BATCH_SIZE)

## Build pytorch model

In [0]:
import math
import torch
import torch.nn.functional as F
import torch.nn as nn

class BARTClassificationHead(nn.Module):
    """Head for sentence-level classification tasks.

    Applies dropout -> linear -> activation -> dropout -> linear to the last
    dimension of the input and returns unnormalized scores (logits).

    Args:
        input_dim: size of the incoming feature vector.
        inner_dim: width of the hidden projection.
        num_classes: number of output scores.
        activation_fn: callable applied after the first projection.
        pooler_dropout: dropout probability used before both projections.
    """

    def __init__(
        self,
        input_dim,
        inner_dim,
        num_classes,
        activation_fn,
        pooler_dropout,
    ):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.activation_fn = activation_fn
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, features, **kwargs):
        """Map ``features`` of shape ``(..., input_dim)`` to ``(..., num_classes)`` logits.

        Extra keyword arguments are accepted and ignored.
        """
        # No debug printing here: dumping activations on every call floods the
        # output and copies each tensor to the CPU.
        x = self.dropout(features)
        x = self.dense(x)
        x = self.activation_fn(x)
        x = self.dropout(x)
        return self.out_proj(x)

class BartMultiLabelModel(nn.Module):
    """Multi-label scorer built on BART features of a question and an answer.

    Question and answer token ids are passed through ``bart.extract_features``
    separately, the two feature sequences are concatenated along the sequence
    axis, a linear layer collapses that axis to a single vector per example,
    and a classification head maps the vector to one probability per label.

    Args:
        bart: object exposing ``extract_features(tokens)``; stored as ``self.bart``.
        q_size: sequence length of the question input.
        a_size: sequence length of the answer input.
        bart_embedding_size: width of the features returned by ``bart``.
        labels_count: number of labels to predict.
        dropout: dropout probability used inside the classification head.
    """

    def __init__(self, bart, q_size, a_size, bart_embedding_size, labels_count, dropout):
        super(BartMultiLabelModel, self).__init__()

        self.bart = bart
        # Learned weighting over the (q_size + a_size) sequence positions.
        self.dense = nn.Linear(q_size + a_size, 1)
        inner_dimension = 512
        self.head = BARTClassificationHead(
            bart_embedding_size, inner_dimension,
            labels_count, torch.tanh, dropout
            )

    def forward(self, src_q, src_a):
        """Return per-label probabilities in [0, 1] for a batch.

        Args:
            src_q: question token ids, ``(batch, q_size)``.
            src_a: answer token ids, ``(batch, a_size)``.

        Returns:
            Tensor of shape ``(batch, labels_count)`` after a sigmoid.
        """
        features_q = self.bart.extract_features(src_q)
        features_a = self.bart.extract_features(src_a)
        # (batch, q_size + a_size, emb) -> (batch, emb, q_size + a_size)
        features = torch.cat((features_q, features_a), 1).permute(0, 2, 1)
        # Collapse the sequence axis: (batch, emb, 1) -> (batch, emb)
        features = self.dense(features).squeeze(2)
        labels_logits = self.head(features)
        return torch.sigmoid(labels_logits)

## Instantiate the model

In [0]:
# Dropout probability used by the classification head.
dropout = 0.2
model = BartMultiLabelModel(
    bart=bart,
    q_size=MAX_TITLE_QUESTION_SIZE,
    a_size=MAX_ANSWER_SIZE,
    bart_embedding_size=BART_EMBEDDING_SIZE,
    labels_count=LABELS_COUNT,
    dropout=dropout,
)
model = model.to(device)

In [0]:
torch.cuda.memory_allocated() / 1024 / 1024 / 1024

## Run the model

In [0]:
# Binary cross-entropy on probabilities: the model already applies a sigmoid.
criterion = nn.BCELoss()
lr = 2e-5 # learning rate
# Only hand trainable weights to Adam; the frozen BART weights never receive gradients.
optimizer = torch.optim.Adam(
    (param for param in model.parameters() if param.requires_grad), lr=lr)
# Multiply the learning rate by 0.95 on every scheduler.step(); step_size is an integer count.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

import time
def train(epoch=None):
    """Run one pass over every batch in ``train_q`` / ``train_a`` / ``train_target``.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``. Gradients
    are clipped to a norm of 0.5 before each optimizer step, and a progress line
    is printed every 50 batches.

    Args:
        epoch: optional epoch number, used only as a label in the progress line.
    """
    model.train() # Turn on the train mode
    # model.train() also flips the wrapped BART into train mode; its weights are
    # frozen, so keep its dropout disabled.
    model.bart.eval()
    total_loss = 0.
    log_interval = 50
    n_batches = train_q.size(0)
    epoch_label = '' if epoch is None else ' epoch {:3d} |'.format(epoch)
    start_time = time.time()
    # Visit every batch, including the last one.
    for i in range(n_batches):
        question, answer, target = train_q[i], train_a[i], train_target[i]
        optimizer.zero_grad()
        output = model(question, answer)
        # BCELoss needs both operands inside [0, 1].
        assert not ((target > 1.0) | (target < 0.0)).any(), 'target outside [0, 1]'
        assert not ((output > 1.0) | (output < 0.0)).any(), 'output outside [0, 1]'
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        # Report after each complete window so the average covers exactly log_interval batches.
        if (i + 1) % log_interval == 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # Scientific notation: a learning rate like 2e-5 prints as 0.00 with a fixed-point format.
            print('|{} {:5d}/{:5d} batches | '
                  'lr {:.2e} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch_label, i + 1, n_batches,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)), flush=True)
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source, loss_fn=None):
    """Compute the mean per-batch loss of ``eval_model`` over ``data_source``.

    Args:
        eval_model: module called as ``eval_model(question_batch, answer_batch)``.
        data_source: ``(questions, answers, targets)`` triple of tensors whose
            first dimension indexes batches (the layout ``prepare_dataset`` returns).
        loss_fn: loss callable ``loss_fn(output, target)``; defaults to the
            module-level ``criterion``.

    Returns:
        Average loss over all batches as a Python float.

    Raises:
        ValueError: if ``data_source`` contains no batches.
    """
    questions, answers, targets = data_source
    n_batches = questions.size(0)
    if n_batches == 0:
        raise ValueError('data_source contains no batches')
    if loss_fn is None:
        loss_fn = criterion

    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for i in range(n_batches):
            output = eval_model(questions[i], answers[i])
            total_loss += loss_fn(output, targets[i]).item()
    return total_loss / n_batches



Loop over epochs. Save the model if the validation loss is the best
we've seen so far. Adjust the learning rate after each epoch.



In [0]:
def run_training(val_data=None, epochs=3):
  """Train for ``epochs`` epochs, validating after each one when data is given.

  Uses the module-level ``model``, ``scheduler``, ``train`` and ``evaluate``.

  Args:
    val_data: validation data passed as ``evaluate(model, val_data)``; when
      ``None`` (the default) validation is skipped.
    epochs: number of epochs to run.

  Returns:
    ``(best_val_loss, best_state)``: the lowest validation loss seen and a CPU
    copy of the trainable parameters at that point. Without validation data
    this is ``(inf, None)``.
  """
  best_val_loss = float("inf")
  best_state = None

  for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    if val_data is None:
      print('-' * 89)
      print('| end of epoch {:3d} | time: {:5.2f}s | no validation data'.format(
          epoch, (time.time() - epoch_start_time)))
      print('-' * 89)
    else:
      val_loss = evaluate(model, val_data)
      print('-' * 89)
      print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                      val_loss, math.exp(val_loss)))
      print('-' * 89)

      if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot by value: keeping a reference to `model` would track later updates.
        best_state = {
            name: param.detach().cpu().clone()
            for name, param in model.named_parameters()
            if param.requires_grad
        }

    scheduler.step()

  return best_val_loss, best_state

In [0]:
from multiprocessing import Process, Queue

from queue import Empty as QueueEmpty


def _training_worker(result_queue):
  """Run ``run_training`` in the child process and send its return value to the parent."""
  result_queue.put(run_training())


# NOTE(review): the model and data were already moved to `device` in this process;
# CUDA generally cannot be re-initialized in a forked child -- confirm this works on
# GPU, otherwise call run_training() directly.
queue = Queue()
p = Process(target=_training_worker, args=(queue,))
p.start()

# Read the result before joining, and stop waiting if the child exits without
# producing one; a bare queue.get() would block forever in that case.
result = None
received = False
while not received:
  child_done = not p.is_alive()
  try:
    result = queue.get(timeout=1.0)
    received = True
  except QueueEmpty:
    if child_done:
      break
p.join() # this blocks until the process terminates

In [0]:
a = torch.Tensor([1, 2, 3])

In [0]:
((a == 5) | (a == 3)).nonzero().numpy()

In [0]:
a.shape

In [0]:
(a != 5)