In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import *
from tqdm.notebook import tqdm, tnrange

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
from threading import Thread, Semaphore
import matplotlib.pyplot as plt

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DEVICE

## Train on frozen activations

The activations are imported from [Part I](https://www.kaggle.com/alexbas/freeze-language-model-activations-part-i)

In [None]:
# Checkpoint identifier passed to both `from_pretrained` calls below.
MODEL_NAME = 'joeddav/xlm-roberta-large-xnli'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

xnli_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# `lm` is the model's `roberta` submodule; it is not moved to DEVICE here.
lm = xnli_model.roberta
# `head` is the model's `classifier` submodule, moved to DEVICE. It is the
# module that is trained and used for inference in the cells below.
head = xnli_model.classifier.to(DEVICE)

### Data Loaders
Test Loader - preserves order, fetches batches of pre-defined size

Train Loader - builds each batch from randomly picked smaller arrays (of size 16). Since retrieving a batch takes a few seconds, loading runs on a separate thread (to offload IO)

In [None]:
ROOT_DIR = '/kaggle/input/freeze-language-model-activations-part-i/'


class TestLoader:
    """Yield the saved test tensors one file at a time, in numeric file order.

    Files are read as ``<folder>/0.pt``, ``<folder>/1.pt``, ... and each one is
    loaded with ``map_location=DEVICE``.
    """

    def __init__(self):
        self.folder = f'{ROOT_DIR}test'
        # Every directory entry is counted, so the folder is expected to hold
        # only the consecutively numbered ``<i>.pt`` files.
        self.len = len(os.listdir(self.folder))

    def __len__(self):
        return self.len

    def __iter__(self):
        for file_no in range(self.len):
            yield torch.load(f'{self.folder}/{file_no}.pt', map_location=DEVICE)

In [None]:
# Batch geometry: a batch of `bs` samples is assembled from `bs // 16` files
# (each file is expected to hold 16 samples, per the loader notes above).
bs = 256
files_per_batch = bs // 16

train_folder = f'{ROOT_DIR}train'
# Label files carry an ``l_`` prefix; only the input files are counted.
total_files = sum(1 for fn in os.listdir(train_folder) if not fn.startswith('l_'))
n_batches = total_files // files_per_batch

In [None]:
# Fraction of the training files held out for validation.
test_size = .1
# Fixed seed so the train/validation split is identical after every
# "Restart Kernel -> Run All"; change it to draw a different split.
SPLIT_SEED = 42

train_len = int((1-test_size) * total_files)
# A dedicated Generator makes the split reproducible without touching NumPy's
# global random state (which the batch sampler still uses).
all_idx = np.random.default_rng(SPLIT_SEED).permutation(total_files)
train_idx = all_idx[:train_len]
val_idx = all_idx[train_len:]

In [None]:
def fetch_batch(idx):
    """Load a random batch built from `files_per_batch` distinct files.

    Args:
        idx: collection of file ids to sample from (without replacement).

    Returns:
        Tuple ``(X, y)``: the concatenated contents of the chosen
        ``<id>.pt`` files and of the matching ``l_<id>.pt`` files, in the
        same file order.
    """
    chosen = np.random.choice(idx, size=files_per_batch, replace=False)
    input_parts = [torch.load(f'{train_folder}/{fid}.pt') for fid in chosen]
    label_parts = [torch.load(f'{train_folder}/l_{fid}.pt') for fid in chosen]
    return torch.cat(input_parts), torch.cat(label_parts)
        
class Producer(Thread):
    """Daemon thread that loads one batch per request.

    Hand-off protocol with the consumer:
      * the consumer releases ``produce`` to request a batch;
      * this thread stores the result of ``fetch_batch(idx)`` in ``batch``;
      * this thread releases ``consume`` to signal that ``batch`` is ready.

    Args:
        idx: file ids forwarded to ``fetch_batch`` on every request.
    """

    def __init__(self, idx):
        # `daemon=True` replaces the deprecated `setDaemon(True)` call
        # (deprecated since Python 3.10) and keeps the same behaviour: the
        # thread does not block interpreter shutdown.
        super().__init__(daemon=True)
        self.consume = Semaphore(0)
        self.produce = Semaphore(0)
        self.batch = None
        self.idx = idx

    def run(self):
        # NOTE(review): an exception inside fetch_batch ends this thread, after
        # which a consumer waiting on `consume` would block forever.
        while True:
            self.produce.acquire()
            self.batch = fetch_batch(self.idx)
            self.consume.release()

In [None]:
class TrainDataloader:
    """Iterable that yields `n` batches per pass, prefetched by a producer thread.

    The producer must expose ``produce`` / ``consume`` semaphores and a
    ``batch`` attribute holding an ``(X, y)`` pair. While the caller works on
    one batch, the producer is already loading the next one.

    Args:
        producer: not-yet-started producer thread; it is started here.
        n: number of batches yielded by each iteration over the loader.
    """

    def __init__(self, producer, n):
        self.producer = producer
        self.n = n
        # True once a batch has been requested that no iteration has consumed
        # yet. Each pass ends with one such prefetch outstanding, so the next
        # pass must reuse it instead of requesting another batch.
        self._request_pending = False
        producer.start()

    def __len__(self):
        return self.n

    def __iter__(self):
        consume = self.producer.consume
        produce = self.producer.produce
        # Request the first batch only when nothing is in flight. Releasing
        # unconditionally on every pass would add one extra permit per epoch,
        # letting the producer run ahead and overwrite `batch` before it is
        # read (skipped / duplicated batches).
        if not self._request_pending:
            produce.release()
            self._request_pending = True
        for _ in range(self.n):
            consume.acquire()
            X, y = self.producer.batch
            # Ask for the next batch before yielding so loading overlaps with
            # the caller's work on the current one.
            produce.release()
            yield X.to(DEVICE), y.to(DEVICE)

In [None]:
# The classifier head is the module being trained and evaluated.
model = head
loss_func = nn.CrossEntropyLoss()

# NOTE(review): both loaders yield `n_batches` batches per pass, and
# `n_batches` is derived from the total file count, not from the size of
# the train / validation subsets.
train_dl = TrainDataloader(Producer(train_idx), n_batches)
val_dl = TrainDataloader(Producer(val_idx), n_batches)

In [None]:
class EpochStats():
    """Accumulates targets, predictions and loss over the batches of one epoch."""

    def __init__(self):
        self.y_hat = []   # predicted class ids, one tensor per batch
        self.y = []       # target class ids, one tensor per batch
        self.loss = 0     # running sum of per-sample losses

    def update(self, y, logits, loss):
        """Record one batch.

        Args:
            y: target tensor; its first dimension is the batch size.
            logits: model outputs; predictions are taken as ``argmax`` over dim 1.
            loss: batch loss as a Python number, expected to be the mean over
                the batch (the callers in this notebook pass ``loss.item()``
                from a mean-reduced loss).
        """
        self.y.append(y)
        self.y_hat.append(logits.argmax(1))
        # Weight the batch mean by the batch size so that dividing by the
        # total sample count in `metrics` yields a true per-sample mean.
        # Summing unweighted batch means made the reported loss too small by
        # roughly a factor of the batch size.
        self.loss += loss * y.shape[0]

    def metrics(self):
        """Return ``(accuracy, mean_loss)`` over everything recorded so far."""
        y, y_hat = [torch.cat(l) for l in [self.y, self.y_hat]]
        loss = self.loss / y.shape[0]
        accuracy = (y == y_hat).float().mean().item()
        return accuracy, loss

In [None]:
def train_batch(model, opt, loss_f, X, y):
    """Perform one optimisation step on a single batch.

    Returns:
        Tuple ``(logits, loss)`` where ``logits`` is the model output for
        ``X`` and ``loss`` is the batch loss as a Python float.
    """
    outputs = model(X)
    batch_loss = loss_f(outputs, y)
    # Clear stale gradients right before computing the fresh ones.
    opt.zero_grad()
    batch_loss.backward()
    opt.step()
    return outputs, batch_loss.item()
    
def train_epoch(model, opt, loss_f, dl, e):
    """Train for one pass over `dl` and return the epoch metrics.

    Args:
        e: epoch number, used only for the progress-bar label.

    Returns:
        Whatever ``EpochStats.metrics()`` reports for the batches seen.
    """
    model.train()
    epoch_stats = EpochStats()
    progress = tqdm(dl, desc=f"Epoch {e}", leave=False)
    for inputs, targets in progress:
        batch_logits, loss_value = train_batch(model, opt, loss_f, inputs, targets)
        epoch_stats.update(targets, batch_logits, loss_value)
    return epoch_stats.metrics()

def eval_epoch(model, loss_f, dl):
    """Evaluate `model` on one pass over `dl` without tracking gradients.

    Returns:
        Whatever ``EpochStats.metrics()`` reports for the batches seen.
    """
    model.eval()
    epoch_stats = EpochStats()
    with torch.no_grad():
        for inputs, targets in dl:
            outputs = model(inputs)
            loss_value = loss_f(outputs, targets).item()
            epoch_stats.update(targets, outputs, loss_value)
    return epoch_stats.metrics()

In [None]:
class Trainer:
    """Runs the train / validate loop and keeps a per-epoch metric history.

    Each entry of ``metrics`` is the training metrics tuple followed by the
    validation metrics tuple; `fit` prints and `show_lc` plots them as
    ``(train acc, train loss, val acc, val loss)``.
    """

    def __init__(self, model, opt, loss_f, train_dl, val_dl, scheduler=None):
        self.model = model
        self.opt = opt
        self.loss_f = loss_f
        self.train = train_dl
        self.val = val_dl
        self.scheduler = scheduler
        self.metrics = []

    def fit(self, n_epochs=1, print_metrics = True):
        """Train for `n_epochs`, validating (and optionally printing) after each."""
        for epoch in tnrange(n_epochs):
            train_metrics = train_epoch(self.model, self.opt, self.loss_f,
                                        self.train, epoch)
            # The scheduler advances once per epoch, after the training pass.
            if self.scheduler:
                self.scheduler.step()
            val_metrics = eval_epoch(self.model, self.loss_f, self.val)

            epoch_metrics = train_metrics + val_metrics
            self.metrics.append(epoch_metrics)
            if print_metrics:
                print("Epoch %d. Train acc: %.3f, loss: %.6f | Val acc: %.3f, loss: %.6f" %
                      (epoch, *epoch_metrics))

    def show_lc(self):
        """Plot train vs. validation learning curves for loss and accuracy."""
        history = np.array(self.metrics)
        epochs = np.arange(history.shape[0])
        _, axes = plt.subplots(1, 2, figsize=(10, 4))
        # (panel title, train column, validation column) in the history array.
        panels = (("Loss", 1, 3), ("Accuracy", 0, 2))
        for axis, (title, train_col, val_col) in zip(axes, panels):
            axis.set_title(title)
            axis.plot(epochs, history[:, train_col], label="Train")
            axis.plot(epochs, history[:, val_col], label="Val")
            axis.legend()

In [None]:
lr = 3e-4

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Multiply the learning rate by 0.6 on every scheduler step.
scheduler = StepLR(optimizer, step_size=1, gamma=.6)
trainer = Trainer(model, optimizer, loss_func, train_dl, val_dl, scheduler)
trainer.fit(20)

In [None]:
# Plot the train / validation loss and accuracy recorded by `trainer.fit`.
trainer.show_lc()

In [None]:
def switch_labels(y):
    """Mirror class ids around 1: 0 -> 2, 1 -> 1, 2 -> 0 (works element-wise)."""
    return 2 - y

def predict(model, dl):
    """Run inference over `dl` and return the predicted labels as a NumPy array.

    The model is put in eval mode and gradients are disabled for the duration
    of the call, so results do not depend on the mode the model was left in
    (e.g. active dropout) and no autograd graph is built. The model's previous
    train/eval mode is restored afterwards.

    Args:
        model: ``torch.nn.Module`` mapping a batch to logits of shape
            ``(batch, n_classes)``.
        dl: iterable yielding input batches.

    Returns:
        1-D array of ``argmax`` class ids passed through ``switch_labels``.
    """
    was_training = model.training
    model.eval()
    y_hat = []
    try:
        with torch.no_grad():
            for X in tqdm(dl, "Running inference"):
                logits = model(X)
                y_hat.append(logits.argmax(1))
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    preds = torch.cat(y_hat).cpu().numpy()
    preds = switch_labels(preds)
    return preds
    
# Predict labels for every file served by TestLoader, in file order.
predictions = predict(model, TestLoader())

In [None]:
# Attach the predictions to the competition's test table and write the
# two-column submission file.
df_test = (
    pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')
    .assign(prediction=predictions)
)
df_test.loc[:, ['id', 'prediction']].to_csv('submission.csv', index=False)

In [None]:
# Sanity check: show the header and the first two rows of the submission.
with open('submission.csv') as submission:
    preview = [submission.readline().strip() for _ in range(3)]
print(*preview, sep='\n')