In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train.csv into a DataFrame and preview its first two rows.
TRAIN_CSV = '/kaggle/input/contradictory-my-dear-watson/train.csv'
ds = pd.read_csv(TRAIN_CSV)
ds.head(n=2)

In [None]:
# Read the competition's test.csv into a DataFrame and preview its first two rows.
TEST_CSV = '/kaggle/input/contradictory-my-dear-watson/test.csv'
ds_test = pd.read_csv(TEST_CSV)
ds_test.head(n=2)

In [None]:
# Human-readable class names, indexed by the integer value of the `label` column.
LABELS = ['entailment', 'neutral', 'contradiction']


def show(row):
    """Print one example: its premise, its hypothesis and the name of its label.

    `row` is any object exposing `premise`, `hypothesis` and an integer
    `label` attribute (for instance one row of the training DataFrame).
    """
    for tag, attribute in (('1)', 'premise'), ('2)', 'hypothesis')):
        print(tag, getattr(row, attribute))
    print('Label:', LABELS[row.label])
    
# Display one randomly chosen example from the rows whose language code is 'en'.
engs = ds.loc[ds['lang_abv'] == 'en']
idx = np.random.randint(len(engs))
show(engs.iloc[idx])

In [None]:
import torch
import torch.nn as nn
from tqdm.notebook import tqdm, tnrange

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DEVICE

### Labels

In the [original dataset](https://huggingface.co/datasets/xnli) the labels are in reverse order: 0 is Contradiction and 2 is Entailment (Neutral stays the same: 1).


In [None]:
def switch_labels(y):
    """Mirror label values around 1, so that 0 <-> 2 while 1 stays 1.

    Works element-wise on anything supporting arithmetic with ints
    (plain numbers, tensors, arrays).
    """
    offset_from_neutral = y - 1
    return 1 - offset_from_neutral
    
# Turn the training label column into a tensor and flip it in a single step.
labels = switch_labels(torch.tensor(ds.label))

### Model

In [None]:
# Hugging Face checkpoint id shared by the tokenizer and the classifier below.
MODEL_NAME = 'joeddav/xlm-roberta-large-xnli'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
xnli_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Keep separate handles on the two parts of the classifier: its `classifier`
# head, and its `roberta` sub-module, which is moved to DEVICE.
head = xnli_model.classifier
lm = xnli_model.roberta.to(DEVICE)

In [None]:
def ds_to_tensors(ds):
    """Tokenize the premise/hypothesis pairs of `ds` with the notebook's tokenizer.

    Args:
        ds: DataFrame with `premise` and `hypothesis` columns.

    Returns:
        A tuple `(input_ids, attention_mask)` of PyTorch tensors, as produced
        by the tokenizer with padding enabled and truncation applied to the
        first sequence of each pair only.
    """
    sentence_pairs = ds[['premise', 'hypothesis']].values.tolist()
    encoding = tokenizer(
        sentence_pairs,
        add_special_tokens=True,
        padding=True,
        truncation='only_first',
        return_tensors="pt",
    )
    return encoding.input_ids, encoding.attention_mask

# Tokenize the whole training frame in one call; %time reports how long it takes.
%time ids, mask = ds_to_tensors(ds)
# Expected to be (number of rows in ds, padded sequence length) -- check the output.
ids.shape

In [None]:
# Create the folders that receive the cached activations. exist_ok=True keeps
# this cell safe to re-run when the folders are already there.
os.makedirs('train', exist_ok=True)
os.makedirs('test', exist_ok=True)

In [None]:
def freeze(model, ids, masks, folder, batch_size=512, labels=None):
    """Run `model` over the inputs in batches and save each batch's activations.

    For batch number ``i`` the model's ``last_hidden_state`` is written to
    ``{folder}/{i}.pt``; when `labels` is given, the matching slice of labels
    is written to ``{folder}/l_{i}.pt``. `folder` must already exist.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``last_hidden_state``.
        ids: token-id tensor, indexed by example along its first dimension.
        masks: attention-mask tensor aligned with `ids` along the first dimension.
        folder: directory that receives the ``.pt`` files.
        batch_size: number of examples per forward pass and per saved file.
        labels: optional per-example labels, sliced with the same ranges as `ids`.

    Returns:
        The number of batches processed (and activation files written).

    Raises:
        ValueError: if `ids` and `masks` disagree on the number of examples.
    """
    m = ids.shape[0]
    if masks.shape[0] != m:
        raise ValueError(
            f'ids and masks must have the same number of rows, '
            f'got {m} and {masks.shape[0]}'
        )
    n_iters = (m + batch_size - 1) // batch_size  # ceil(m / batch_size)
    model.eval()  # inference mode for the forward passes below
    with torch.no_grad():  # no gradients needed: activations are only stored
        for batch_i in tqdm(range(n_iters)):
            start = batch_i * batch_size
            end = min(m, start + batch_size)
            batch_ids = ids[start:end].to(DEVICE)
            # Slice the `masks` argument rather than a notebook-level global,
            # so the mask always belongs to the `ids` that were passed in.
            batch_mask = masks[start:end].to(DEVICE)
            output = model(input_ids=batch_ids,
                           attention_mask=batch_mask)
            activations = output.last_hidden_state
            # NOTE: the tensor is saved on the device it was computed on; pass
            # map_location to torch.load when reading it on a different device.
            torch.save(activations,
                       f'{folder}/{batch_i}.pt')
            if labels is not None:
                torch.save(labels[start:end], f'{folder}/l_{batch_i}.pt')

    return n_iters

# Cache the training activations in batches of 16, saving the flipped labels alongside.
freeze(lm, ids, mask, folder='train', batch_size=16, labels=labels)

In [None]:
# Tokenize the test frame and cache its activations in batches of 256.
# No labels are passed, so only the activation files are written.
ids, mask = ds_to_tensors(ds_test)
freeze(lm, ids, mask, folder='test', batch_size=256)