In [1]:
# Load (or reload) IPython's autoreload extension. Mode 2 re-imports modules
# before each cell executes, so edits to the local `utils` / `model` packages
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [46]:
import os
import torch
import random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
from utils import io
from utils import plot
from utils import metric
from model import train_evaluate

from model import xlmr_xnli_model
from model import xlmr_xnli_dataset

from transformers import XLMRobertaTokenizer, XLMRobertaModel

## XNLI

In [4]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
# One seed shared by every random number generator this notebook touches.
seed = 144

# Seed Python's `random`, NumPy and torch, in that order.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

# Ask cuDNN to use deterministic algorithms only.
torch.backends.cudnn.deterministic = True

In [6]:
# Number of examples per batch; handed to the test DataLoader further down.
batch_size = 32

In [9]:
# Sample data locations under data/test.
# NOTE(review): `output_file` is not referenced by any later cell in this
# view — presumably the reference output for the sample input; confirm.
input_file = 'data/test/sample_input'
output_file = 'data/test/sample_output'

# Checkpoint and its companion info file from experiment run R_014
# (LinearHead). The info file supplies the model parameters and dataset
# metadata loaded in the next section.
model_file = "experiments/LinearHead/R_014/best_014.pth.tar"
info_file = "experiments/LinearHead/R_014/info_014.pth.tar"

### information file

In [11]:
# Read the info file saved with the checkpoint. It unpacks into two objects:
#   model_params - later expanded as keyword arguments to the model constructor
#   dataset_info - indexed below by 'language' and 'gold_labels'
model_params, dataset_info = io.load_info_file(info_file)

### load data

In [17]:
# Languages of interest for this test run.
languages = ['zh', 'es', 'hi', 'sw']

# Language name -> integer code, where the code is the language's position in
# dataset_info['language'].
lang_code_map = dict(
    (name, position) for position, name in enumerate(dataset_info['language'])
)

# Reverse lookup restricted to the languages above: integer code -> name.
lang_codes = dict((lang_code_map[name], name) for name in languages)

In [19]:
# Load the test examples from `input_file`. The language list stored in the
# info file is passed along — presumably so language codes match the ones
# used at training time; TODO confirm in utils.io.
data = io.load_xnli_test_dataset(input_file, dataset_info['language'])

### dataloader

In [21]:
# Tokenizer for the pretrained "xlm-roberta-base" checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [22]:
# Wrap the loaded data in the project's dataset class. Note that the CPU
# device is passed explicitly here instead of `device`. isTrain=False
# presumably means no gold label is attached to each item — the sample item
# displayed below holds only input_ids, attention_mask and language.
test_dataset = xlmr_xnli_dataset.XLMRXNLIDataset(data, tokenizer, torch.device('cpu'), isTrain=False)

In [27]:
# Sanity check: display the first encoded example.
test_dataset[0]

{'input_ids': tensor([    0, 81713,  6284,  2467, 10848,     4,  4006,  2259,   524, 22567,
             4,     6, 56906,     4,   253,     4,  2259,   687, 28617,  1358,
           158,  3030,     5,     2,     2, 14343,  4022,   158,  3030,  3178,
         16859,   681, 10900, 11847,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'language': tensor(13)}

In [28]:
# Batch the test set in its stored order (no shuffling, no dropped tail batch,
# loading in the main process). tokenizer.pad is the collate function, so the
# examples of each batch are padded together by the tokenizer.
test_dataloader = DataLoader(
    dataset=test_dataset,
    collate_fn=tokenizer.pad,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [29]:
# Pull a single batch to check that batching and padding work. `batch` is not
# used by any later cell in this view.
batch = next(iter(test_dataloader))

### model

In [31]:
# Build the model from the parameters stored in the info file.
# NOTE(review): the class name reads "XLNI" rather than "XNLI"; presumably it
# matches the spelling in model/xlmr_xnli_model.py — confirm before renaming.
xnli_model = xlmr_xnli_model.XLMRXLNIModel(**model_params)

In [32]:
# Load the checkpoint at `model_file` for `xnli_model` on `device`
# (presumably restores the trained weights into the model in place — confirm
# in utils.io). The returned `state` is not used by later cells in this view.
state = io.load_checkpoint(model_file, xnli_model, device)

In [36]:
# Run prediction over every batch of the test loader on `device`.
# dataset_info['gold_labels'] is passed in as well — presumably to turn
# predicted class indices back into label names; verify in
# model.train_evaluate.
output = train_evaluate.predict(xnli_model, test_dataloader, dataset_info['gold_labels'], device)

  0%|          | 0/1 [00:00<?, ?it/s]

In [48]:
# Destination for the predictions.
save_file = 'data/test/output_file'

# Make sure the destination folder exists before writing (no error if it
# is already there; missing parent folders are created too).
Path(save_file).parent.mkdir(exist_ok=True, parents=True)

# Write the predictions as bare CSV: no header row and no index column.
pd.DataFrame(output).to_csv(save_file, index=False, header=False)