In [1]:
import random
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import BatchSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pickle5

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer of the XNLI checkpoint named in the model config below.
# NOTE(review): `padding` is normally a call-time option of the tokenizer;
# confirm that passing it to `from_pretrained` has any effect.
tokenizer = AutoTokenizer.from_pretrained("joeddav/xlm-roberta-large-xnli", padding=True)

filename = 'model.sav'
# SECURITY: unpickling executes arbitrary code -- only load `model.sav` from a
# trusted source. The context manager closes the file handle, which a bare
# `open()` inside the call would leak.
with open(filename, 'rb') as model_file:
    model = pickle5.load(model_file)

# Inputs are moved to `device` inside `submission_predict`, so the weights have
# to live on the same device or the forward pass fails with a device mismatch.
model = model.to(device)
model.eval()  # inference only: disable dropout

# Last expression of the cell: display the loaded model's configuration.
model.config

XLMRobertaConfig {
  "_name_or_path": "joeddav/xlm-roberta-large-xnli",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "forced_bos_token_id": null,
  "forced_eos_token_id": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "contradiction": 0,
    "entailment": 2,
    "neutral": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": null,
  "remove_invalid_values": false,
  "transformers_version": "4.3.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

In [2]:
class MyDataset(Dataset):
    """Dataset of premise/hypothesis pairs that tokenizes on lookup.

    Args:
        df: DataFrame with 'premise' and 'hypothesis' columns, plus a 'label'
            column when `labels` is true.
        tokenizer: callable used to encode the text; it is invoked with
            `add_special_tokens=True, padding=True, return_tensors='pt'`.
        labels: when true, the 'label' column is kept and attached to every
            returned item under the 'labels' key.

    Indexing:
        `idx` is used to index the underlying numpy arrays directly, so a list
        of row positions yields a list of [premise, hypothesis] pairs that are
        encoded together in one tokenizer call.
        NOTE(review): with a single int index the tokenizer receives a flat
        two-element list [premise, hypothesis]; presumably that is encoded as
        two separate texts rather than one pair -- confirm before using this
        class without a batch sampler.
    """

    def __init__(self, df, tokenizer, labels = False):
        # (n_rows, 2) array: column 0 is the premise, column 1 the hypothesis.
        self.inputs = df.loc[:, ['premise', 'hypothesis']].values
        self.tokenizer = tokenizer
        self.labels = labels
        if self.labels:
            self.tgt = df['label'].values

    def __len__(self):
        """Number of premise/hypothesis rows."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Tokenize the rows selected by `idx` and return the encoding."""
        # Use the tokenizer given to the constructor; the original reached for
        # a module-level `tokenizer`, silently ignoring the argument.
        inputs = self.tokenizer(
            self.inputs[idx].tolist(),
            add_special_tokens=True,
            padding=True,
            return_tensors='pt',
        )
        if self.labels:
            inputs['labels'] = self.tgt[idx]
        return inputs

In [3]:
def submission_predict(model, dataloader, device):
    """Run the classifier over every batch and return the predicted class ids.

    Args:
        model: object with an `eval()` method that is callable with
            `input_ids` and `attention_mask` keyword arguments; element 0 of
            its output must hold the per-class scores, one row per example.
        dataloader: iterable of mappings holding 'input_ids' and
            'attention_mask' tensors whose last dimension is the sequence
            length. Any leading dimensions are flattened into one batch
            dimension.
        device: torch device the inputs are moved to before the forward pass.

    Returns:
        1-D float64 numpy array with the argmax class index of every example,
        in iteration order (empty when the dataloader yields nothing).
    """
    model.eval()
    # Seeding with an empty float64 array keeps the historical float64 result
    # dtype and makes the empty-dataloader case return an empty array.
    pred_chunks = [np.array([])]
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for batch in dataloader:
            inp_ids = batch['input_ids']
            mask = batch['attention_mask']
            # Collapse the extra leading dim added by the DataLoader into
            # (n_examples, seq_len). A bare `.squeeze()` would also drop the
            # batch dim when a batch contains a single example.
            inp_ids = inp_ids.reshape(-1, inp_ids.shape[-1]).to(device)
            mask = mask.reshape(-1, mask.shape[-1]).to(device)
            out = model(input_ids=inp_ids, attention_mask=mask)
            batch_preds = out[0].argmax(dim=1)
            pred_chunks.append(batch_preds.cpu().numpy())
    # Concatenate once at the end instead of re-copying the array every batch.
    return np.concatenate(pred_chunks)

In [4]:
# Alternative input source, kept for reference:
# temp_data = pd.read_csv('./temp.csv', encoding='utf8')

# One hand-written example; note the dict key is spelled 'premisis'.
string = {
    'premisis': 'I am good',
    'hypothesis': 'I am better',
}

# Two rows of [premise, hypothesis] text used as a smoke test.
data = [
    [string['premisis'], string['hypothesis']],
    ['i am good', 'I am great'],
]
temp_data = pd.DataFrame(data, columns=['premise', 'hypothesis'])

# No 'label' column exists in this frame, so labels stay disabled.
temp_dataset = MyDataset(temp_data, tokenizer, labels=False)

# The BatchSampler is handed over as `sampler`, so every dataset lookup
# receives a list of up to 8 sequential row indices.
temp_dataloader = DataLoader(
    dataset=temp_dataset,
    sampler=BatchSampler(
        SequentialSampler(temp_dataset),
        batch_size=8,
        drop_last=False,
    ),
    shuffle=False,
)

temp_preds = submission_predict(model, temp_dataloader, device)

# Display the prediction for the first pair.
temp_preds[0]

1.0

In [44]:
# Preview the first rows of the demo premise/hypothesis frame.
temp_data.head()

Unnamed: 0,premise,hypothesis
0,I am good,I am better
1,i am good,I am great


In [45]:
# Disabled export step, kept for reference. It pairs each row id with its
# prediction, swaps class ids 0 and 2, and writes the table to result.csv.
# result = np.concatenate((temp_data['id'].values.reshape(-1,1), np.int32(temp_preds.reshape(-1,1))), axis=1)
# result = pd.DataFrame(result, columns=['id', 'prediction'])
# result['prediction'] = result['prediction'].astype(np.int32).replace([0,2], [2,0])
# result.to_csv('result.csv', index=False)
# result

# Print every raw (un-swapped) predicted class id on its own line.
for pred in temp_preds:
    print(pred)

1.0
2.0


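Label ids **after** swapping 0 and 2 (the commented-out `replace([0,2], [2,0])` step above):

**0** - entailment

**1** - neutral

**2** - contradiction

Note: the raw predictions printed above have **not** been swapped; they follow `model.config.id2label` (0 = contradiction, 1 = neutral, 2 = entailment).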