# Intro:
    1. train_dataset of type IterableDataset for streaming data with no sampler. Datasets used: mnli, xnli, snli.
    2. eval_dataset and test_dataset: map type datasets for random/sequential sampler
    3. Automatically train with Trainer
    4. Error in the Trainer api at the end of eval step throws error <KeyError: 'eval_loss'>

In [1]:
# !pip install --upgrade transformers
# !pip install datasets

In [2]:
import os
import numpy as np
import random
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import transformers
import tokenizers
import datetime

from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, TrainingArguments, Trainer
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [4]:
mnli = load_dataset('glue', 'mnli')
len (mnli)

Reusing dataset glue (C:\Users\almug\.cache\huggingface\datasets\glue\mnli\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


5

In [5]:
xnli = load_dataset('xnli')
len (xnli)

Reusing dataset xnli (C:\Users\almug\.cache\huggingface\datasets\xnli\plain_text\1.0.0\95e1793728642389d5c4f86e1bd48f6d1bb9e867c7f74a0f7ae4156a09255c46)


2

In [6]:
snli = load_dataset('snli')
len (snli)

Reusing dataset snli (C:\Users\almug\.cache\huggingface\datasets\snli\plain_text\1.0.0\bb1102591c6230bd78813e229d5dd4c7fbf4fc478cec28f298761eb69e5b537c)


3

In [7]:
# shuffle and randomly sample 25% as dev/valid and rest 75% as train
original_train_df = pd.read_csv ("../input/contradictory-my-dear-watson/train.csv")
original_train_df = shuffle (original_train_df)
print (original_train_df.shape)
original_valid_df = original_train_df[:len(original_train_df)//4]
original_train_df = original_train_df[(len(original_train_df)//4):]
print (original_train_df.shape, original_valid_df.shape)
original_train_df.head()

(12120, 6)
(9090, 6) (3030, 6)


Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
6897,5e4f77f4b2,and the NIT semifinals are on tonight,The NIT semifinals take place in New York City...,en,English,1
7003,c254e444ca,yeah i i think my favorite restaurant is alway...,I am not picky about what kind of food I eat I...,en,English,0
800,afbc6c87be,"Да, он предложил купить, ну, это... швабру, та...",Он предложил найти швабру.,ru,Russian,0
5615,b13b9d140b,Không có khả năng giao tiếp là một yếu tố quan...,Mọi người gặp khó khăn khi giao tiếp tại Trung...,vi,Vietnamese,1
8551,3951024fda,أوه لا، لم أخطط واحدة من قبل ولكننا لدينا واحد...,توقفوا عن فعل ذلك منذ 10 أعوام.,ar,Arabic,2


In [8]:
original_test_df = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
original_test_df.shape

(5195, 5)

In [9]:
bert_model_name = 'bert-base-multilingual-cased'
max_len = 256
tokenizer = BertTokenizer.from_pretrained (bert_model_name, do_lower_case=False)

In [10]:
def encode (premise, hypothesis, label, lang):
    
    encoded_dict = tokenizer (
                        premise,                   # 1st of the Sentence pair to encode.
                        hypothesis,                # 2nd of the Sentence pair to encode. 
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        truncation=True,           # just max_len will not automatically truncate
                        max_length = max_len,      # Pad & truncate all sentences.
                        padding='max_length',
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
    )   
    # 1-D tensors are expected for a sample. Hence squeeze these 2-D tensors e.g [1,256] shaped tensors to 1-D [256] shape 
    for k in encoded_dict:
        encoded_dict[k] = torch.squeeze (encoded_dict[k])
    encoded_dict['label'] = label
    encoded_dict['lang'] = lang
    return encoded_dict

In [11]:
def _get_features(elt):
    '''
    Args:
        elt: elements of a `nlp.arrow_dataset.Dataset` that we have seen above
    
    Yields: tuples of 3 elements: (premise, hypothesis, language)
    '''

    if type(elt) == pd.core.series.Series:
        yield (elt['premise'], elt['hypothesis'], elt['lang_abv'])
    
    elif type(elt['premise']) == str:  
        yield (elt['premise'], elt['hypothesis'], 'en')
    
    elif type(elt) == dict:
        
        # dict of strings
        premises = elt['premise']
        
        # dict of lists
        hypotheses_dict = elt['hypothesis']
        
        # lists
        langs = hypotheses_dict['language']
        translations = hypotheses_dict['translation']
        
        hypotheses = {k: v for k, v in zip(langs, translations)}
                
        for lang in elt['premise']:
            if lang in hypotheses:
                yield (elt['premise'][lang], hypotheses[lang], lang)
    return
        
def _get_raw_datasets_from_nlp(ds):
    """ From a `ds: nlp.arrow_dataset.Dataset` that we have seen above to a generator of dictionaries with unified format.
    
    Yield a dictionary with keys: 'premise', 'hypothesis', 'label', 'lang'
    """
    
    for _, elt in enumerate(ds):
        
        label = -1
        if 'label' in elt:
            label = elt['label']
        for features in _get_features(elt):    
            # yield {'premise': features[0], 'hypothesis': features[1], 'label': label, 'lang': features[2]}
            yield (encode (features[0], features[1], label, features[2]))
    return
            
def _get_raw_datasets_from_dataframe(ds: pd.core.frame.DataFrame):
    """
    ds: pd.core.frame.DataFrame
    """
    
    result = []
    
    for idx, elt in ds.iterrows():
        for features in _get_features(elt):
            
            label = -1
            if 'label' in elt:
                label= elt['label']            
            # yield (encode {'premise': features[0], 'hypothesis': features[1], 'label': label, 'lang': features[2]})
            yield (encode (features[0], features[1], label, features[2]))
    return

In [12]:
raw_ds_mapping = {
    'original train': (_get_raw_datasets_from_dataframe, original_train_df, len(original_train_df)),
    'original valid': (_get_raw_datasets_from_dataframe, original_valid_df, len(original_valid_df)),
    'snli train': (_get_raw_datasets_from_nlp, snli['train'], snli['train'].num_rows),
    'snli valid': (_get_raw_datasets_from_nlp, snli['validation'], snli['validation'].num_rows),
    'mnli train': (_get_raw_datasets_from_nlp, mnli['train'], mnli['train'].num_rows),
    'mnli valid 1': (_get_raw_datasets_from_nlp, mnli['validation_matched'], mnli['validation_matched'].num_rows),
    'mnli valid 2': (_get_raw_datasets_from_nlp, mnli['validation_mismatched'], mnli['validation_mismatched'].num_rows),
    'xnli valid': (_get_raw_datasets_from_nlp, xnli['validation'], xnli['validation'].num_rows * 15), # 15 languages
    'original test': (_get_raw_datasets_from_dataframe, original_test_df, len(original_test_df)),
}

def get_raw_dataset(ds_name):
    
    fn, ds, nb_examples = raw_ds_mapping[ds_name]
    for x in fn(ds):
        yield x
    return

In [13]:
# sanity check
for k in raw_ds_mapping:
    for idx, x in enumerate(get_raw_dataset(k)):
        print (x)
        print (x['input_ids'].shape)
        if idx >= 0:
            break

{'input_ids': tensor([  101, 10111, 10105,   151, 37611, 82726, 10301, 10135, 22464, 27521,
          102, 10117,   151, 37611, 82726, 13574, 11192, 10106, 10287, 10482,
        10773, 22464, 27521,   119,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [14]:
train_ds_names = ['original train', 'snli train', 'mnli train', 'snli valid', 
                  'mnli valid 1', 'mnli valid 2', 'xnli valid']
eval_ds_names  = ['original valid']
test_ds_names  = ['original test']

In [15]:
class MyIterableDataset (torch.utils.data.IterableDataset):
    
    def __init__(self, ds_names, isLabeledDataset=False):
        
        super (MyIterableDataset).__init__()
        self.ds_names = ds_names
        self.length   = self.getLen ()
        self.isLabeledDataset = isLabeledDataset
        return
    
    def getLen (self):
        
        length = 0
        for ds_name in self.ds_names:
            
            fn, ds, nb_examples = raw_ds_mapping[ds_name]
            length += nb_examples
        return 128 #length
    
    def __iter__(self):
        
        junk = 0
        for ds_name in self.ds_names:
            
            fn, ds, nb_examples = raw_ds_mapping[ds_name]
            for x in fn(ds):
                # strangely sometimes label is not found i.e label=-1 by default, so filter such instanes
                if self.isLabeledDataset and x['label'] == -1:
                    continue
                junk += 1
                if junk >=128:
                    return
                yield x
        return
    
    def __len__(self):
        return self.length

In [16]:
# sanity test
train_dataset = MyIterableDataset (train_ds_names, True)
for i, d in enumerate (train_dataset):
    print (d)
    if i==1:
        break

{'input_ids': tensor([  101, 10111, 10105,   151, 37611, 82726, 10301, 10135, 22464, 27521,
          102, 10117,   151, 37611, 82726, 13574, 11192, 10106, 10287, 10482,
        10773, 22464, 27521,   119,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [17]:
# create a custom map type dataset
class My_Dataset (torch.utils.data.Dataset):
    
    def __init__(self, encodings, labels):
        
        self.encodings = encodings
        self.labels = labels
        return

    def __getitem__(self, idx):
        
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # item['label'] = torch.tensor(self.labels[idx])
        # item = {key: val[idx] for key, val in self.encodings.items()}
        # item['label'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [18]:
def get_mapType_Dataset (ds_names, isLabeledDataset):
    
    # test/eval dataset is not supposed to be infinite i.e generator/iterable dataset. 
    # It has to be an ordinary dataset which is based on a sampler. Hence, convert to an ordinary dataset
    input_ids          = []
    token_type_ids     = []
    attention_mask     = []
    label              = []
    lang               = []
    dataset_iter  = MyIterableDataset (ds_names, isLabeledDataset)
    for encoded_dict in dataset_iter:
        
        input_ids.append (encoded_dict['input_ids'])
        token_type_ids.append (encoded_dict['token_type_ids'])
        attention_mask.append (encoded_dict['attention_mask'])
        label.append (encoded_dict['label'])
        lang.append (encoded_dict['lang'])

    # Combine the training inputs into a TensorDataset. This arranges a row as a tuple of tensors.
    # Convert the lists into tensors.
    # input_ids = torch.cat (input_ids, dim=0)
    # attention_mask = torch.cat (attention_mask, dim=0)
    # token_type_ids = torch.cat (token_type_ids, dim=0)
    # label = torch.tensor (label)
    # But this works only in manually training the model because then you assign each element of the tuple 
    # to the corresponding model params. In the trainer you need to assign your custom map type dataset
    # my_dataset = TensorDataset (input_ids, attention_mask, token_type_ids, label)

    encodings = {'input_ids':input_ids, 'attention_mask':attention_mask, 'token_type_ids':token_type_ids}
    my_dataset   = My_Dataset (encodings,  label)
    return my_dataset

In [19]:
eval_dataset = get_mapType_Dataset (eval_ds_names, True)
test_dataset = get_mapType_Dataset (test_ds_names, False)

In [20]:
len (train_dataset), len (eval_dataset), len (test_dataset), isinstance (train_dataset, torch.utils.data.IterableDataset), isinstance (test_dataset, torch.utils.data.IterableDataset)

(128, 127, 127, True, False)

In [21]:
# sanity test
for i, row in enumerate (test_dataset):
    print (row)
    if i >=0:
        break

{'input_ids': tensor([  101,   764, 28744,   752, 11076, 16577,   752, 10748, 67499, 10961,
          752,   834, 82397, 17317, 11242,   752, 11076, 16161,   752, 11076,
        16161,   752, 11363, 13244, 22887, 76528,   829, 32245, 11722, 85408,
        10691, 86325, 77970, 10429, 10691, 12995, 11689,   764, 28744, 13378,
        13244, 29847,   774, 10658, 17651, 65176, 75399, 21516, 12574, 11722,
        11285, 43164, 32326, 23780,   823, 25908, 11145, 12687,   102, 11076,
        16577, 10691,   787, 26649, 40634, 59360, 50717, 21735, 18779, 47238,
          117, 13244, 22887, 12710,   829, 32245, 11722, 85408, 10691, 69883,
        23172, 13378, 10916, 11689, 12427, 13141,   788, 10673, 17571,   119,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



In [22]:
# If there's a GPU available...
if torch.cuda.is_available ():    

    # Tell PyTorch to use the GPU.    
    device = torch.device ("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count ())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device ("cpu")

No GPU available, using the CPU instead.


In [23]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained (
    
    bert_model_name,              # Use the pretrained BERT model.
    num_labels = 3,               # The number of output labels--2 for binary classification.
                                  # You can increase this for multi-class tasks.   
    output_attentions = False,    # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
# model.cuda()

training_args = TrainingArguments (
    
    output_dir='./results',          # output directory
    overwrite_output_dir=True,
    num_train_epochs=2,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy='epoch',
    logging_steps=10000,
    save_steps=10000,
    save_total_limit=1,
    # save_steps=int (len (train_dataset)/32),
    # fp16=True,
)

EPOCH_METRICS = []

def compute_metrics (pred):
    
    global EPOCH_METRICS
    
    labels = pred.label_ids
    preds = pred.predictions.argmax (-1)
    precision, recall, f1, _ = precision_recall_fscore_support (labels, preds, average='macro')
    acc = accuracy_score (labels, preds)
    mcc = matthews_corrcoef (labels, preds)        # matthews correlatoin coefficient
    metrics = {
        'mcc': mcc,
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    EPOCH_METRICS.append (metrics)
    return metrics

trainer = Trainer (
    
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [24]:
trainer.train ()

Epoch,Training Loss,Validation Loss



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



KeyError: 'eval_loss'

In [None]:
# Classification Report curve
mccs       = [e['eval_mcc'] for e in EPOCH_METRICS]
accuracies = [e['eval_accuracy'] for e in EPOCH_METRICS]
f1_scores  = [e['eval_f1'] for e in EPOCH_METRICS]
precisions = [e['eval_precision'] for e in EPOCH_METRICS]
recalls    = [e['eval_recall'] for e in EPOCH_METRICS]
losses     = [e['eval_loss'] for e in EPOCH_METRICS]
epochs     = EPOCH_METRICS[-1]['epoch']

print ('mccs:',       mccs)
print ('accuracies:', accuracies)
print ('precisions:', precisions)
print ('recalls:',    recalls)
print ('f1_scores:',  f1_scores)
print ('losses:',     losses)

sns.lineplot (x=np.arange(1, epochs + 1), y=mccs,       label='val_mcc')
sns.lineplot (x=np.arange(1, epochs + 1), y=accuracies, label='val_accuracy')
sns.lineplot (x=np.arange(1, epochs + 1), y=precisions, label='val_precision')
sns.lineplot (x=np.arange(1, epochs + 1), y=recalls,    label='val_recall')
sns.lineplot (x=np.arange(1, epochs + 1), y=f1_scores,  label='val_f1') 

In [None]:
predictions, true_labels, metrics_dummy = trainer.predict (test_dataset)
# The predictions for this batch are a 2-column ndarray (one column for "0" 
# and one column for "1"). Pick the label with the highest value and turn this
# in to a list of 0s and 1s.
pred_labels = np.argmax (predictions, axis=1)
pred_labels

In [None]:
import warnings
warnings.filterwarnings ("ignore")

submitDF = original_test_df[['id']]
submitDF['prediction'] = pred_labels
submitDF.prediction = submitDF.prediction.astype (int)
submitDF.to_csv ('submission.csv', index=False)
submitDF.head ()