# 모델을 바꿔서 성능을 높여보자.
- [Contradictory, My Dear Watson using XLNI Robert2](https://www.kaggle.com/rahulbana/contradictory-my-dear-watson-using-xlni-robert2)
  - https://huggingface.co/joeddav/xlm-roberta-large-xnli 모델 사용

In [None]:
import pandas as pd

In [None]:
train = pd.read_csv('../input/contradictory-my-dear-watson/train.csv')
test = pd.read_csv('../input/contradictory-my-dear-watson/test.csv')
submission = pd.read_csv('../input/contradictory-my-dear-watson/sample_submission.csv')

In [None]:
submission

In [None]:
train['language'].value_counts(normalize=True)*100

In [None]:
test['language'].value_counts(normalize=True)*100

In [None]:
train.language.unique()

# Visualization

In [None]:
train['label_str'] = train['label'].map({0 : "entailment", 1 : "neutral", 2 : "contradiction"})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 

plt.figure(figsize=(8,5))
sns.countplot(y ='label_str', data = train, alpha=.5, palette="muted")

In [None]:
plt.figure(figsize=(8,5))
sns.countplot(y ='language', hue = "label_str", data = train, alpha=.5, palette="muted")

# 모델 고르기

Multi-lingual model 을 사용하기 위해서 [huggingface](https://huggingface.co/transformers/multilingual.html) 사이트를 참고하였습니다. multilingual model 이 지원하는 언어가 XNLI 기준인 경우에는 [facebook 의 XNLI github](https://github.com/facebookresearch/XNLI) 을 보면 되는데, 다음과 같은 14개의 언어를 타겟으로 하고 있으며, 해당 언어들은 이 competition 에서 제공하는 train set 에 포함된 언어와 일치합니다.
- French, Spanish, German, Greek, Bulgarian, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, Hindi, Swahili and Urdu

이 중에서 저는 다음과 같은 XLM-RoBERTa 모델을 사용해보고자 합니다.
- 100개의 언어로 된 새롭게 생성된 깨끗한 CommonCrawl data 2.5TB 를 기반으로 학습되었습니다.
- mBERT, XLM 과 같은 언어모델보다 downstream tasks 에 강점을 가진다고 합니다. (분류, sequence labeling, question answering)
- 2개의 모델이 존재합니다: xlm-roberta-base, xlm-roberta-large

저는 large 를 사용해보도록 하겠습니다.
[이 사이트](https://huggingface.co/transformers/model_doc/xlmroberta.html) 를 살펴보면 대략적인 사용방법에 대해서 알 수 있습니다. 너무 어려워서 [이 노트북](https://www.kaggle.com/jbagdon/predict-with-tf-xlm-roberta-large) 을 따라하기로 하였습니다.

In [None]:
# !pip install -q transformers==3.0.2
!pip install -q nlp

In [None]:
from transformers import BertTokenizer, AutoTokenizer, TFBertModel, TFXLMRobertaModel, TFAutoModel
import tensorflow as tf
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, LSTM, Embedding, GlobalAveragePooling1D
from keras.optimizers import Adam

from nlp import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# TPU 사용 준비

In [None]:
# Pick the tf.distribute strategy for this session. The resulting `strategy`
# is read later for the global batch size and for the model-building scope.
try:
    # A ValueError from this block is treated below as "fall back to the default strategy".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU: {tpu.master()}')
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TF releases expose this as tf.distribute.TPUStrategy and
    # deprecate the experimental alias - confirm against the installed TF version.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU
print('Number of replicas:', strategy.num_replicas_in_sync)

# 모델 준비

In [None]:
# https://huggingface.co/jplu/tf-xlm-roberta-large
# encoder_handle = 'jplu/tf-xlm-roberta-large'
encoder_handle = 'joeddav/xlm-roberta-large-xnli'

In [None]:
# https://huggingface.co/jplu/tf-xlm-roberta-large/raw/main/config.json 가능
!curl https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/config.json

In [None]:
tokenizer = AutoTokenizer.from_pretrained(encoder_handle)

In [None]:
# Shared training hyperparameters; later cells read these as globals.
# Isn't this too small?
max_len = 120 # max sequence length
# random_seed = 2021
random_seed = 11887
learning_rate = 1e-5 # Controls how large a step is taken when updating model weights during training.
epochs = 3
batch_size = 16 * strategy.num_replicas_in_sync # The number of examples that will be processed in parallel during training. Tailored for TPUs.
loss = 'sparse_categorical_crossentropy'
metrics = ['accuracy']
# steps_per_epoch = 1000

# Passed to Dataset.prefetch() in build_dataset.
auto = tf.data.experimental.AUTOTUNE

# encoding 함수

ragged: 누더기가 된

tf.ragged.constant: 데이터는 다양한 형태로 제공됩니다; 텐서도 마찬가지입니다. 비정형 텐서는 중첩 가변 길이 목록에 해당하는 텐서플로입니다. 다음을 포함하여 균일하지 않은 모양으로 데이터를 쉽게 저장하고 처리할 수 있습니다.
- 일련의 영화의 배우들과 같은 가변 길이 기능
- 문장이나 비디오 클립과 같은 가변 길이 순차적 입력의 배치
- 절, 단락, 문장 및 단어로 세분화된 텍스트 문서와 같은 계층적 입력
- 프로토콜 버퍼와 같은 구조화된 입력의 개별 필드

ragged tensor를 to_tensor 를 통해서 일반 tensor로 바꾸게 되면, 내부의 모든 데이터가 같은 길이를 갖게 된다. 이 때 짧은 값들이 길어지게 되면서 새로운 값들이 채워지게 되는데 그 때 사용되는 값이 default_value 값이다.

In [None]:
def encode_sentence(s, tokenizer):
    """
    Convert one sentence into vocabulary ids, terminated by the separator token.

    Args:
        s (str) - Sentence to encode.
        tokenizer - Object exposing `tokenize`, `sep_token` and
            `convert_tokens_to_ids` (the XLM-R tokenizer in this notebook).
    Returns:
        Whatever `tokenizer.convert_tokens_to_ids` returns for the sentence's
        tokens followed by the separator token.
    """
    pieces = [*tokenizer.tokenize(s), tokenizer.sep_token]
    return tokenizer.convert_tokens_to_ids(pieces)

def _pad_or_truncate(ragged, max_len, pad_value):
    """Return `ragged` as a dense [batch, max_len] tensor: rows are cut at max_len, then right-padded with pad_value."""
    dense = ragged[:, :max_len].to_tensor(default_value=pad_value)
    # to_tensor() only pads up to the longest row in the batch; extend to max_len.
    shortfall = max_len - tf.shape(dense)[1]
    dense = tf.pad(dense, [[0, 0], [0, shortfall]], constant_values=pad_value)
    return tf.reshape(dense, [-1, max_len])


def tokenize(data, tokenizer, max_len):
    """
    Encode sentence pairs into fixed-width id tensors using a selected tokenizer.

    Args:
        data - A sequence of [first sentence (str), second sentence (str)] pairs
            (this notebook passes [premise, hypothesis]).
        tokenizer - Tokenizer handle.
        max_len - Max sequence length.
    Returns: (dictionary of tensors, each of shape [len(data), max_len])
        input_word_ids - Indices of input sequence tokens in the vocabulary, truncated
            to max_len and right-padded with the tokenizer's pad id.
        input_mask - 1 where input_word_ids differs from the pad id, 0 elsewhere.
        input_type_ids - 0 for the CLS token and the first sentence, 1 for the second
            sentence, 0 for padding; truncated/padded like input_word_ids.
    """

    PAD_ID = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

    # Append a separator to each sentence, tokenize, and concatenate.
    # NOTE(review): XLM-R's own pair encoding puts two separators between the
    # segments; this builder inserts one - confirm whether that matters for the
    # chosen checkpoint.
    tokens1 = tf.ragged.constant([encode_sentence(s[0], tokenizer) for s in data], dtype=tf.int32) # ENCODED_SEQUENCE_A [SEP]
    tokens2 = tf.ragged.constant([encode_sentence(s[1], tokenizer) for s in data], dtype=tf.int32) # ENCODED_SEQUENCE_B [SEP]
    cls_label = [tokenizer.convert_tokens_to_ids([tokenizer.cls_token])]*tokens1.shape[0] # one [CLS] id per row
    tokens = tf.concat([cls_label, tokens1, tokens2], axis=-1) # [CLS] ENCODED_SEQUENCE_A [SEP] ENCODED_SEQUENCE_B [SEP]

    # Truncate to max_len, then pad with the pad id (not necessarily zero) up to max_len.
    input_word_ids = _pad_or_truncate(tokens, max_len, PAD_ID)

    # The input mask allows the model to cleanly differentiate between the content and the padding.
    input_mask = tf.cast(input_word_ids != PAD_ID, tf.int32)

    # Map [CLS]/tokens1 positions to zeroes and tokens2 positions to ones, then give
    # the result the same fixed width as the other two tensors (previously it kept
    # the width of the longest untruncated row).
    type_ids = tf.concat([tf.zeros_like(cls_label), tf.zeros_like(tokens1), tf.ones_like(tokens2)], axis=-1)
    input_type_ids = _pad_or_truncate(type_ids, max_len, 0)

    inputs = {
      'input_word_ids': input_word_ids,
      'input_mask': input_mask,
      'input_type_ids': input_type_ids}

    return inputs

In [None]:
def build_dataset(x, y, mode, batch_size):
    """
    Build a batched tf.data pipeline for training, validation, or testing.

    (This function is borrowed from some of the other notebooks in this competition -
    not sure who to credit exactly so thanks all!)

    Args:
        x - Model inputs, sliced along the first dimension.
        y - Labels paired with x; ignored when mode is "test".
        mode - One of "train", "valid" or "test".
        batch_size - Number of examples per batch.
    Returns:
        "train": (x, y) repeated, shuffled, batched and prefetched.
        "valid": (x, y) batched, cached and prefetched.
        "test":  x batched.
    Raises:
        NotImplementedError - For any other mode.
    """
    if mode not in ("train", "valid", "test"):
        raise NotImplementedError

    if mode == "test":
        # No labels at inference time.
        return tf.data.Dataset.from_tensor_slices(x).batch(batch_size)

    labelled = tf.data.Dataset.from_tensor_slices((x, y))
    if mode == "train":
        # Repeats indefinitely, so the caller has to bound each epoch itself.
        labelled = labelled.repeat().shuffle(5678)
        return labelled.batch(batch_size).prefetch(auto)

    return labelled.batch(batch_size).cache().prefetch(auto)

# Train, validation dataset 준비

- mnli 데이터셋도 함께 사용하면 성능이 올라가는 것으로 보인다. (참고: https://www.kaggle.com/rahulbana/contradictory-my-dear-watson-using-xlni-robert2)
하지만 다음의 코드는 왠지 모르게 잘 작동하지 않음

```

```

그래서 huggingface 의 [datasets](https://github.com/huggingface/datasets) 라이브러리를 이용하기로 함.

In [None]:
!pip3 install datasets

In [None]:
from datasets import load_dataset

def load_mnli(use_validation=True):
    """
    Load the 'multi_nli' dataset and return it as a DataFrame.

    Args:
        use_validation - When true, the 'validation_matched' and
            'validation_mismatched' splits are included after 'train'.
    Returns:
        DataFrame with columns premise, hypothesis, label, lang_abv. Only records
        with a non-empty premise and hypothesis and a label in {0, 1, 2} are kept;
        lang_abv is always 'en'.
    """
    dataset = load_dataset('multi_nli')
    print(dataset['train'])

    splits = ['train']
    if use_validation:
        splits += ['validation_matched', 'validation_mismatched']

    valid_labels = {0, 1, 2}
    rows = []
    for split in splits:
        rows.extend(
            (rec['premise'], rec['hypothesis'], rec['label'], 'en')
            for rec in dataset[split]
            if rec['premise'] and rec['hypothesis'] and rec['label'] in valid_labels
        )

    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])

In [None]:
mnli = load_mnli()

In [None]:
# Stack the competition rows (text pair + label only) on top of the MNLI rows.
total_train = pd.concat([train[['premise', 'hypothesis', 'label']], mnli], axis=0)

In [None]:
total_train

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(total_train[['premise', 'hypothesis']].values.tolist(), total_train['label'], test_size=0.25, random_state=12345)

In [None]:
x_train_ = tokenize(x_train, tokenizer, max_len)
x_valid_ = tokenize(x_valid, tokenizer, max_len)

In [None]:
train_dataset = build_dataset(x_train_, y_train, "train", batch_size)
valid_dataset = build_dataset(x_valid_, y_valid, "valid", batch_size)

# Train model

[transformers.TFXLMRobertaModel](https://huggingface.co/transformers/model_doc/xlmroberta.html#tfxlmrobertamodel) 은 tf.keras.Model 의 subclass 로 만들어진 TF 2.0 Keras model 이다. return_dict=True 인 경우에는 TFBaseModelOutputWithPooling 이 반환되며, 그렇지 않으면 tf.Tensor 의 tuple 이 반환된다. roberta 가 뱉은 output 중
- last_hidden_state 는 `(batch_size, sequence_length, hidden_size)` 의 shape 을 갖는다. 
- pooler_output 은 `(batch_size, hidden_size)` 의 크기를 가지며 classification token 인 sequence 의 첫번째 토큰의 hidden-state 이다.   
  - pooler_output (tf.Tensor of shape (batch_size, hidden_size)) – Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pretraining.
  - This output is usually not a good summary of the semantic content of the input, **you’re often better with averaging or pooling the sequence of hidden-states for the whole input sequence.**

roberta 의 output 을 3개의 unit 에 대한 softmax 로 만드는 것이 목표인데, 이 경우 보통은 Flatten 을 하고 Dense 를 쌓는 방식으로 가는데 여기서는 특이하게도 GlobalAveragePooling1D 를 사용하였다. 아마 위에서 hidden-states 를 averaging 하거나 pooling 하는게 좋다고 해서 그런것 같다.

In [None]:
def build_model(encoder_handle, random_seed, learning_rate, loss, metrics, max_len):
    """
    Build and compile a 3-way classifier on top of a pretrained encoder.

    The encoder's first output (the per-token hidden states) is average-pooled
    over the sequence axis and fed to a 3-unit softmax layer. The model is
    created inside the global `strategy` scope.

    Args:
        encoder_handle - Name or path passed to TFAutoModel.from_pretrained.
        random_seed - Seed passed to tf.random.set_seed.
        learning_rate - Learning rate for the Adam optimizer.
        loss - Loss passed to model.compile.
        metrics - Metrics passed to model.compile.
        max_len - Fixed sequence length of the inputs.
    Returns:
        The compiled Keras model, with inputs named "input_word_ids" and
        "input_mask" (both keys are produced by tokenize()).
    """
    tf.keras.backend.clear_session()
    tf.random.set_seed(random_seed)

    with strategy.scope():

        input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
        input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
        # RoBERTa doesn't use token_type_ids.

        # Create an instance of the model defined in encoder_handle.
        encoder = TFAutoModel.from_pretrained(encoder_handle)
        # Pass the mask so that self-attention ignores padding positions; without
        # it every pad token is attended to like real content.
        sequence_output = encoder(input_word_ids, attention_mask=input_mask)[0]
        # NOTE(review): the average below still includes the hidden states at
        # padded positions; consider a masked mean if that turns out to matter.
        pooled = GlobalAveragePooling1D()(sequence_output)
        probabilities = Dense(3, activation='softmax')(pooled)

        model = Model(inputs=[input_word_ids, input_mask], outputs=probabilities)
        # Use the tf.keras optimizer (the model is a tf.keras model) and the
        # current `learning_rate` keyword instead of the deprecated `lr`.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss=loss,
                      metrics=metrics)

    model.summary()

    return model

In [None]:
model = build_model(encoder_handle, random_seed, learning_rate, loss, metrics, max_len)

In [None]:
# Watch the validation loss (lower is better); after 2 epochs without
# improvement stop and roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# The "train" dataset from build_dataset repeats indefinitely, so the length of
# an epoch is given explicitly: the number of full batches in one pass over x_train.
steps_per_epoch = len(x_train) // batch_size
# `history` keeps the per-epoch losses/metrics that the plotting cell reads.
history = model.fit(train_dataset,
                    validation_data=valid_dataset,
                    steps_per_epoch=steps_per_epoch,
                    epochs=epochs,
                    callbacks=[early_stopping])

In [None]:
print(history.history.keys())

In [None]:
import numpy as np

# 1-based epoch numbers for the x axis, one per recorded epoch.
ep_nbr = np.arange(1, len(history.history['accuracy']) + 1)

# summarize history for loss
plt.plot(ep_nbr, history.history['loss'])
plt.plot(ep_nbr, history.history['val_loss'])
plt.title('Unadjusted Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# Training loss is continually reported over the course of an entire epoch.
# Validation metrics are computed over the validation set only once the current training epoch is completed.
# This implies, that on average, training losses are measured half an epoch earlier.

# plot the *shifted* training and validation loss
plt.plot(ep_nbr - 0.5, history.history['loss'], label="train_loss")
plt.plot(ep_nbr, history.history['val_loss'], label="val_loss")
plt.title("Shifted Loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()

# summarize history for accuracy
plt.plot(ep_nbr, history.history['accuracy'])
plt.plot(ep_nbr, history.history['val_accuracy'])
plt.title('Unadjusted Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# plot the *shifted* training and validation accuracy
plt.plot(ep_nbr - 0.5, history.history['accuracy'], label="train_accuracy")
plt.plot(ep_nbr, history.history['val_accuracy'], label="val_accuracy")
plt.title("Shifted Accuracy")
plt.xlabel("epoch")
# This figure shows accuracy; the axis was previously mislabelled "loss".
plt.ylabel("accuracy")
plt.legend()
plt.show()

# Submission

In [None]:
x_test = tokenize(test[['premise', 'hypothesis']].values.tolist(), tokenizer, max_len)
test_dataset  = build_dataset(x_test, None, "test", batch_size)

In [None]:
import numpy as np

# Per-class scores for every test row; the predicted label is the arg-max class.
predictions_prob = model.predict(test_dataset)
final = predictions_prob.argmax(axis=-1)

# Two-column submission frame: the test ids next to the int32 predictions.
submission = pd.DataFrame({
    'id': test['id'],
    'prediction': final.astype(np.int32),
})

In [None]:
submission.to_csv("/kaggle/working/submission.csv", index = False)