In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read the competition's train and test CSV files into DataFrames (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/contradictory-my-dear-watson/train.csv",
        "../input/contradictory-my-dear-watson/test.csv",
    )
)

In [None]:
import os
import gc
import tensorflow as tf
import transformers

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Requested accelerator; later cells compare this value against "TPU" and "GPU".
ACCELERATOR = 'TPU'

# Number of stratified cross-validation folds; test predictions are averaged over this many fold models.
train_splits = 5
# Per-replica batch size: the training cell multiplies it by the number of replicas in sync.
batch_size = 32
# Number of epochs passed to each `model.fit` call (the training cell calls fit twice per fold).
epochs = 10
# Fixed sequence length: the tokenizer pads/truncates to it and the model's input layer has this width.
max_length = 80
# Pretrained checkpoint name, used to load both the tokenizer and the transformer model.
model_name = 'jplu/tf-xlm-roberta-large'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [None]:
# Select the tf.distribute strategy matching the accelerator requested in ACCELERATOR.
# After this cell `strategy` is always defined, and REPLICAS holds its replica count.
if ACCELERATOR == "TPU":
    print("Connecting to TPU")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print(f"Running on TPU {tpu.master()}")
        print("Initializing TPU")
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
        print("TPU initialized")
    except ValueError:
        print("Could not connect to TPU")
        # Fall back instead of leaving `strategy` undefined (which made the REPLICAS
        # line below fail with a NameError). ACCELERATOR is updated to reflect the
        # hardware actually in use, so later `ACCELERATOR == "TPU"` checks do not
        # touch the `tpu` resolver that may never have been created.
        ACCELERATOR = "GPU" if tf.config.experimental.list_physical_devices('GPU') else "CPU"
        print("Using default strategy for CPU and single GPU")
        strategy = tf.distribute.get_strategy()
else:
    # Default strategy for CPU and single GPU
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

# Report how many GPUs TensorFlow can see
if ACCELERATOR == "GPU":
    print(f"GPUs Available: {len(tf.config.experimental.list_physical_devices('GPU'))}")

# Number of replicas the strategy keeps in sync (used to scale the global batch size)
REPLICAS = strategy.num_replicas_in_sync
print(f"REPLICAS: {REPLICAS}")

In [None]:
def get_model():
    """
    Build and compile the classifier: a frozen pretrained transformer followed by
    a trainable BiLSTM + pooling head with a 3-class softmax output.

    Reads the module-level `max_length`, `model_name` and `tokenizer`.

    Returns:
        (model, transformer_model): the compiled Keras model, which takes a single
        `input_ids` tensor of shape (batch, max_length), and the underlying
        transformer so the caller can later toggle its `trainable` flag.
    """
    # Defining the encoded inputs
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")

    # Inputs are padded to `max_length`, so derive the attention mask from the ids
    # (1 for real tokens, 0 for padding). Without it the transformer attends to the
    # padding positions. Computing it here keeps `input_ids` as the only model input.
    pad_token_id = tokenizer.pad_token_id
    attention_mask = tf.keras.layers.Lambda(
        lambda ids: tf.cast(tf.not_equal(ids, pad_token_id), tf.int32),
        name="attention_mask",
    )(input_ids)

    # Loading pretrained transformer model; frozen so that only the head trains at first
    transformer_model = transformers.TFAutoModel.from_pretrained(model_name)
    transformer_model.trainable = False

    # Per-token embeddings produced by the transformer
    encoder_output = transformer_model(input_ids, attention_mask=attention_mask)
    sequence_output = encoder_output.last_hidden_state

    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    dropout = tf.keras.layers.Dropout(0.3)(concat)
    batch_norm = tf.keras.layers.BatchNormalization()(dropout)

    # Defining the classifier layer
    output = tf.keras.layers.Dense(3, activation="softmax")(batch_norm)

    model = tf.keras.models.Model(inputs=input_ids, outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )

    return model, transformer_model

In [None]:
def encode(df):
    """
    Tokenize the (premise, hypothesis) pairs of `df` with the module-level tokenizer.

    Every pair is padded or truncated to `max_length` tokens. The tokenizer's
    batch encoding is returned unchanged.
    """
    # One [premise, hypothesis] list per row, with every value cast to str.
    text_pairs = df[["premise", "hypothesis"]].values.astype("str").tolist()

    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation='longest_first',
        return_attention_mask=True,
        return_token_type_ids=True,
    )
    return tokenizer.batch_encode_plus(text_pairs, **tokenizer_options)

In [None]:
def to_tfds(X, y=None, labelled=True, repeat=False, shuffle=False, batch_size=128):
    """
    Wrap encoded inputs (and optionally labels) in a batched, prefetched tf.data.Dataset.

    Args:
        X: mapping with an "input_ids" entry (e.g. the output of `encode`).
        y: labels aligned with X["input_ids"]; ignored when `labelled` is False.
        labelled: if True the dataset yields (input_ids, label) pairs, otherwise
            it yields input_ids only.
        repeat: if True the dataset repeats indefinitely.
        shuffle: if True elements are shuffled with a buffer of 2048.
        batch_size: number of elements per batch.

    Returns:
        The resulting tf.data.Dataset.

    Raises:
        ValueError: if `labelled` is True but no labels were given.
    """
    # Train data
    if labelled:
        if y is None:
            raise ValueError("y must be provided when labelled=True")
        ds = tf.data.Dataset.from_tensor_slices((X["input_ids"], y))
    # Test data
    else:
        ds = tf.data.Dataset.from_tensor_slices(X["input_ids"])

    # Shuffle BEFORE repeating: every element of one pass is emitted before the
    # next pass starts. Repeating first would mix elements across epoch boundaries.
    if shuffle:
        ds = ds.shuffle(2048)
    if repeat:
        ds = ds.repeat()

    # Batch, then prefetch so input preparation overlaps with training
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

    return ds

In [None]:
# Composite language/label key so that each fold is stratified on both at once
train_df["language_label"] = train_df.language.astype(str) + "_" + train_df.label.astype(str)

# Stratified K-fold on language and label for balance
skf = StratifiedKFold(n_splits=train_splits, shuffle=True, random_state=2021)

# Out-of-fold class probabilities, fold-averaged test probabilities, per-fold accuracy
preds_oof = np.zeros((train_df.shape[0], 3))
test_predictions = np.zeros((test_df.shape[0], 3))
acc_oof = []

# The test set is identical for every fold, so it only needs to be encoded once
test_encoded = encode(df=test_df)

# Fine-tuning with K-fold
for (fold, (train_index, valid_index)) in enumerate(skf.split(train_df, train_df.language_label)):
    print(f"--- Fold {fold} ---")

    # Build the model inside the loop so that every fold starts from a fresh model
    tf.keras.backend.clear_session()
    if ACCELERATOR == "TPU":
        tf.tpu.experimental.initialize_tpu_system(tpu)
    with strategy.scope():
        model, transformer_model = get_model()
        if fold == 0:
            # summary() prints the table itself and returns None, so it is not wrapped in print()
            model.summary()

    train = train_df.iloc[train_index]
    valid = train_df.iloc[valid_index]

    train_labels = tf.keras.utils.to_categorical(train.label, num_classes=3)
    valid_labels = tf.keras.utils.to_categorical(valid.label, num_classes=3)

    # Encoding text data using tokenizer
    train_encoded = encode(df=train)
    valid_encoded = encode(df=valid)

    # Creating TF Datasets; the global batch is the per-replica batch times the replica count
    ds_train = to_tfds(train_encoded, train_labels, repeat=True, shuffle=True, batch_size=batch_size * REPLICAS)
    ds_valid = to_tfds(valid_encoded, valid_labels, batch_size=batch_size * REPLICAS * 4)

    train_size = train.shape[0]
    # Keras expects an integer number of steps; one step consumes one global batch.
    # The training dataset repeats, so at least one step is always available.
    steps_per_epoch = max(1, train_size // (batch_size * REPLICAS))

    # Stage 1: train only the head while the transformer is frozen
    model.fit(
        ds_train,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        validation_data=ds_valid,
        verbose=1
    )

    # Stage 2: unfreeze the transformer and fine-tune everything with a small learning rate.
    # The model must be recompiled for the `trainable` change to take effect; this is done
    # inside the strategy scope, like the initial compile in get_model().
    with strategy.scope():
        transformer_model.trainable = True
        model.compile(
            optimizer=tf.keras.optimizers.Adam(1e-5),
            loss="categorical_crossentropy",
            metrics=["accuracy"],
        )

    # Keep only the weights of the epoch with the best validation accuracy
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "model.h5",
        monitor="val_accuracy",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode="max",
        save_freq="epoch"
    )
    callbacks = [checkpoint]

    print("Training...")
    model_history = model.fit(
        ds_train,
        epochs=epochs,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        validation_data=ds_valid,
        verbose=1
    )
    print(model_history.history)

    print("Validating...")
    # Scoring validation data with the best checkpoint of this fold
    model.load_weights("model.h5")
    ds_valid = to_tfds(valid_encoded, -1, labelled=False, batch_size=batch_size * REPLICAS * 4)
    preds_valid = model.predict(ds_valid, verbose=1)
    acc = accuracy_score(valid.label, np.argmax(preds_valid, axis = 1))

    preds_oof[valid_index] = preds_valid
    acc_oof.append(acc)

    print("Testing...")
    # Scoring test data; each fold contributes 1/train_splits of the final average
    ds_test = to_tfds(test_encoded, -1, labelled=False, batch_size=batch_size * REPLICAS * 4)
    test_predictions += model.predict(ds_test, verbose=1) / train_splits

    print(f"Fold {fold} Accuracy: {round(acc, 4)}")

    # Run a garbage-collection pass before the next fold builds its model
    gc.collect()

# Overall CV score and standard deviation
print(f"CV Mean Accuracy: {round(np.mean(acc_oof), 4)}")
print(f"CV StdDev Accuracy: {round(np.std(acc_oof), 4)}")


In [None]:
# The predicted class for each test row is the argmax of its fold-averaged probabilities.
predicted_labels = np.argmax(test_predictions, axis=1)
submission_df = pd.DataFrame(
    {
        "id": test_df["id"].values,
        "prediction": predicted_labels,
    }
)
submission_df.to_csv("submission.csv", index=False)