In [None]:
import os
import sys
import gc
os.environ["WANDB_API_KEY"] = "0" ## to silence warning
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import tensorflow as tf
import tensorflow.keras.backend as K
print(f"Using Tensorflow version: {tf.__version__}")
from tensorflow.keras.utils import to_categorical 
import tensorflow_addons as tfa
tfa.register_all(custom_kernels=False) # otherwise TPU throws up error

!pip install -q transformers==3.1.0

import tokenizers
import transformers
print(f"Using Transformers version: {transformers.__version__}")


# Config

In [None]:
# Select the accelerator and the matching tf.distribute strategy.
# DEVICE is the *requested* device. It is downgraded to "GPU" (which also
# covers CPU-only machines) whenever the TPU cannot be found or initialised,
# so `strategy` is always defined by the end of this cell.
DEVICE = 'TPU'
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # Initialisation failed: report why and fall back to the default
            # strategy below instead of leaving `strategy` undefined.
            print("failed to initialize TPU")
            print(f"  reason: {err!r}")
            tpu = None
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"


if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


# AUTO lets tf.data pick prefetch sizes; REPLICAS scales the global batch size.
AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print('REPLICAS: {}'.format(REPLICAS))


In [None]:
# --- Reproducibility and data location -------------------------------------
# SEED is passed as random_state to the StratifiedKFold split further down.
SEED = 777
# Directory holding train.csv / test.csv of the competition.
DATA_PATH = "../input/contradictory-my-dear-watson/"


# --- Pretrained model --------------------------------------------------------
# Alternative checkpoint (disabled); the author notes that the uncased BERT model covers fewer languages and lacks Thai:
#MODEL = "bert-base-multilingual-cased"   # the bert uncased model is trained on fewer languages,does not include Thai
MODEL = 'jplu/tf-xlm-roberta-large' # another candidate noted by the author: microsoft/Multilingual-MiniLM-L12-H384
# Tokenizer matching MODEL; loaded once and shared by all preprocessing code.
TOKENIZER = transformers.AutoTokenizer.from_pretrained(MODEL)

# Local weights directory for the BERT alternative (disabled):
#BERT_MODEL_PATH = "../input/bert-tensorflow/"


# --- Training hyper-parameters -----------------------------------------------
MAX_LEN = 96 # length every encoded premise/hypothesis pair is padded/truncated to; multiple of 8
BATCH_SIZE = 16 * REPLICAS # global batch size: 16 examples per replica
EPOCHS= 10 # epochs per fold
FOLDS = 5 # num k-folds

NUM_CLASSES = 3 # width of the one-hot label vectors
OPT_TYPE = "Adam" # "Adam" or "RAdam"
LR = 1e-5 # initial learning rate given to the optimizer; the LearningRateScheduler callback sets the rate per epoch during fit


In [None]:
# Load the train and test tables from DATA_PATH; both are module-level
# frames that the preprocessing and data-loader functions read directly.
train = pd.read_csv(DATA_PATH+"train.csv")
test = pd.read_csv(DATA_PATH+"test.csv")

In [None]:
# Peek at the first rows of the training table.
train.head()

In [None]:
# Class balance of the target.
# The vector is passed as `x=`: seaborn 0.11 deprecated passing it
# positionally, and from 0.12 the first positional argument is `data`.
sns.countplot(x=train.label);

In [None]:
# Language distribution by target
# Explicit axes are used instead of the pyplot state machine, and the data
# vectors are passed by keyword (`x=`) because newer seaborn releases no
# longer accept them positionally.
f, (ax_all, ax_non_english) = plt.subplots(2, 1, figsize=(10, 10))
sns.countplot(x=train.language, hue=train.label, ax=ax_all)
ax_all.tick_params(axis='x', labelrotation=90)

# Non-english languages
non_english = train.language != 'English'
sns.countplot(
    x=train.loc[non_english, 'language'],
    hue=train.loc[non_english, 'label'],
    ax=ax_non_english,
)
ax_non_english.tick_params(axis='x', labelrotation=90);

- As expected, most texts in the corpus are in English. Therefore, to get a stable cross-validation score, we must stratify the CV folds by language as well.

In [None]:
# Combine the language and label columns to perform stratified K-fold later.
# Vectorised string concatenation gives the same "<lang_abv>-<label>" keys as
# a row-wise apply, without calling a Python function for every row.
train['lang_label'] = train['lang_abv'] + '-' + train['label'].astype(str)

In [None]:
# Distribution of space-separated word counts for premise and hypothesis.
# These are NOT token counts: the subword tokenizer splits text differently,
# especially for non-English languages.

def _space_separated_word_count(text):
    """Return the number of pieces obtained by splitting `text` on single spaces."""
    return len(text.split(' '))

pd.concat(
    [train[column].apply(_space_separated_word_count).describe() for column in ('premise', 'hypothesis')],
    axis=1,
)

In [None]:
# Sanity check: encode one premise/hypothesis pair (row 2) as a sentence pair,
# padded/truncated to 40 positions, and show the resulting encoding.
sample_premise = train.loc[2, 'premise']
sample_hypothesis = train.loc[2, 'hypothesis']
out = TOKENIZER(
    sample_premise,
    sample_hypothesis,
    add_special_tokens=True,
    max_length=40,
    padding='max_length',
    truncation=True,
    return_token_type_ids=True,
)
out

# Pre-processing

In [None]:
def preprocess_input_data(idx, tokenizer, label = False, dataset = 'train',max_len = MAX_LEN):
    """Encode premise/hypothesis pairs from the train or test frame.

    Each selected row is encoded as a sentence pair with special tokens and
    padded/truncated to `max_len` positions.

    Args:
        idx: index labels of the rows to encode (used with `.loc`).
        tokenizer: tokenizer used both for encoding and for mapping ids back
            to tokens. Callers in this notebook pass the global TOKENIZER.
        label: when True, also return one-hot labels for the selected rows.
        dataset: 'train' to read from the global `train` frame, 'test' to
            read from the global `test` frame.
        max_len: padded/truncated sequence length.

    Returns:
        dict with int32 arrays 'token_type_ids', 'input_ids' and
        'attention_mask', the list 'input_tokens' (token strings per row) and,
        when `label` is True, int32 one-hot 'labels' with NUM_CLASSES columns.

    Raises:
        ValueError: if `dataset` is neither 'train' nor 'test', or if labels
            are requested from a frame that has no 'label' column.
    """
    if dataset == 'train':
        frame = train
    elif dataset == 'test':
        frame = test
    else:
        raise ValueError(f"dataset must be 'train' or 'test', got {dataset!r}")

    # A list of [premise, hypothesis] pairs is encoded as a batch of sentence pairs.
    text_pairs = frame.loc[idx, ['premise', 'hypothesis']].values.tolist()
    enc = tokenizer(text_pairs, max_length=max_len, padding='max_length',
                    add_special_tokens=True, truncation=True, return_token_type_ids=True)

    input_tokens = [tokenizer.convert_ids_to_tokens(row_ids) for row_ids in enc.input_ids]

    output_dict = {'token_type_ids': np.array(enc.token_type_ids).astype('int32'),
                   'input_ids': np.array(enc.input_ids).astype('int32'),
                   'input_tokens': input_tokens,
                   'attention_mask': np.array(enc.attention_mask).astype('int32'),
                   }
    if label:
        # Labels come from the same frame as the text, so a label request on
        # an unlabelled frame fails loudly instead of borrowing train labels.
        if 'label' not in frame.columns:
            raise ValueError(f"label=True requested but the {dataset!r} frame has no 'label' column")
        output_dict['labels'] = to_categorical(frame.loc[idx, 'label'], num_classes = NUM_CLASSES).astype('int32')

    return output_dict

# Encode the full training set (with one-hot labels) and keep the arrays that
# the data loader reads as module-level globals.
processed_dict = preprocess_input_data(
    np.arange(len(train)), tokenizer=TOKENIZER, label=True, dataset='train', max_len=MAX_LEN
)
input_ids, attention_mask, token_type_ids, labels = (
    processed_dict[key] for key in ('input_ids', 'attention_mask', 'token_type_ids', 'labels')
)

# Encode the full test set (no labels).
processed_dict = preprocess_input_data(
    np.arange(len(test)), tokenizer=TOKENIZER, label=False, dataset='test', max_len=MAX_LEN
)
input_ids_test, attention_mask_test, token_type_ids_test = (
    processed_dict[key] for key in ('input_ids', 'attention_mask', 'token_type_ids')
)

# Data Loader

In [None]:
def data_loader(idx,data= 'train', labelled=True, return_ids=False, repeat=True, shuffle=True,cache=True, batch_size = BATCH_SIZE):
    """Build a batched tf.data pipeline over the pre-encoded arrays.

    Args:
        idx: positional row indices into the encoded training arrays.
            Ignored when `data == 'test'` (the whole test set is used).
        data: 'train' or 'test'.
        labelled: for training data, yield `(features, labels)`; otherwise
            yield `(features,)`.
        return_ids: for unlabelled training data, append the 'id' column of
            the selected rows as a fourth feature tensor.
        repeat: repeat the dataset indefinitely.
        shuffle: shuffle with a 2048-element buffer.
        cache: cache the elements before shuffling/repeating.
        batch_size: number of examples per batch.

    Returns:
        A batched, prefetched `tf.data.Dataset` whose features are the tuple
        `(input_ids, attention_mask, token_type_ids[, ids])`.

    Raises:
        ValueError: if `data` is neither 'train' nor 'test'.
    """
    if data == 'train':
        features = (input_ids[idx,], attention_mask[idx,], token_type_ids[idx,])
        if labelled:
            tensors = (features, labels[idx,])
        elif return_ids:
            # `idx` indexes the encoded arrays by position, so the ids are
            # looked up by position as well to stay aligned with them.
            tensors = (features + (train['id'].values[idx],),)
        else:
            tensors = (features,)
    elif data == 'test':
        tensors = ((input_ids_test, attention_mask_test, token_type_ids_test),)
    else:
        raise ValueError(f"data must be 'train' or 'test', got {data!r}")

    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    if cache:
        dataset = dataset.cache()

    if shuffle:
        # NOTE(review): the 2048 buffer is smaller than a training fold, so the
        # shuffle is approximate; confirm this is intended.
        dataset = dataset.shuffle(2048)
    if repeat:
        dataset = dataset.repeat()
    dataset = dataset.batch(batch_size).prefetch(AUTO)
    return dataset

# Smoke-test the data loader: pull a single (features, label) example from
# the first ten training rows and print both parts.
sample_dataset = data_loader(np.arange(10), batch_size=5).unbatch().take(1)
for sample_features, sample_label in sample_dataset:
    print(sample_features, sample_label)

# Model and LR

In [None]:
def get_lr_callback(PLOT_LR = False): # LR scheduler
    """Create the per-epoch learning-rate scheduler callback.

    The schedule ramps linearly from `lr_start` to `lr_max` over
    `lr_ramp_ep` epochs, holds `lr_max` for `lr_sus_ep` epochs (none with the
    current setting of 0), then decays exponentially towards `lr_min` by a
    factor of `lr_decay` per epoch.

    Args:
        PLOT_LR: when True, also plot the schedule for `range(EPOCHS)` on a
            linear and on a logarithmic y-axis.

    Returns:
        A `tf.keras.callbacks.LearningRateScheduler` wrapping the schedule.
    """
    lr_start   = 1e-5
    lr_max     = 1.5e-5
    lr_min     = 1e-5
    lr_ramp_ep = 3
    lr_sus_ep  = 0
    lr_decay   = 0.8

    def lrfn(epoch):
        """Learning rate for the given zero-based epoch."""
        if epoch < lr_ramp_ep:
            return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        if epoch < lr_ramp_ep + lr_sus_ep:
            return lr_max
        return (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min

    if PLOT_LR:
        schedule = [lrfn(epoch) for epoch in range(EPOCHS)]
        plt.figure(figsize=(15, 5))
        # Same curve twice: linear axis on the left, log axis on the right.
        for panel, (ylabel, use_log_axis) in enumerate([('LR', False), ('Log LR', True)], start=1):
            plt.subplot(1, 2, panel)
            plt.plot(schedule)
            plt.xlabel('Epoch')
            plt.ylabel(ylabel)
            if use_log_axis:
                plt.yscale('log')

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

# Preview the learning-rate schedule over EPOCHS epochs; the returned callback is discarded here.
get_lr_callback(PLOT_LR = True)

def display_training_curves(training, validation, title, subplot):
    """Plot a training and a validation curve in one subplot.

    Args:
        training: per-epoch values of the metric on the training data.
        validation: per-epoch values of the metric on the validation data.
        title: metric name, used for the subplot title and the y-axis label.
        subplot: three-digit matplotlib subplot code (e.g. 211). A code ending
            in 1 starts a new figure before drawing.
    """
    starts_new_figure = subplot % 10 == 1
    if starts_new_figure:
        # First panel of a group: create the figure the panels are drawn into.
        plt.subplots(figsize=(10,10), facecolor='#F0F0F0')
        plt.tight_layout()

    ax = plt.subplot(subplot)
    ax.set_facecolor('#F8F8F8')
    for curve in (training, validation):
        ax.plot(curve)
    ax.set_title('model '+ title)
    ax.set_ylabel(title)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'valid.'])

    
def build_model(opt = OPT_TYPE):
    """Build and compile the sentence-pair classifier.

    The pretrained transformer named by MODEL encodes the pair. The hidden
    state at position 0 is passed through dropout, a 128-unit ReLU layer and a
    NUM_CLASSES-way softmax.

    Args:
        opt: optimizer name, 'Adam' or 'RAdam'.

    Returns:
        A compiled `tf.keras.Model` taking `[input_ids, attention_mask,
        token_type_ids]` (each of length MAX_LEN) and returning class
        probabilities.

    Raises:
        ValueError: if `opt` is not a supported optimizer name.
    """
    # Validate before loading the (large) pretrained weights.
    if opt not in ('Adam', 'RAdam'):
        raise ValueError(f"opt must be 'Adam' or 'RAdam', got {opt!r}")

    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    transformer_embedding = transformers.TFAutoModel.from_pretrained(MODEL)
    # [0] is the first output of the transformer; [:,0,:] keeps position 0 of every sequence.
    x = transformer_embedding({'input_ids':ids,'attention_mask': att, 'token_type_ids':tok})[0][:,0,:]
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dense(NUM_CLASSES)(x)
    x = tf.keras.layers.Activation('softmax')(x)

    model = tf.keras.Model(inputs = [ids,att,tok], outputs = x)
    if opt == 'Adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
    else:  # 'RAdam'
        optimizer = tfa.optimizers.RectifiedAdam(learning_rate=LR)
    # The model already ends in a softmax, so the loss must treat its output
    # as probabilities; from_logits=True would apply softmax a second time.
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    model.compile(optimizer = optimizer, loss=loss_fn, metrics = ['accuracy'])

    return model

In [None]:
# Build a throwaway model instance just to print its layer summary.
build_model().summary()
# Optional: render the model graph instead.
#tf.keras.utils.plot_model(build_model()) # for plotting the graph

In [None]:
def memory():
    """Print the current process's memory use, scaled from bytes to GiB."""
    import psutil
    current_process = psutil.Process(os.getpid())
    # First field of memory_info(); presumably the resident set size in bytes (see psutil docs).
    used_bytes = current_process.memory_info()[0]
    print('memory use:', used_bytes/2.**30)

In [None]:
# Stratified K-fold training loop.
# For every fold: train a fresh model, restore the best-epoch weights, store
# out-of-fold (OOF) probabilities for the validation rows, and add this fold's
# share (1/FOLDS) to the averaged test probabilities.
display_training_curve=True
oof_preds = np.zeros(shape = (train.shape[0],NUM_CLASSES))    # filled in train row order
ypreds_test = np.zeros(shape = (test.shape[0],NUM_CLASSES))   # running mean over folds
val_ids = []                                                  # validation ids in fold-visit order

print(f'DEVICE: {DEVICE}')
skf = StratifiedKFold(n_splits=FOLDS,shuffle=True,random_state=SEED)
for fold,(idxT,idxV) in enumerate(skf.split(train,train.lang_label.values)):
    checkpoint_path = "fold{}.h5".format(fold+1)
    print('#'*25)
    print(f'### FOLD {fold+1}')
    print('#'*25)
    print(f"Training on {len(idxT)} examples with batch size = {BATCH_SIZE}, validate on {len(idxV)} examples")
    if DEVICE=='TPU' and tpu:
        # Re-initialising resets TPU memory between folds; without it memory leaks.
        tf.tpu.experimental.initialize_tpu_system(tpu)

    memory()
    train_dataset = data_loader(idxT,labelled=True,repeat=True, shuffle=True)
    valid_dataset = data_loader(idxV,labelled=True,repeat=False, shuffle=False, cache=False)

    K.clear_session()
    with strategy.scope():
        model = build_model()

    mod_checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor="val_accuracy",
                                                 verbose=1, save_best_only=True,
                                                 save_weights_only=True, mode='max', save_freq='epoch')

    # The learning rate is driven solely by the per-epoch scheduler callback.
    # `batch_size` is not passed to fit(): the datasets are already batched.
    history = model.fit(train_dataset, epochs=EPOCHS, verbose=1,
                        steps_per_epoch= len(idxT)//BATCH_SIZE, callbacks=[mod_checkpoint,get_lr_callback()],
                        validation_data=valid_dataset)
    print("-"*5 +" Loading model weights from best epoch "+"-"*5)
    try:
        model.load_weights(checkpoint_path)
        print('Done')
    except OSError:
        # Best effort: keep going with the weights of the last epoch.
        print('Unable to load model!')

    # Save oof preds from best epoch.
    # Step counts must be whole numbers; rounding up keeps the final partial batch.
    valid_dataset_unlabelled = data_loader(idxV,labelled=False,return_ids=False,repeat=False, shuffle=False, cache=False)
    valid_steps = int(np.ceil(len(idxV)/BATCH_SIZE))
    oof_preds[idxV,] = model.predict(valid_dataset_unlabelled, steps = valid_steps)
    oof_acc = accuracy_score(train.label.values[idxV], np.argmax(oof_preds[idxV,],axis=1))
    print(f' Out-of-fold accuracy score: {oof_acc}')

    # Record the ids of this fold's validation rows, in the same order as idxV.
    val_ids.extend(train['id'].values[idxV].tolist())

    # Predict on the test set. The loader ignores `idx` for data='test', but an
    # explicit index is passed rather than relying on IPython's `_` variable.
    test_dataset = data_loader(np.arange(test.shape[0]),data='test', labelled=False,repeat=False, shuffle=False, cache=False)
    test_steps = int(np.ceil(test.shape[0]/BATCH_SIZE))
    ypreds_test += model.predict(test_dataset, steps = test_steps)/FOLDS

    # The checkpoint is only needed to restore the best epoch; free the disk space.
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)

    if display_training_curve:
        display_training_curves(history.history['loss'], history.history['val_loss'], 'loss', 211)
        display_training_curves(history.history['accuracy'], history.history['val_accuracy'], 'Accuracy', 212)

    gc.collect()

# OOF predictions
- Save out-of-fold predictions to disk

In [None]:
# Overall cross-validated accuracy from the out-of-fold probabilities.
ypred_oof = np.argmax(oof_preds,axis=1)
print(f"{FOLDS}-fold CV accuracy score = {accuracy_score(train.label.values, ypred_oof)}")

# `oof_preds` is filled by train row position, so row i of `ypred_oof` belongs
# to row i of `train`. The ids are therefore taken from `train` in that same
# order; `val_ids` is in fold-visit order and would pair ids with the wrong
# predictions.
oof_df = pd.DataFrame({'id': train['id'].values, 'pred': ypred_oof})
oof_df.to_csv('oof.csv', index=False)
oof_df.head()

# Submission

In [None]:
# Turn the fold-averaged test probabilities into class labels.
ypred_sub = np.argmax(ypreds_test, axis=1) # Prediction labels

# Read the template from DATA_PATH (same directory as train/test) instead of
# repeating the hard-coded path.
# NOTE(review): assumes the template rows are in the same order as test.csv; confirm.
sub_df = pd.read_csv(DATA_PATH+"sample_submission.csv")
sub_df.loc[:,'prediction'] = ypred_sub
sub_df.to_csv('submission.csv',index=False)
sub_df.head()