# Settings

In [None]:
# CONTROLS
# Run identity: MODEL_NUMBER is the last two characters of MODEL_PREFIX.
MODEL_PREFIX = "V01"
MODEL_NUMBER = MODEL_PREFIX[-2:]
MODEL_NAME = 'roberta' # options include 'xlm' or 'distilbert' or 'roberta'

# Environment switch and data controls.
ON_KAGGLE = True
MAX_SEQ_LEN = 200
RUN_ON_SAMPLE = 0  # falsy -> use the full training set; otherwise the number of rows to sample

# NOTE(review): timing note carried over from the original cell:
# "for the current setup NUM_FOLDS * NUM_EPOCHS = 135 = 3 * 3 * 3 * 5 with per epoch per fold time = 1.33 seconds".
# NUM_EPOCHS is not defined in the visible notebook - confirm whether this note is still relevant.

# Batch sizes depend on where the notebook runs: (training batch, prediction batch).
BATCH_SIZE, PREDICT_BATCH_SIZE = (12, 128) if ON_KAGGLE else (16, 256)

In [None]:
# Resolve the result, data and pretrained-model directories for the current
# environment. Every directory string ends with a path separator so that
# later cells can append a file name directly.
import os  # needed by the local branch; this cell runs before the main imports cell

if ON_KAGGLE:
    RESULTS_DIR = '../working/'
    DATA_DIR = '../input/commonlitreadabilityprize/'
    if MODEL_NAME == 'roberta':
        MODEL_DIR = '../input/robertamodelobjects/'
    else:
        MODEL_DIR = '../input/tf-distilbert-base-multilingual-cased/'
else:
    PATH = ".."
    # The trailing "" makes os.path.join end each directory with a separator,
    # matching the Kaggle paths above.
    RESULTS_DIR = os.path.join(PATH, "results", "")
    DATA_DIR = os.path.join(PATH, "data", "")
    # Same model-name test as the Kaggle branch: 'roberta' selects the RoBERTa objects.
    if MODEL_NAME == 'roberta':
        MODEL_DIR = os.path.join(PATH, "models", "robertamodelobjects", "")
    else:
        MODEL_DIR = os.path.join(PATH, "models", "distilbert-base-multilingual-cased", "")

# Libraries

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import class_weight

import pickle, os, sys, re, json, gc
from time import time, ctime
from pprint import pprint
from collections import Counter

import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Conv1D, Conv2D, LSTM, Embedding, Dense, concatenate, MaxPooling2D, Softmax, Flatten
from tensorflow.keras.layers import BatchNormalization, Dropout, Reshape, Activation, Bidirectional, TimeDistributed
from tensorflow.keras.layers import RepeatVector, Multiply, Layer, LeakyReLU, Subtract
from tensorflow.keras.activations import softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.callbacks import *
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import save_model, load_model
from tensorflow.keras.utils import to_categorical

from tensorflow.data import Dataset

import tokenizers, transformers
from transformers import *

import tensorflow_addons as tfa
from tensorflow_addons.optimizers import TriangularCyclicalLearningRate


import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import os, sys, pickle
from time import time, ctime

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans

import xgboost as xgb

from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt import STATUS_OK
from hyperopt import Trials

from functools import partial

%matplotlib inline

In [None]:
# Scratch cell: displays one random integer in [0, 10_000_000); the value is not assigned.
np.random.randint(0, 10_000_000)

In [None]:
def get_seeds(seed=None):
    """Yield an endless stream of integer seeds in [0, 10_000_000).

    Parameters
    ----------
    seed : int or None, optional
        When None (the default) every seed is drawn from NumPy's global
        random state, so the stream follows any ``np.random.seed`` call.
        When given, seeds are drawn from a private ``RandomState(seed)``,
        which makes the stream reproducible and independent of the
        global state.

    Yields
    ------
    int
        The next seed value.
    """
    # No history list is kept: the generator is unbounded, so an
    # ever-growing list that nothing reads would only consume memory.
    draw = np.random.randint if seed is None else np.random.RandomState(seed).randint
    while True:
        yield draw(0, 10_000_000)

# Shared seed stream used throughout the notebook via next(get_seed).
get_seed = get_seeds()

In [None]:
# Draw one seed from the shared stream and use it for both NumPy and TensorFlow.
seeded_value = next(get_seed)
# The seed is drawn at random, so print it: without a record of the value
# the run could not be reproduced later.
print("Global random seed:", seeded_value)
pd.set_option('display.max_colwidth', None)
np.random.seed(seeded_value)
tf.random.set_seed(seeded_value)

In [None]:
def sigmoid(X):
    """Return the logistic function 1 / (1 + exp(-X)), applied element-wise by NumPy."""
    denominator = 1 + np.exp(-X)
    return 1 / denominator

In [None]:
# Print the current wall-clock time as a human-readable string.
print(ctime(time()))

In [None]:
# Print the installed versions of TensorFlow, transformers and tokenizers, in that order.
print([
    tf.__version__,
    transformers.__version__,
    tokenizers.__version__
])

<a href="https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth"  target="_blank"><h2 id="limiting_gpu_memory_growth" data-text="Limiting GPU memory growth" tabindex="0">Limiting GPU memory growth</h2></a>
<p>By default, TensorFlow maps nearly all of the GPU memory of all GPUs (subject to
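<a href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars"><code translate="no" dir="ltr">CUDA_VISIBLE_DEVICES</code></a>) visible to the process. This is done to more efficiently use the relatively precious GPU memory resources on the devices by reducing memory fragmentation. To limit TensorFlow to a specific set of GPUs we use the <code translate="no" dir="ltr">tf.config.experimental.set_visible_devices</code> method.</p>
<p>The cell below does not restrict the visible devices; it calls <code translate="no" dir="ltr">tf.config.experimental.set_memory_growth</code> on the first GPU, which asks TensorFlow to allocate GPU memory as needed instead of reserving it all up front.</p>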

In [None]:
# Print the logical and physical CPU/GPU devices that TensorFlow reports.
print(tf.config.experimental.list_logical_devices('CPU'))
print(tf.config.experimental.list_logical_devices('GPU'))
print(tf.config.experimental.list_physical_devices('CPU'))
print(tf.config.experimental.list_physical_devices('GPU'))

In [None]:
# Enable on-demand GPU memory allocation for the first GPU, if there is one.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except (ValueError, RuntimeError) as err:
        # Invalid device or cannot modify virtual devices once initialized.
        # Best effort: report the problem and carry on with the default allocation.
        print("Could not enable GPU memory growth:", err)

# Import Data

In [None]:
# Load the train and test CSV files from DATA_DIR.
# os.path.join works whether or not DATA_DIR ends with a path separator;
# plain string concatenation would silently build a wrong path without one.
data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
print(data.columns, test.columns)

In [None]:
# Optional quick run: keep only RUN_ON_SAMPLE randomly chosen rows and renumber the index from 0.
if RUN_ON_SAMPLE:
    data = data.sample(RUN_ON_SAMPLE).reset_index(drop=True)

In [None]:
# Overwrite the column labels of both frames with the expected names.
# NOTE(review): labels are assigned by position, not by name - this assumes the
# CSV column order matches these lists; confirm against the files.
data.columns = ['id', 'url_legal', 'license', 'excerpt', 'target', 'standard_error']
test.columns = ['id', 'url_legal', 'license', 'excerpt']

In [None]:
# The training column names without 'standard_error'.
# NOTE(review): not referenced anywhere else in the visible notebook.
REQ_COLS = ['id', 'url_legal', 'license', 'excerpt', 'target']

In [None]:
# Cast the excerpt column to str in both frames before it is tokenized.
data['excerpt'] = data["excerpt"].astype(str)
test['excerpt'] = test["excerpt"].astype(str)

In [None]:
# Display the (rows, columns) shape of the training frame.
data.shape

In [None]:
# Display two randomly chosen training rows.
data.sample(2)

In [None]:
# Print the summary statistics of the target column as a plain dict.
print(dict(data['target'].describe()))

In [None]:
# Histogram of the target values (100 bins); the return value of plt.hist is kept in target_histogram.
target_histogram = plt.hist(data['target'], bins=100)

In [None]:
# Histogram (100 bins) of excerpt lengths, measured as the number of pieces obtained by splitting on single spaces.
excerpt_histogram = plt.hist(data['excerpt'].apply(lambda x: len(x.split(" "))), bins=100)

# Tokenizer, Config & Model Initialization

1. https://arxiv.org/pdf/1911.02116.pdf
2. https://huggingface.co/transformers/model_doc/xlmroberta.html

In [None]:
# Load the tokenizer matching MODEL_NAME from MODEL_DIR.
# MODEL_DIR comes from the settings cell; it is no longer overridden here with
# a hard-coded Kaggle path, which silently ignored the configured location.
if MODEL_NAME == 'xlm':
    model_tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(MODEL_DIR)
elif MODEL_NAME == 'roberta':
    model_tokenizer = transformers.RobertaTokenizer.from_pretrained(MODEL_DIR)
else:
    model_tokenizer = transformers.DistilBertTokenizer.from_pretrained(MODEL_DIR)

In [None]:
# Read the special-token map stored next to the model files and register it with the tokenizer.
# os.path.join works whether or not MODEL_DIR ends with a path separator.
with open(os.path.join(MODEL_DIR, "special_tokens_map.json")) as f:
    special_tokens = json.load(f)
model_tokenizer.add_special_tokens(special_tokens)

In [None]:
# Record and print the tokenizer's vocabulary size.
VOCAB_SIZE = model_tokenizer.vocab_size
print(VOCAB_SIZE)

# Tokenization

In [None]:
def find_max_len():
    """Return the largest number of token ids among the training excerpts.

    Every excerpt in the global ``data`` frame is encoded with the global
    ``model_tokenizer`` without padding or truncation, and the longest
    ``input_ids`` length is returned.
    """
    token_counts = [
        len(model_tokenizer.encode_plus(text, padding='do_not_pad', truncation='do_not_truncate')['input_ids'])
        for text in data.excerpt.tolist()
    ]
    return max(token_counts)

# Replaces the MAX_SEQ_LEN value from the settings cell with the measured maximum.
MAX_SEQ_LEN = find_max_len()

In [None]:
def preprocess_data(data, MAX_SEQ_LEN):
    """Tokenize the excerpts of ``data`` and return model inputs plus targets.

    Each excerpt is encoded with the global ``model_tokenizer``, padded to
    ``MAX_SEQ_LEN`` and truncated to that length.

    Returns
    -------
    tuple of np.ndarray
        ``(X_tokens, X_att, Y)``: int32 token ids, int32 attention masks and
        the float32 values of the ``target`` column.
    """
    encodings = [
        model_tokenizer.encode_plus(text, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
        for text in data.excerpt.tolist()
    ]

    X_tokens = np.array([enc['input_ids'] for enc in encodings], dtype=np.int32)
    X_att = np.array([enc['attention_mask'] for enc in encodings], dtype=np.int32)
    Y = np.array(data['target'].tolist(), dtype=np.float32)

    return X_tokens, X_att, Y

In [None]:
# Tokenize the training excerpts and print how many rows each returned array holds.
X_tokens, X_att, Y = preprocess_data(data, MAX_SEQ_LEN)

print("\n \t Sample\n",
      len(X_tokens), "\t: X_tokens ", "\n",
      len(X_att), "\t: X_att ", "\n",
      len(Y), "\t: Y ", "\n"
)

In [None]:
# Per-row training weights: the sigmoid of each row's standard_error.
# A single random scalar of at most 1e-6 is added to every value before the sigmoid.
# NOTE(review): sigmoid is increasing, so rows with a larger standard_error get a
# larger weight - confirm this direction is intended.
sample_weights = data.standard_error.values
sample_weights = sigmoid(sample_weights + (1e-6 * np.random.rand()))

In [None]:
# Tokenize the test excerpts with the same padding/truncation settings as preprocess_data
# (the test frame has no target column, so only token ids and attention masks are built).
X_tokens_test, X_att_test = [], []
for t in test.excerpt.tolist():
    encoded_text = model_tokenizer.encode_plus(t, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
    X_tokens_test.append(encoded_text['input_ids'])
    X_att_test.append(encoded_text['attention_mask'])

# Convert the collected lists to int32 arrays.
X_tokens_test, X_att_test = np.array(X_tokens_test, dtype=np.int32), np.array(X_att_test, dtype=np.int32)
    
print("\n",
      len(X_tokens_test), "\t: X_tokens_test ", "\n",
      len(X_att_test), "\t: X_att_test ", "\n"
)

# Model Specifications

In [None]:
def build_model():
    """Wrap the pretrained transformer selected by MODEL_NAME in a Keras model.

    The model takes two int32 inputs of length MAX_SEQ_LEN, named "words"
    (token ids) and "att_flags" (attention mask), and returns the
    transformer's output unchanged. Config and weights are loaded from the
    global MODEL_DIR.

    Returns
    -------
    tensorflow.keras.Model
        Model with inputs ``[att_flags, words]`` and the transformer output.
    """
    # (MAX_SEQ_LEN,) is an explicit one-element shape tuple.
    input_sequences = Input((MAX_SEQ_LEN,), dtype=tf.int32, name="words")
    input_att_flags = Input((MAX_SEQ_LEN,), dtype=tf.int32, name="att_flags")

    # Pick the config/model classes once; loading is identical for all three.
    if MODEL_NAME == 'xlm':
        config_cls, model_cls = transformers.XLMRobertaConfig, transformers.TFXLMRobertaModel
    elif MODEL_NAME == 'roberta':
        config_cls, model_cls = transformers.RobertaConfig, transformers.TFRobertaModel
    else:
        config_cls, model_cls = transformers.DistilBertConfig, transformers.TFDistilBertModel

    config = config_cls.from_pretrained(MODEL_DIR)
    model = model_cls.from_pretrained(MODEL_DIR, config=config)
    # 'input_ids' is the documented argument name; 'inputs' is a deprecated alias.
    x = model({'input_ids': input_sequences, 'attention_mask': input_att_flags})

    model_ = Model([input_att_flags, input_sequences], x)

    return model_

In [None]:
# Build the transformer wrapper and run it over the train and test inputs.
# Inputs are passed by name, matching the Input layer names in build_model.
model = build_model()

X_ = model.predict(x = {"att_flags": X_att,
                        "words": X_tokens},
                   batch_size=PREDICT_BATCH_SIZE)

X_test_ = model.predict(x = {"att_flags": X_att_test,
                             "words": X_tokens_test},
                        batch_size=PREDICT_BATCH_SIZE)

In [None]:
# Display the keys of the prediction output and the shape of its 'last_hidden_state' entry.
X_.keys(), X_['last_hidden_state'].shape

In [None]:
# Build the tabular features by averaging 'last_hidden_state' over axis 1; y is the raw target column.
# NOTE(review): the mean is taken over every position of axis 1 and the attention mask is
# not applied, so padded positions (inputs are padded to MAX_SEQ_LEN) also contribute - confirm this is intended.
X, y, X_test = X_['last_hidden_state'].mean(axis=1), data.target.values, X_test_['last_hidden_state'].mean(axis=1)

In [None]:
# Display the shapes of the training features, targets and test features.
X.shape, y.shape, X_test.shape

In [None]:
# Settings for the cross-validated XGBoost stage.
# NOTE(review): seeded_value is reassigned here but is not read again in the visible
# notebook; the seeds used below come from next(get_seed).
seeded_value = 12345

NUM_FOLDS = 3  # number of cross-validation splits
NUM_EVALS = 20  # max_evals passed to hyperopt's fmin
NUM_TREES = 500  # n_estimators of every XGBRegressor
ENABLE_POSTPROCESSING = False  # when True, also evaluate/store rescaled predictions (apply_norm_, apply_minmax_)
USE_STRATIFY = True  # choose StratifiedKFold (on the KMeans labels) over plain KFold
NUM_CLUSTERS = 20  # number of KMeans clusters used as stratification labels

In [None]:
# Cluster the feature vectors; the cluster label of each row is used as the stratification column.
Kmeans_model = KMeans(n_clusters=NUM_CLUSTERS, random_state=next(get_seed)).fit(X)
stratify_by_column = Kmeans_model.labels_

In [None]:
def objective(params, X , y, sample_weights):
    """Hyperopt objective: mean validation RMSE of XGBoost over NUM_FOLDS folds.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters with keys 'max_depth', 'gamma',
        'colsample_bytree' and 'learning_rate'. 'max_depth' is cast to int.
    X, y : array-like
        Features and targets; indexed with the integer arrays from KFold.
    sample_weights : array-like
        Per-row weights passed to ``fit`` for the training part of each fold.

    Returns
    -------
    dict
        ``{"loss": mean validation RMSE, "params": cleaned params,
        "status": STATUS_OK}``.
    """
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': params['gamma'],
        'colsample_bytree': params['colsample_bytree'],
        'learning_rate': params['learning_rate']
    }

    kf_cv = KFold(n_splits=NUM_FOLDS,
                  shuffle=True,
                  random_state=next(get_seed))

    valid_losses = []

    for t_i, v_i in kf_cv.split(X=X, y=y):
        X_train, y_train, sample_weights_train = X[t_i], y[t_i], sample_weights[t_i]
        X_valid, y_valid = X[v_i], y[v_i]

        model = xgb.XGBRegressor(
            n_estimators=NUM_TREES,
            verbosity=1,
            objective="reg:squarederror",
            random_state=next(get_seed),
            n_jobs=2,
    #         tree_method= 'gpu_hist',
            **params
        )

        model.fit(X=X_train, y=y_train, sample_weight=sample_weights_train)

        # Only the validation score feeds the loss, so the training set is not scored.
        pred_valid = model.predict(X_valid)

        # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in newer
        # scikit-learn, and this matches how evaluate() computes it.
        valid_losses.append(np.sqrt(mean_squared_error(y_valid, pred_valid)))

    mean_valid_loss = np.mean(valid_losses)
    return {"loss": mean_valid_loss, "params": params, "status": STATUS_OK}

In [None]:
def hpt(X, y, sample_weights):
    """Search XGBoost hyperparameters with hyperopt's TPE and return the best set.

    Runs ``fmin`` for NUM_EVALS evaluations of ``objective`` on the given
    data, prints the raw result and the cleaned parameters, and returns a
    dict with 'max_depth' (as int), 'gamma', 'colsample_bytree' and
    'learning_rate'.
    """
    trial_log = Trials()

    search_space = {
        'max_depth': hp.quniform('max_depth', 2, 8, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
        'gamma': hp.uniform('gamma', 0.00001, 0.5),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.5))
    }

    # Bind the data so that fmin only has to supply the sampled params.
    bound_objective = partial(objective, X=X, y=y, sample_weights=sample_weights)

    best = fmin(fn=bound_objective,
                space=search_space,
                algo=tpe.suggest,
                max_evals=NUM_EVALS,
                trials=trial_log)

    print('\tbest: ', best)

    tuned_names = ('max_depth', 'gamma', 'colsample_bytree', 'learning_rate')
    optimal_params = {name: best[name] for name in tuned_names}
    # max_depth is cast to int before it is handed to XGBoost.
    optimal_params['max_depth'] = int(optimal_params['max_depth'])

    print('\toptimal_params', optimal_params)

    return optimal_params

In [None]:
def retrain_after_hpt(optimal_params, X, y, sample_weights):
    """Fit and return an XGBRegressor that uses the tuned hyperparameters.

    The regressor is built from the notebook defaults (NUM_TREES trees,
    squared-error objective, a fresh seed from ``get_seed``, 12 jobs) with
    ``optimal_params`` applied on top, then fitted on ``X``/``y`` with
    ``sample_weights``.
    """
    regressor_kwargs = dict(
        n_estimators=NUM_TREES,
        verbosity=1,
        objective="reg:squarederror",
        random_state=next(get_seed),
        n_jobs=12,
    )
    # Tuned values are applied last, so they win over a default with the same key.
    regressor_kwargs.update(optimal_params)

    tuned_model = xgb.XGBRegressor(**regressor_kwargs)
    tuned_model.fit(X=X, y=y, sample_weight=sample_weights)
    return tuned_model

In [None]:
def evaluate(y_train, pred_train, y_valid, pred_valid, fold_num, prefix):
    """Save and report train/validation predictions for one fold.

    Writes ``{prefix}-train-preds-{fold_num}.csv`` and
    ``{prefix}-valid-preds-{fold_num}.csv`` (columns 'target' and 'preds')
    to the current directory, then prints the RMSE and ``describe()``
    statistics for each split.

    Parameters
    ----------
    y_train, pred_train : array-like
        Training targets and predictions.
    y_valid, pred_valid : array-like
        Validation targets and predictions.
    fold_num : int
        Fold index used in the file names.
    prefix : str
        File-name prefix identifying the prediction variant.
    """
    trains = pd.DataFrame({
        'target': y_train,
        'preds': pred_train
    })

    valids = pd.DataFrame({
        'target': y_valid,
        'preds': pred_valid
    })

    trains.to_csv(f'{prefix}-train-preds-{fold_num}.csv', index=False)
    valids.to_csv(f'{prefix}-valid-preds-{fold_num}.csv', index=False)

    # Each "distribution stats" line reports the frame named in its label
    # (the two frames were previously printed under each other's label).
    print("Training RMSE: ", "\n\t\t\t", np.sqrt(mean_squared_error(trains['target'], trains['preds'])))
    print("Training distribution stats: ", "\n\t\t\t", trains.describe())

    print("Validation RMSE: ", "\n\t\t\t", np.sqrt(mean_squared_error(valids['target'], valids['preds'])))
    print("Validation distribution stats: ", "\n\t\t\t", valids.describe())

In [None]:
def apply_minmax_(y_ref, pred_y):
    """Rescale predictions linearly so their range matches that of ``y_ref``.

    Parameters
    ----------
    y_ref : array-like with ``min``/``max`` methods
        Reference values whose minimum and maximum define the target range.
    pred_y : array-like with ``min``/``max`` methods
        Predictions to rescale.

    Returns
    -------
    Same kind of array as ``pred_y``
        Predictions mapped so that their minimum and maximum coincide with
        those of ``y_ref``. If all predictions are equal (including a single
        prediction) the position inside the range is undefined, so every
        value is set to the midpoint of the reference range instead of NaN.
    """
    min_ = y_ref.min()
    max_ = y_ref.max()

    pred_min_ = pred_y.min()
    pred_span_ = pred_y.max() - pred_min_

    if pred_span_ == 0:
        # (pred_y - pred_min_) is all zeros here; adding the midpoint keeps
        # the container type of pred_y and avoids the 0/0 division.
        return (pred_y - pred_min_) + (min_ + max_) / 2

    pred_norm = (pred_y - pred_min_) / pred_span_
    y_pred_adj = (pred_norm * (max_ - min_)) + min_
    return y_pred_adj

In [None]:
def apply_norm_(y_ref, pred_y):
    """Shift and scale predictions to the mean and std of ``y_ref``.

    Parameters
    ----------
    y_ref : array-like with ``mean``/``std`` methods
        Reference values providing the target mean and standard deviation.
    pred_y : array-like with ``mean``/``std`` methods
        Predictions to standardise and rescale.

    Returns
    -------
    Same kind of array as ``pred_y``
        Predictions standardised with their own mean/std and then mapped to
        the reference mean/std. If the predictions have zero standard
        deviation (all equal, or a single prediction) every value is set to
        the reference mean instead of NaN.
    """
    mean_ = y_ref.mean()
    std_ = y_ref.std()

    pred_mean_ = pred_y.mean()
    pred_std_ = pred_y.std()

    if pred_std_ == 0:
        # Zero spread means every deviation from the mean is exactly zero;
        # adding the reference mean avoids the 0/0 division.
        return (pred_y - pred_mean_) + mean_

    pred_norm = (pred_y - pred_mean_) / pred_std_
    y_pred_adj = pred_norm * std_ + mean_
    return y_pred_adj

In [None]:
# Cross-validated tuning, evaluation and test prediction.
# For every fold: tune hyperparameters on the training part, fit an evaluation
# model on it and report train/validation RMSE, then refit with the same
# hyperparameters on ALL rows and predict the test set.
kf_cv = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=next(get_seed))
stratified_kf_cv = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=next(get_seed))

kfold_strategy = stratified_kf_cv if USE_STRATIFY else kf_cv

# Per-fold test predictions, keyed "F_<fold_num>". Created before the loop so
# they always exist, instead of being set up inside the first iteration.
predictions = {}
predictions_norm = {}
predictions_minmax = {}

for fold_num, (t_i, v_i) in enumerate(kfold_strategy.split(X=X, y=stratify_by_column)):
    print("="*15, f" START: {str(fold_num)} ", "="*15)    
    
    X_train, y_train, sample_weights_train = X[t_i], y[t_i], sample_weights[t_i]
    X_valid, y_valid, sample_weights_valid = X[v_i], y[v_i], sample_weights[v_i]
    
    optimal_params = hpt(X_train, y_train, sample_weights_train)
    eval_model = retrain_after_hpt(optimal_params, X_train, y_train, sample_weights_train)

    eval_model.save_model(f"eval_model_xgb_{fold_num}.txt") # loaded_model = xgb.XGBRegressor(); loaded_model.load_model("path-to-file")
    
    pred_train = eval_model.predict(X_train)
    pred_valid = eval_model.predict(X_valid)
    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in newer scikit-learn.
    print("\tTrain RMSE:", np.sqrt(mean_squared_error(y_train, pred_train)))
    print("\tValid RMSE:", np.sqrt(mean_squared_error(y_valid, pred_valid)))
    
    evaluate(y_train, pred_train, y_valid, pred_valid, fold_num, 'original')
    
    if ENABLE_POSTPROCESSING:
        print("="*25, "After Normalizing Predictions on Training and Validation Datasets", "="*25)
        evaluate(y_train, apply_norm_(y_train, pred_train), y_valid, apply_norm_(y_train, pred_valid), fold_num, 'norm')
        
        print("="*25, "After MinMaxScaling Predictions on Training and Validation Datasets", "="*25)
        evaluate(y_train, apply_minmax_(y_train, pred_train), y_valid, apply_minmax_(y_train, pred_valid), fold_num, 'minmax')
    
    # Refit on the full data with this fold's hyperparameters.
    # Note: this rebinds the name `model`, which earlier held the transformer wrapper.
    model = retrain_after_hpt(optimal_params, X, y, sample_weights)
    model.save_model(f"model_xgb_{fold_num}.txt") # loaded_model = xgb.XGBRegressor(); loaded_model.load_model("path-to-file")
    pred_test = model.predict(X_test)
    
    predictions["F_"+str(fold_num)] = pred_test
    
    if ENABLE_POSTPROCESSING:
        predictions_norm["F_"+str(fold_num)] = apply_norm_(y, pred_test)
        predictions_minmax["F_"+str(fold_num)] = apply_minmax_(y, pred_test)
    
    print("="*15, " COMPLETE ", "="*15)

In [None]:
# Choose which set of per-fold test predictions goes into the submission.
final_preds_to_submit = predictions # predictions, predictions_norm, predictions_minmax

In [None]:
# Arrange the per-fold predictions as columns F_0 .. F_{NUM_FOLDS-1} and take the values as an array.
test_results = pd.DataFrame({k:v.flatten() for k,v in final_preds_to_submit.items()},
                            columns=["F_"+str(num) for num in range(NUM_FOLDS)]).values

# Average across folds to get one prediction per test row.
test_results = test_results.mean(axis=1)

# Attach the averaged prediction to the test frame and write the submission file.
test['target'] = test_results
test[['id', 'target']].to_csv('submission.csv', index=False)

test[['id', 'target']].head()

In [None]:
# Display the next value from the shared seed stream (this advances the stream by one).
next(get_seed)

In [None]:
# Print the current wall-clock time as a human-readable string.
print(ctime(time()))