# Берт, усреднение по фолдам

In [None]:
from collections import Counter
import math
from pathlib import Path

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import sklearn
from sklearn import metrics, model_selection

from tqdm.auto import tqdm
tqdm.pandas()

import tensorflow as tf
print(f'TF version: {tf.__version__}')
print(f'Eager mode: {tf.executing_eagerly()}')
print(f'GPU: {"is available" if tf.config.experimental.list_physical_devices("GPU") else "IS NOT AVAILABLE"}')

import transformers
print(f'Hugging Face version: {transformers.__version__}')

## Прочитаю данные

In [None]:
# Load the training CSV of the commonlitreadabilityprize dataset;
# the trailing expression displays (n_rows, n_columns) in the notebook.
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df.shape

## Добавлю столбцы с бинами таргета и фолдами

In [None]:
def add_target_bin_and_fold_columns(df, train_size=2_500, n_splits=5, random_state=567):
    """
    Return a copy of ``df`` with ``target_bin`` and ``fold`` columns added.

    Adapted from
    https://www.kaggle.com/takiholadi/01-commonlit-linreg-4-features

    ``target`` is cut into equal-width bins (the number of bins is chosen by
    NumPy's ``doane`` estimator). Stratified by those bins, ``train_size``
    rows are kept for cross-validation and all remaining rows are labelled
    ``'holdout'``; the cross-validation rows are then spread over ``n_splits``
    stratified folds labelled ``'fold_0'`` ... ``'fold_{n_splits - 1}'``.

    Args:
        df: DataFrame with ``id`` and ``target`` columns and without
            ``target_bin`` / ``fold`` columns. It is not modified.
        train_size: number of rows (int) kept for cross-validation.
        n_splits: number of stratified cross-validation folds.
        random_state: seed used for both the holdout split and the K-fold
            shuffle.

    Returns:
        A new DataFrame indexed by the original index values (the index is
        named ``'index'``), sorted by that index.

    Raises:
        AssertionError: if a column to be added already exists, or if one of
            the consistency checks on the fold assignment fails.
    """

    def add_column_with_target_bin(df):
        assert 'target_bin' not in df

        # Only the bin *count* picked by the estimator is used; pd.cut then
        # builds that many equal-width bins over the target range.
        hist, _ = np.histogram(
            df['target'],
            bins='doane',  # `sqrt`, `doane`, `sturges`, `rice`, `scott`, `fd`, `auto`
        )
        num_bins = len(hist)

        binned = pd.cut(
            df['target'],
            bins=num_bins,
            labels=[f'target_bin_{x}' for x in range(num_bins)],
        )

        df['target_bin'] = binned.astype(str)

        return df

    def add_column_with_fold(df):
        assert 'fold' not in df
        assert 'target_bin' in df

        # -1 marks "not assigned yet". The column is created with object dtype
        # so the string labels written below do not force an int -> object
        # upcast on assignment.
        df['fold'] = pd.Series(-1, index=df.index, dtype=object)

        _, holdout_ids = sklearn.model_selection.train_test_split(
            df['id'],
            train_size=train_size,
            random_state=random_state,
            shuffle=True,
            stratify=df['target_bin'],
        )

        df.loc[df['id'].isin(holdout_ids.values), 'fold'] = 'holdout'
        assert (df['fold'] == -1).sum() == train_size

        # Keep the original index in an 'index' column so it can be restored
        # after the two parts are re-indexed positionally and concatenated.
        df = df.reset_index()

        is_holdout = df['fold'] == 'holdout'
        crossvalidation_df = df[~is_holdout].reset_index(drop=True)
        holdout_df = df[is_holdout].reset_index(drop=True)

        skf = sklearn.model_selection.StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=random_state,
        )

        # After reset_index(drop=True) the positional indices returned by
        # split() coincide with the labels of crossvalidation_df.
        for idx, (_, test_index) in enumerate(
            skf.split(X=crossvalidation_df,
                      y=crossvalidation_df['target_bin'])):
            crossvalidation_df.loc[test_index, 'fold'] = f'fold_{idx}'

        df = pd.concat([crossvalidation_df, holdout_df]).set_index('index').sort_index()

        # Compare the *values*: `-1 not in series` would only test the index
        # labels and therefore could never detect an unassigned row.
        assert not (df['fold'] == -1).any()
        for each in df['fold'].unique():
            current, rest = df[df['fold'] == each], df[df['fold'] != each]
            assert set(current.index).isdisjoint(rest.index)

        return df

    # Work on a copy so the caller's DataFrame is left untouched and the
    # function can be re-run on the same input.
    df = df.copy()
    df = add_column_with_target_bin(df)
    df = add_column_with_fold(df)
    return df

In [None]:
# Attach the target-bin and fold labels, then print how many rows each fold label received.
df = add_target_bin_and_fold_columns(df)
print(Counter(df['fold']))

In [None]:
# Display the first row to inspect the columns, including the two just added.
df.head(1)

## Выбираю сколько токенов взять

In [None]:
def prepare_texts_for_bert(texts, tokenizer, max_sequence_length):
    """
    Tokenize ``texts`` into fixed-length, padded/truncated model inputs.

    Args:
        texts: iterable of ``str``.
        tokenizer: object exposing ``batch_encode_plus`` (a Hugging Face
            tokenizer in this notebook).
        max_sequence_length: every sequence is truncated or padded to exactly
            this many tokens.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns when asked for
        TensorFlow tensors (``return_tensors='tf'``).

    Raises:
        AssertionError: if any element of ``texts`` is not a ``str``.
    """
    assert all(isinstance(x, str) for x in texts)
    # `padding='max_length'` already requests fixed-length padding, so the
    # deprecated `pad_to_max_length=True` flag is not passed any more.
    return tokenizer.batch_encode_plus(
        texts,
        max_length=max_sequence_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf',
    )


def plot_tokens_heatmap(list_of_texts, tokenizer, max_sequence_length):
    """
    Show the token-id matrix of the encoded texts as a heatmap.

    The texts are encoded with ``prepare_texts_for_bert`` and the resulting
    ``input_ids`` are drawn with ``pcolormesh``. Returns ``None``.
    """
    token_ids = prepare_texts_for_bert(
        texts=list_of_texts,
        tokenizer=tokenizer,
        max_sequence_length=max_sequence_length,
    )['input_ids']

    plt.figure(figsize=(12, 5))
    plt.pcolormesh(token_ids)
    plt.show()

In [None]:
# Visualise the token ids of every excerpt, padded/truncated to 512 tokens,
# to help choose max_sequence_length.
plot_tokens_heatmap(
    list_of_texts=df['excerpt'].values.tolist(),
    tokenizer=transformers.AutoTokenizer.from_pretrained('../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'),
    max_sequence_length=512,
)

## Параметры и оптимизатор

In [None]:
# Path of the pretrained BERT weights/tokenizer used throughout the notebook.
bert_model_name = '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'

# Number of tokens every excerpt is padded/truncated to.
max_sequence_length = 256

train_data_size = 4 * 500  # 4 folds * 500 samples; the remaining fold is used for validation
initial_learning_rate = 2e-5
epochs = 3  # learning-rate scheduling and early stopping are applied as well

train_batch_size = 16
# Fraction of all training steps spent on learning-rate warmup.
warmup_ratio = 0.1

In [None]:
# Derive the schedule lengths from the training-set size, batch size and epoch count.
num_steps_per_epoch = math.ceil(train_data_size / train_batch_size)
num_train_steps = num_steps_per_epoch * epochs
# NOTE(review): int * float gives a float that need not be a whole number of
# steps — confirm the optimizer helper is fine with a non-integer warmup length.
num_warmup_steps = num_train_steps * warmup_ratio
print('num_steps_per_epoch:', num_steps_per_epoch)
print('num_train_steps:', num_train_steps)
print('num_warmup_steps:', num_warmup_steps)

def get_optimizer(initial_learning_rate, num_train_steps, num_warmup_steps, is_return_schedule=False):
    """
    Create the optimizer (and its learning-rate schedule) via
    ``transformers.optimization_tf.create_optimizer``.

    Reference:
    https://huggingface.co/transformers/v4.4.2/_modules/transformers/optimization_tf.html#create_optimizer

    Returns the optimizer alone, or the ``(optimizer, lr_schedule)`` pair when
    ``is_return_schedule`` is truthy.
    """
    settings = dict(
        init_lr=initial_learning_rate,  # The desired learning rate at the end of the warmup phase.
        num_train_steps=num_train_steps,  # The total number of training steps.
        num_warmup_steps=num_warmup_steps,  # The number of warmup steps.
        min_lr_ratio=0.0,  # Final learning rate ratio at the end of the linear decay.
        adam_beta1=0.9,  # The beta1 to use in Adam.
        adam_beta2=0.999,  # The beta2 to use in Adam.
        adam_epsilon=1e-8,  # The epsilon to use in Adam.
        weight_decay_rate=0.01,  # The weight decay to use.
        power=1.0,  # The power to use for PolynomialDecay (1.0 is linear).
        include_in_weight_decay=None,  # None: leave the choice of decayed parameters to the library default.
    )
    optimizer, lr_schedule = transformers.optimization_tf.create_optimizer(**settings)
    return (optimizer, lr_schedule) if is_return_schedule else optimizer


# Preview the learning-rate schedule: evaluate it at every training step and
# plot the resulting curve. The optimizer built here is only a throwaway.
optimizer, lr_schedule = get_optimizer(initial_learning_rate, num_train_steps, num_warmup_steps, True)
plt.figure(figsize=(12, 5))
plt.plot([lr_schedule(n) for n in range(num_train_steps)])
plt.xlabel(f'training steps, each step is batch of: {train_batch_size}')
plt.ylabel('learning rate')
# Render the figure explicitly, like the other plotting cells do
# (an argument-less plt.plot() draws nothing).
plt.show()
del optimizer, lr_schedule

Лёрнинг-рейт будет линейно увеличиваться для первых 10% шагов, а потом линейно уменьшаться до нуля.

## Модель Берта

In [None]:
# Load the model configuration and print the weight-initializer range it declares.
config = transformers.AutoConfig.from_pretrained(bert_model_name)
print(config.initializer_range)

# `use_fast` is the keyword AutoTokenizer documents for requesting the fast
# tokenizer; the previous `fast=True` was not that option.
tokenizer = transformers.AutoTokenizer.from_pretrained(bert_model_name, use_fast=True)

In [None]:
def get_model(bert_model_name, max_sequence_length):
    """
    Build an (uncompiled) Keras regression model on top of a pretrained BERT.

    Args:
        bert_model_name: name or path passed to
            ``transformers.TFAutoModel.from_pretrained``.
        max_sequence_length: fixed token length of every input sequence.

    Returns:
        ``tf.keras.Model`` whose inputs are, in this order,
        ``[input_ids, token_type_ids, attention_mask]`` (int32, one row of
        ``max_sequence_length`` values per example) and whose output is a
        single linear unit per example.
    """
    in_input_ids = tf.keras.Input(shape=(max_sequence_length, ), name='input_ids', dtype=tf.int32)
    in_token_type_ids = tf.keras.Input(shape=(max_sequence_length, ), name='token_type_ids', dtype=tf.int32)
    in_attention_mask = tf.keras.Input(shape=(max_sequence_length, ), name='attention_mask', dtype=tf.int32)
    # Order of the Keras model inputs; callers feed their arrays in this order.
    bert_input = [in_input_ids, in_token_type_ids, in_attention_mask]

    bert = transformers.TFAutoModel.from_pretrained(bert_model_name)
    # Feed BERT by name. A positional list is interpreted in the order
    # (input_ids, attention_mask, token_type_ids), which differs from
    # `bert_input` and could silently swap the mask and the segment ids.
    output = bert({
        'input_ids': in_input_ids,
        'token_type_ids': in_token_type_ids,
        'attention_mask': in_attention_mask,
    })
    pooled_output = output.pooler_output
    drop = tf.keras.layers.Dropout(0.1)(pooled_output)
    scores = tf.keras.layers.Dense(
        units=1,  # regression task
        activation='linear',
        #kernel_initializer=transformers.modeling_tf_utils.get_initializer(0.02),
        #bias_initializer=tf.keras.initializers.Constant(-0.96),  # mean target
    )(drop)

    model = tf.keras.models.Model(inputs=bert_input, outputs=scores)
    return model

# Build one model instance at notebook level so its architecture can be inspected.
model = get_model(bert_model_name, max_sequence_length)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

## Скоринг на кросс-валидации, для подбора параметров вручную

In [None]:
def get_cv_iterator(df):
    """
    Build the cross-validation splits: each fold in turn is the test split.

    For every value of ``df['fold']`` other than ``'holdout'`` (in sorted
    order) a ``(train_indices, test_indices)`` pair of integer index-label
    arrays is produced; holdout rows never appear in either array.
    """
    fold_labels = df['fold']
    is_holdout = fold_labels.isin(['holdout']).to_numpy()

    splits = []
    for label in sorted(fold_labels.unique()):
        if label == 'holdout':
            continue
        is_test = fold_labels.isin([label]).to_numpy()
        is_train = ~(is_test | is_holdout)
        splits.append((
            df.index[is_train].values.astype(int),
            df.index[is_test].values.astype(int),
        ))
    return splits

# One (train_indices, validation_indices) pair per non-holdout fold.
cv_iterator = get_cv_iterator(df)

In [None]:
# Encode every excerpt once; the folds and the holdout set are later selected
# from these tensors by row index.
X = prepare_texts_for_bert(
    texts=df['excerpt'].values.tolist(),
    tokenizer=tokenizer,
    max_sequence_length=max_sequence_length,
)

# Regression targets, in the same row order as X.
y = df['target'].values

In [None]:
# Cross-validation training: for every fold a fresh model is built, trained on
# the other folds, validated on this fold, and its best weights are saved to
# checkpoints/checkpoint_<fold idx>. The per-epoch metrics of each fold are
# collected in folds_history.
Path('checkpoints').mkdir(parents=True, exist_ok=True)
folds_history = []
for idx, (train_index, val_index) in enumerate(cv_iterator):

    # Select the rows of this split; the list order matches the input order
    # of the model returned by get_model.
    X_train = [
        tf.gather(X['input_ids'], train_index),
        tf.gather(X['token_type_ids'], train_index),
        tf.gather(X['attention_mask'], train_index),
    ]
    y_train = y[train_index]
    
    X_validation = [
        tf.gather(X['input_ids'], val_index),
        tf.gather(X['token_type_ids'], val_index),
        tf.gather(X['attention_mask'], val_index),
    ]
    y_validation = y[val_index]    
    
    # A new model and a new optimizer (with its own schedule) for every fold.
    model = get_model(bert_model_name, max_sequence_length)
    model.compile(
        optimizer=get_optimizer(initial_learning_rate, num_train_steps, num_warmup_steps),
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_root_mean_squared_error',
        patience=2,  # number of epochs with no improvement
        mode='min',  # mode='auto'
        restore_best_weights=True,
        verbose=1,
    )

    # Keep only the weights of the epoch with the lowest validation RMSE.
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        f'checkpoints/checkpoint_{idx}',
        monitor='val_root_mean_squared_error',
        mode='min',
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )

    # NOTE(review): samples are weighted by (1 - standard_error), presumably to
    # down-weight noisier targets — confirm standard_error stays below 1 so
    # that no weight becomes negative.
    history = model.fit(
        x=X_train,
        y=y_train,
        validation_data=(X_validation, y_validation),
        batch_size=train_batch_size,
        validation_batch_size=train_batch_size,
        epochs=epochs,
        sample_weight=(1 - df['standard_error'].values[train_index]),
        callbacks=[early_stopping, model_checkpoint],
    )
    
    # Index (0-based) of the epoch with the lowest validation RMSE.
    best_epoch_idx = np.argmin(history.history['val_root_mean_squared_error'])
    print(f"Fold {idx}, best score: {history.history['val_root_mean_squared_error'][best_epoch_idx]}, at epoch: {best_epoch_idx}")
    folds_history.append(history.history)
    
    # Drop the references to this fold's objects before the next iteration.
    del X_train, y_train, X_validation, y_validation
    del model, early_stopping, model_checkpoint
    del history, best_epoch_idx

In [None]:
def plot_cv_perfomance(folds_history):
    """
    Plot train and validation RMSE per epoch, one panel per fold.

    Args:
        folds_history: sequence of Keras ``History.history`` dicts, each with
            the keys ``'root_mean_squared_error'`` and
            ``'val_root_mean_squared_error'``. Any number of folds is
            accepted; nothing is drawn for an empty sequence.
    """
    if not folds_history:
        return

    # squeeze=False keeps `axes` two-dimensional even for a single fold.
    _, axes = plt.subplots(1, len(folds_history), sharey=True, squeeze=False, figsize=(15, 5))
    for fold_idx, (ax, history) in enumerate(zip(axes[0], folds_history)):
        train_rmse = history['root_mean_squared_error']
        validation_rmse = history['val_root_mean_squared_error']
        # Epochs are numbered from 1 on the x axis.
        ax.plot(range(1, len(train_rmse) + 1), train_rmse, 'black', label='train')
        ax.plot(range(1, len(validation_rmse) + 1), validation_rmse, 'green', label='validation')
        ax.set_title(f'fold {fold_idx}')
        ax.set_xlabel('Epoch')
        ax.legend()
        if fold_idx == 0:
            # The y axis is shared, so only the first panel is labelled.
            ax.set_ylabel('rmse')
    plt.show()

# Show the training curves collected during cross-validation.
plot_cv_perfomance(folds_history)

## Посмотрю на холдауте, блендинг чекпоинтов с фолдов

In [None]:
# Rows labelled 'holdout' were never used for training or validation.
holdout_index = df[df['fold'] == 'holdout'].index.values

# Same input order as in training: input_ids, token_type_ids, attention_mask.
X_holdout = [
    tf.gather(X['input_ids'], holdout_index),
    tf.gather(X['token_type_ids'], holdout_index),
    tf.gather(X['attention_mask'], holdout_index),
]

y_holdout = y[holdout_index] 

In [None]:
# List the files in the checkpoints directory (notebook shell escape).
!ls checkpoints

In [None]:
# Score every fold checkpoint on the holdout set and keep the per-checkpoint
# predictions so they can be averaged (blended) afterwards.
predictions = []
for idx, checkpoint_filepath in enumerate([
    'checkpoints/checkpoint_0', 'checkpoints/checkpoint_1', 'checkpoints/checkpoint_2',
    'checkpoints/checkpoint_3', 'checkpoints/checkpoint_4']):

    model = get_model(bert_model_name, max_sequence_length)
    # expect_partial() silences warnings about checkpoint values that this
    # freshly built, uncompiled model does not use (presumably optimizer state).
    model.load_weights(checkpoint_filepath).expect_partial()

    y_pred = model.predict(X_holdout)
    y_pred = y_pred.flatten()

    # scikit-learn's argument order is (y_true, y_pred).
    score = sklearn.metrics.mean_squared_error(y_holdout, y_pred, squared=False)
    print(f'Holdout score, from checkpoint {idx}:', score)

    predictions.append(y_pred)
    del model

# all(...) is required: asserting the bare list is always true when non-empty.
assert all(len(x) == len(y_holdout) for x in predictions)
mean_prediction = np.mean(predictions, axis=0)

In [None]:
# Blended prediction (black) against the true holdout target (red), per holdout row.
plt.figure(figsize=(12, 5))
# Use the plotted array's own length rather than a variable left over from the
# prediction loop.
plt.plot(range(len(mean_prediction)), mean_prediction, color='black')
plt.plot(range(len(y_holdout)), y_holdout, color='red')
plt.show()

In [None]:
# RMSE of the blended (checkpoint-averaged) prediction on the holdout set.
mean_score = sklearn.metrics.mean_squared_error(y_holdout, mean_prediction, squared=False)
print('Holdout, avg score:', mean_score)

In [None]:
# Summary statistics of the absolute error of the blended holdout prediction.
holdout_error = np.abs(y_holdout - mean_prediction)
print('Holdout, mean_error:', np.mean(holdout_error))
print('Holdout, std_error:', np.std(holdout_error))
print('Holdout, min_error:', np.min(holdout_error))
print('Holdout, max_error:' ,np.max(holdout_error))

## Сабмит, блендинг чекпоинтов с фолдов

In [None]:
# Load the test CSV to predict on; the trailing expression displays its shape.
# NOTE(review): this cell uses an absolute /kaggle/input path while the other
# cells use ../input — confirm both resolve to the same directory.
submit_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
submit_df.shape

In [None]:
# Encode the test excerpts with the same tokenizer and sequence length as the training data.
X_submit = prepare_texts_for_bert(
    texts=submit_df['excerpt'].values.tolist(),
    tokenizer=tokenizer,
    max_sequence_length=max_sequence_length,
)

# Re-pack as a list in the model's input order: input_ids, token_type_ids, attention_mask.
X_submit = [X_submit['input_ids'], X_submit['token_type_ids'], X_submit['attention_mask']]

In [None]:
# Predict the test set with every fold checkpoint and average the predictions.
predictions = []
for idx, checkpoint_filepath in enumerate([
    'checkpoints/checkpoint_0', 'checkpoints/checkpoint_1', 'checkpoints/checkpoint_2',
    'checkpoints/checkpoint_3', 'checkpoints/checkpoint_4']):

    model = get_model(bert_model_name, max_sequence_length)
    # expect_partial() silences warnings about checkpoint values that this
    # freshly built, uncompiled model does not use (presumably optimizer state).
    model.load_weights(checkpoint_filepath).expect_partial()

    y_pred = model.predict(X_submit)
    y_pred = y_pred.flatten()

    predictions.append(y_pred)
    del model

# Compare against the number of test rows: X_submit is a list of three input
# tensors, so len(X_submit) is 3, and asserting a bare list is always true.
assert all(len(x) == len(submit_df) for x in predictions)
mean_prediction = np.mean(predictions, axis=0)

In [None]:
# The submitted prediction is the average over the fold checkpoints.
result = mean_prediction

In [None]:
# Assemble the submission table (test ids plus blended predictions) and write
# it to submission.csv without the index column.
submission_df = pd.DataFrame({'id': submit_df.id, 'target': result})
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table for a final visual check.
submission_df