In [None]:
import codecs
import copy
import csv
import gc
from itertools import chain
import os
import pickle
import random
from typing import Dict, List, Tuple, Union
import warnings

In [None]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import tensorflow_addons as tfa
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

In [None]:
# Log the TensorFlow version this notebook run is using.
print(tf.__version__)

In [None]:
class MaskCalculator(tf.keras.layers.Layer):
    """Expand a 2-D attention mask into a 3-D float mask.

    The layer receives a tensor of shape ``(batch, seq_len)``, maps every
    strictly positive element to ``1.0`` and everything else to ``0.0``, and
    repeats that indicator ``output_dim`` times along a new last axis. The
    result has shape ``(batch, seq_len, output_dim)`` and can be multiplied
    element-wise with a sequence of ``output_dim``-sized vectors.

    Args:
        output_dim: size of the last axis of the produced mask.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, output_dim, **kwargs):
        # Initialise the Keras base class before assigning attributes so that
        # attribute tracking is already set up.
        super(MaskCalculator, self).__init__(**kwargs)
        self.output_dim = output_dim

    def build(self, input_shape):
        # The layer has no weights; only mark it as built.
        super(MaskCalculator, self).build(input_shape)

    def call(self, inputs, **kwargs):
        # 1.0 where the input is positive, 0.0 elsewhere: (batch, seq_len).
        positive = tf.keras.backend.cast(
            x=tf.keras.backend.greater(x=inputs, y=0),
            dtype='float32'
        )
        # `repeat` turns (batch, seq_len) into (batch, output_dim, seq_len) ...
        repeated = tf.keras.backend.repeat(x=positive, n=self.output_dim)
        # ... and the permutation moves the repeated axis to the end.
        return tf.keras.backend.permute_dimensions(x=repeated, pattern=(0, 2, 1))

    def compute_output_shape(self, input_shape):
        shape = list(input_shape)
        # `call` only accepts (batch, seq_len) tensors, so the full input shape
        # (batch axis included) has exactly two entries.
        assert len(shape) == 2
        shape.append(self.output_dim)
        return tuple(shape)

    def get_config(self):
        # Expose the constructor argument so the layer can be serialized and
        # re-created together with a model that contains it.
        config = super(MaskCalculator, self).get_config()
        config['output_dim'] = self.output_dim
        return config

In [None]:
def generate_attention_mask(token_indices: np.ndarray, padding_id: int) -> np.ndarray:
    """Build an attention mask that is 1 up to the first padding token of each row.

    Args:
        token_indices: 2-D integer array of token IDs, one row per sample.
        padding_id: token ID that marks padding.

    Returns:
        An ``int32`` array of the same shape as ``token_indices``. In every row
        the positions before the first occurrence of ``padding_id`` are 1; that
        position and everything after it are 0, even if non-padding IDs follow.
    """
    is_padding = (np.asarray(token_indices) == padding_id)
    # The running count of padding tokens stays at zero exactly on the prefix
    # that precedes the first padding token of the row.
    padding_seen = np.cumsum(is_padding, axis=1) > 0
    return np.logical_not(padding_seen).astype(np.int32)

In [None]:
def calc_text_features(texts: List[List[str]], tok: AutoTokenizer) -> np.ndarray:
    """Compute nine hand-crafted statistics for every text.

    Args:
        texts: one item per text; each item is the list of its sentences.
        tok: tokenizer whose ``tokenize`` method splits a word into sub-tokens.

    Returns:
        A ``float32`` array of shape ``(len(texts), 9)`` with the columns:

        0. number of sentences;
        1. NLTK word tokens (punctuation included) per sentence;
        2. purely alphabetic words per sentence;
        3. character length of the sentences joined by single spaces;
        4. number of purely alphabetic words;
        5. mean character length of the alphabetic words;
        6. total number of tokenizer sub-tokens of the alphabetic words;
        7. mean length of those sub-token strings as returned by ``tok.tokenize``;
        8. sub-tokens per alphabetic word.

        A ratio whose denominator is zero (no sentences, no alphabetic words or
        no sub-tokens) is reported as ``0.0`` instead of NaN/inf, so degenerate
        texts cannot poison the scaler and the network downstream.
    """
    f = np.zeros((len(texts), 9), dtype=np.float32)
    for idx, sentences in enumerate(texts):
        f[idx, 0] = len(sentences)
        words = []
        pure_words = []
        for cur_sent in sentences:
            words_in_sentence = nltk.word_tokenize(cur_sent)
            words += words_in_sentence
            pure_words += [it for it in words_in_sentence if it.isalpha()]
        if f[idx, 0] > 0:
            f[idx, 1] = len(words) / f[idx, 0]
            f[idx, 2] = len(pure_words) / f[idx, 0]
        f[idx, 3] = len(' '.join(sentences))
        f[idx, 4] = len(pure_words)
        if len(pure_words) > 0:
            f[idx, 5] = np.mean([len(w) for w in pure_words])
        for w in pure_words:
            syllables = tok.tokenize(w)
            f[idx, 6] += len(syllables)
            f[idx, 7] += sum(len(it) for it in syllables)
        if f[idx, 6] > 0:
            f[idx, 7] /= f[idx, 6]
        else:
            f[idx, 7] = 0.0
        if f[idx, 4] > 0:
            f[idx, 8] = f[idx, 6] / f[idx, 4]
    return f

In [None]:
def load_data_for_training(
    fname: str,
    tok: AutoTokenizer
) -> Dict[str, Tuple[List[str], float, float, np.ndarray]]:
    """Load labelled excerpts from a CSV file.

    The first non-empty row is the header; it must contain the columns
    ``excerpt``, ``id``, ``target`` and ``standard_error`` (in any position).
    Every following non-empty row describes one sample.

    Args:
        fname: path to the UTF-8 encoded CSV file.
        tok: tokenizer providing ``cls_token``, ``sep_token`` and ``tokenize``.

    Returns:
        A dictionary that maps a sample ID to the tuple
        ``(tokens, target, standard_error, text_features)``. ``tokens`` is the
        lower-cased text tokenized by ``tok`` and wrapped into the CLS/SEP
        tokens, and ``text_features`` is the ``(1, 9)`` matrix produced by
        ``calc_text_features`` for the original-case sentences.

    Raises:
        ValueError: if a required column is missing, a row is too short, a
            sample ID has surrounding whitespace or is duplicated, a text is
            empty or duplicated, or a target is not a number.

    Samples whose standard error is missing, non-numeric or not positive are
    skipped with a warning instead of aborting the whole load.
    """
    required_fields = ('excerpt', 'id', 'target', 'standard_error')
    column_of = dict()
    data = dict()
    set_of_texts = set()
    with codecs.open(fname, mode='r', encoding='utf-8') as fp:
        data_reader = csv.reader(fp, quotechar='"', delimiter=',')
        # `line_idx` counts CSV records (empty ones included), starting from 1.
        for line_idx, row in enumerate(data_reader, start=1):
            if len(row) == 0:
                continue
            err_msg = f'File {fname}: line {line_idx} is wrong!'
            if len(column_of) == 0:
                # The first non-empty record is the header. Any position,
                # including the very first column, is valid for any field.
                for field in required_fields:
                    if field not in row:
                        raise ValueError(err_msg + f' Field "{field}" is not found!')
                    column_of[field] = row.index(field)
                continue
            if len(row) <= max(column_of.values()):
                raise ValueError(err_msg + ' The row has too few fields!')
            sample_id = row[column_of['id']]
            if sample_id != sample_id.strip():
                raise ValueError(err_msg + f' {sample_id} is wrong sample ID!')
            if sample_id in data:
                err_msg2 = f'{err_msg} {sample_id} is not unique sample ID!'
                raise ValueError(err_msg2)
            text = row[column_of['excerpt']].replace('\r', '\n')
            if len(text) == 0:
                raise ValueError(err_msg + f' Text {sample_id} is empty!')
            # Sentences are detected paragraph by paragraph.
            sentences = []
            for paragraph in map(lambda it: it.strip(), text.split('\n')):
                if len(paragraph) > 0:
                    sentences += nltk.sent_tokenize(paragraph)
            if len(sentences) == 0:
                raise ValueError(err_msg + f' Text {sample_id} is empty!')
            text = ' '.join([cur_sent.lower() for cur_sent in sentences])
            if text in set_of_texts:
                raise ValueError(err_msg + f' Text {sample_id} is not unique!')
            set_of_texts.add(text)
            raw_target = row[column_of['target']]
            try:
                target_val = float(raw_target)
            except ValueError:
                err_msg2 = err_msg
                err_msg2 += f' {raw_target} is wrong target for ' \
                            f'text {sample_id}.'
                raise ValueError(err_msg2) from None
            raw_std = row[column_of['standard_error']]
            try:
                std_val = float(raw_std)
            except ValueError:
                std_val = None
            # `not (x > 0.0)` also rejects NaN.
            if (std_val is None) or (not (std_val > 0.0)):
                err_msg2 = err_msg
                err_msg2 += f' {raw_std} is wrong standard error' \
                            f' for text {sample_id}.'
                warnings.warn(err_msg2)
                continue
            data[sample_id] = (
                [tok.cls_token] + tok.tokenize(text) + [tok.sep_token],
                target_val, std_val,
                calc_text_features([sentences], tok)
            )
    return data

In [None]:
def load_data_for_testing(fname: str, tok: AutoTokenizer, batch_size: int):
    """Lazily load unlabelled excerpts from a CSV file in batches.

    The first non-empty row is the header; it must contain the columns
    ``excerpt`` and ``id`` (in any position). This is a generator, so the file
    is opened and validated only when iteration starts.

    Args:
        fname: path to the UTF-8 encoded CSV file.
        tok: tokenizer providing ``cls_token``, ``sep_token`` and ``tokenize``.
        batch_size: maximal number of samples in one yielded dictionary.

    Yields:
        Dictionaries that map a sample ID to ``(tokens, text_features)``, where
        ``tokens`` is the lower-cased, whitespace-normalized text tokenized by
        ``tok`` and wrapped into the CLS/SEP tokens, and ``text_features`` is
        the ``(1, 9)`` matrix produced by ``calc_text_features``.

    Raises:
        ValueError: if a required column is missing, a row is too short, a
            sample ID has surrounding whitespace or repeats inside the current
            batch, or a text is empty.

    NOTE(review): sentences are detected here on the whole normalized text,
    whereas ``load_data_for_training`` detects them paragraph by paragraph, so
    sentence-based features may differ slightly between the two loaders.
    """
    column_of = dict()
    data = dict()
    with codecs.open(fname, mode='r', encoding='utf-8') as fp:
        data_reader = csv.reader(fp, quotechar='"', delimiter=',')
        # `line_idx` counts CSV records (empty ones included), starting from 1.
        for line_idx, row in enumerate(data_reader, start=1):
            if len(row) == 0:
                continue
            err_msg = f'File {fname}: line {line_idx} is wrong!'
            if len(column_of) == 0:
                # The first non-empty record is the header. Any position,
                # including the very first column, is valid for any field.
                for field in ('excerpt', 'id'):
                    if field not in row:
                        raise ValueError(err_msg + f' Field "{field}" is not found!')
                    column_of[field] = row.index(field)
                continue
            if len(row) <= max(column_of.values()):
                raise ValueError(err_msg + ' The row has too few fields!')
            sample_id = row[column_of['id']]
            if sample_id != sample_id.strip():
                raise ValueError(err_msg + f' {sample_id} is wrong sample ID!')
            # Only the batch being collected is checked for duplicated IDs.
            if sample_id in data:
                err_msg2 = f'{err_msg} {sample_id} is not unique sample ID!'
                raise ValueError(err_msg2)
            # Collapse every run of whitespace (line breaks too) into one space.
            text = ' '.join(row[column_of['excerpt']].split())
            if len(text) == 0:
                raise ValueError(err_msg + f' Text {sample_id} is empty!')
            features = calc_text_features([nltk.sent_tokenize(text)], tok)
            data[sample_id] = (
                [tok.cls_token] + tok.tokenize(text.lower()) + [tok.sep_token],
                features
            )
            if len(data) >= batch_size:
                yield data
                data = dict()
    if len(data) > 0:
        yield data

In [None]:
def vectorize_data(
    data: Union[Dict[str, Tuple[List[str], np.ndarray]],
                Dict[str, Tuple[List[str], float, float, np.ndarray]]],
    tokenizer: AutoTokenizer, fe: tf.keras.Model, scaler: StandardScaler,
    max_seq_len: int, batch_size: int
) -> Tuple[Union[Dict[str, int], Dict[str, Tuple[int, float, float]]], np.ndarray]:
    """Turn loaded samples into feature vectors of the pretrained extractor.

    Args:
        data: samples keyed by ID. A 2-item value is an unlabelled sample
            ``(tokens, text_features)``; any other value is treated as a
            labelled sample ``(tokens, target, standard_error, text_features)``.
        tokenizer: tokenizer used to convert tokens into IDs.
        fe: feature-extraction model fed with
            ``[token IDs, attention mask, scaled text features]``.
        scaler: fitted scaler applied to the stacked text features.
        max_seq_len: every token sequence is padded or cut to this length.
        batch_size: batch size for ``fe.predict``; the input is padded with
            copies of the last sample up to a multiple of it.

    Returns:
        A pair ``(index, features)``. ``index`` maps every sample ID to its row
        in ``features`` (unlabelled samples) or to ``(row, target,
        standard_error)`` (labelled samples); rows follow the sorted IDs.
        ``features`` holds one extracted vector per sample.

    Raises:
        ValueError: if ``data`` is empty.
    """
    if len(data) == 0:
        raise ValueError('There are no samples to vectorize!')
    tokenized_data = dict()
    all_tokens_matrix = []
    additional_features = []
    for cur_ID in sorted(data):
        sample = data[cur_ID]
        # NOTE(review): both loaders in this file already wrap the tokens into
        # CLS/SEP, so the markers get doubled here. This is kept unchanged on
        # the assumption that the pretrained weights were produced with the
        # same layout - confirm before changing.
        token_ids = tokenizer.convert_tokens_to_ids(
            [tokenizer.cls_token] + sample[0] + [tokenizer.sep_token]
        )
        ndiff = max_seq_len - len(token_ids)
        if ndiff > 0:
            token_ids += [tokenizer.pad_token_id for _ in range(ndiff)]
        elif ndiff < 0:
            token_ids = token_ids[:max_seq_len]
        row_idx = len(all_tokens_matrix)
        if len(sample) == 2:
            tokenized_data[cur_ID] = row_idx
            additional_features.append(sample[1])
        else:
            tokenized_data[cur_ID] = (row_idx, sample[1], sample[2])
            additional_features.append(sample[3])
        all_tokens_matrix.append(token_ids)
    n_samples = len(all_tokens_matrix)
    # Fill the last batch with copies of the last sample; the corresponding
    # outputs are dropped after prediction.
    while len(all_tokens_matrix) % batch_size != 0:
        all_tokens_matrix.append(all_tokens_matrix[-1])
        additional_features.append(additional_features[-1])
    all_tokens_matrix = np.array(all_tokens_matrix, dtype=np.int32)
    attentions = generate_attention_mask(all_tokens_matrix, tokenizer.pad_token_id)
    additional_features = scaler.transform(np.vstack(additional_features))
    features = fe.predict(
        [all_tokens_matrix, attentions, additional_features],
        batch_size=batch_size
    )[:n_samples]
    return tokenized_data, features

In [None]:
def load_feature_extractor(fe_dir_name: str) -> Tuple[tf.keras.Model, StandardScaler,
                                                      int, int]:
    """Rebuild the pretrained regression network and return its feature extractor.

    The directory ``fe_dir_name`` must contain ``output_scaler.pkl`` (a pickled
    pair whose first item is used as the scaler of the hand-crafted text
    features), ``regression_nn.h5`` (weights of the whole regression network)
    and whatever ``AutoConfig.from_pretrained`` needs to read the transformer
    configuration.

    Returns:
        A tuple ``(feature_extraction_model, text_feature_scaler, max_seq_len,
        batch_size)``. The model takes ``[word_ids, attention_mask, features]``
        and outputs the activations of the 256-unit dense layer that feeds the
        regression head. ``max_seq_len`` and ``batch_size`` are the fixed
        values the inputs were declared with.

    Raises:
        AssertionError: if the scaler file or the weights file does not exist.
    """
    # Input shapes are fixed, including the batch dimension.
    batch_size = 32
    max_seq_len = 256
    scaler_name = os.path.join(fe_dir_name, 'output_scaler.pkl')
    assert os.path.isfile(scaler_name)
    nn_weights_name = os.path.join(fe_dir_name, 'regression_nn.h5')
    assert os.path.isfile(nn_weights_name)
    # NOTE(review): unpickling can execute arbitrary code; use only with a
    # trusted model directory. The second item of the pickled pair is ignored.
    with open(scaler_name, 'rb') as fp:
        text_feature_scaler, _ = pickle.load(fp)
    feature_vector_size = text_feature_scaler.scale_.shape[0]
    transformer_config = AutoConfig.from_pretrained(fe_dir_name)
    print('Transformer Configuration')
    print('=========================')
    # The transformer is created from its configuration only; its weights are
    # expected to come from the .h5 file loaded at the end of this function.
    transformer_model = TFAutoModel.from_config(
        config=transformer_config,
        name='DistilTransformer'
    )
    # Dense projection whose output is the extracted feature vector.
    united_emb_layer = tf.keras.layers.Dense(
        units=256, input_dim=transformer_config.hidden_size,
        activation='tanh',
        kernel_initializer=tf.keras.initializers.GlorotNormal(seed=42),
        bias_initializer='zeros' 
    )
    print(transformer_config)
    left_tokens = tf.keras.layers.Input(shape=(max_seq_len,), batch_size=batch_size,
                                        dtype=tf.int32, name='word_ids')
    left_attention = tf.keras.layers.Input(shape=(max_seq_len,), batch_size=batch_size,
                                           dtype=tf.int32, name='attention_mask')
    left_features = tf.keras.layers.Input(shape=(feature_vector_size,), dtype=tf.float32,
                                          batch_size=batch_size, name='features')
    # Element 0 of the transformer output is used as the per-token sequence.
    left_sequence_output = transformer_model([left_tokens, left_attention])[0]
    # Broadcast the attention mask over the hidden dimension and multiply, so
    # the vectors of positions with a zero mask become all-zero.
    left_output_mask = MaskCalculator(
        output_dim=transformer_config.hidden_size, trainable=False,
        name='OutMaskCalculator'
    )(left_attention)
    left_masked_output = tf.keras.layers.Multiply(
        name='OutMaskMultiplicator'
    )([left_output_mask, left_sequence_output])
    # Presumably makes the pooling below average over the non-padded positions
    # only (Masking with its default mask value) - TODO confirm.
    left_masked_output = tf.keras.layers.Masking(
        name='OutMasking'
    )(left_masked_output)
    left_output = tf.keras.layers.GlobalAvgPool1D(name='AvePool')(left_masked_output)
    # The layer name below is misspelled, but it is a runtime string and is
    # therefore left as it is.
    left_output = tf.keras.layers.LayerNormalization(
        name='Emdedding'
    )(left_output)
    # Append the scaled hand-crafted text features to the pooled embedding.
    left_output = tf.keras.layers.Concatenate(
        name='Concat'
    )([left_output, left_features])
    left_output = tf.keras.layers.Dropout(
        rate=0.3, seed=42, name='Dropout1' 
    )(left_output)
    left_output = united_emb_layer(left_output)
    regression_layer = tf.keras.layers.Dense(
        units=1, input_dim=256, activation=None,
        kernel_initializer=tf.keras.initializers.GlorotNormal(seed=42),
        bias_initializer='zeros',
        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4),
        name='RegressionLayer'
    )
    left_regression_output = regression_layer(left_output)
    # Full network (extractor + regression head); the weights file is loaded
    # into this model.
    regression_model = tf.keras.Model(
        inputs=[left_tokens, left_attention, left_features],
        outputs=left_regression_output,
        name='RegressionModel'
    )
    regression_model.build(input_shape=[(batch_size, max_seq_len),
                                        (batch_size, max_seq_len),
                                        (batch_size, feature_vector_size)])
    # Same graph cut before the regression head. It is built from the same
    # layer objects, so it sees the weights loaded into `regression_model`.
    feature_extraction_model = tf.keras.Model(
        inputs=[left_tokens, left_attention, left_features],
        outputs=left_output,
        name='FeatureExtractionModel'
    )
    feature_extraction_model.build(input_shape=[(batch_size, max_seq_len),
                                                (batch_size, max_seq_len),
                                                (batch_size, feature_vector_size)])
    regression_model.load_weights(nn_weights_name)
    return feature_extraction_model, text_feature_scaler, max_seq_len, batch_size

In [None]:
def mixup(X: np.ndarray, y_mean: np.ndarray, y_std: np.ndarray,
          mixup_coeff: float, n_samples: int) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic samples by blending random pairs of real samples.

    For every generated sample two rows are drawn (with replacement) using the
    ``random`` module. The new input is ``(1 - mixup_coeff) * X[first] +
    mixup_coeff * X[second]``; the new target blends, with the same weights,
    two values drawn by ``np.random.normal`` around the targets of the chosen
    rows, using their standard deviations.

    Args:
        X: 2-D matrix of input vectors, one row per sample.
        y_mean: 1-D vector of target means, aligned with ``X``.
        y_std: 1-D vector of target standard deviations, aligned with ``X``.
        mixup_coeff: weight of the second sample, strictly between 0 and 1.
        n_samples: number of samples to generate.

    Returns:
        A pair ``(X_new, y_new)`` of ``float64`` arrays with the shapes
        ``(n_samples, X.shape[1])`` and ``(n_samples,)``.
    """
    assert 0.0 < mixup_coeff < 1.0
    assert X.ndim == 2
    assert y_mean.ndim == 1
    assert y_std.ndim == 1
    assert X.shape[0] == y_mean.shape[0]
    assert y_mean.shape[0] == y_std.shape[0]
    last_row = X.shape[0] - 1
    first_weight = 1.0 - mixup_coeff
    X_new = np.empty((n_samples, X.shape[1]), dtype=np.float64)
    y_new = np.empty((n_samples,), dtype=np.float64)
    for generated in range(n_samples):
        # Both row indices are drawn before any target noise is drawn.
        first = random.randint(0, last_row)
        second = random.randint(0, last_row)
        X_new[generated] = first_weight * X[first] + mixup_coeff * X[second]
        first_target = np.random.normal(loc=y_mean[first], scale=y_std[first])
        second_target = np.random.normal(loc=y_mean[second], scale=y_std[second])
        y_new[generated] = first_weight * first_target + mixup_coeff * second_target
    return X_new, y_new

In [None]:
def build_regressor(labels: Dict[str, Tuple[int, float, float]], features: np.ndarray,
                    IDs_for_training: List[str],
                    IDs_for_validation: List[str], batch_size: int,
                    ensemble_idx: int) -> tf.keras.Model:
    """Train one self-normalizing (SELU) regressor of the ensemble.

    Args:
        labels: maps a sample ID to ``(row in features, target mean,
            target standard deviation)``.
        features: matrix of extracted feature vectors, one row per sample.
        IDs_for_training: IDs of the samples the training set is generated from.
        IDs_for_validation: IDs of the samples used for early stopping.
        batch_size: mini-batch size for training and validation.
        ensemble_idx: number of the ensemble member; it defines the layer
            names, the random seeds of the layers and the figure number.

    Returns:
        The trained Keras model with the best (by validation RMSE) weights
        restored.

    Raises:
        ValueError: if the training or the validation subset is empty.
    """
    if (len(IDs_for_training) == 0) or (len(IDs_for_validation) == 0):
        raise ValueError('Both the training and the validation subsets must be non-empty!')
    X_train = features[[labels[cur_id][0] for cur_id in IDs_for_training]]
    y_train_mean = np.array([labels[cur_id][1] for cur_id in IDs_for_training],
                            dtype=np.float32)
    y_train_std = np.array([labels[cur_id][2] for cur_id in IDs_for_training],
                           dtype=np.float32)
    X_val = features[[labels[cur_id][0] for cur_id in IDs_for_validation]]
    y_val = np.array([labels[cur_id][1] for cur_id in IDs_for_validation],
                     dtype=np.float32)
    # Six SELU hidden layers followed by a linear output unit; every dense
    # layer is preceded by its own AlphaDropout. Layer number k uses the seeds
    # `ensemble_idx * 10 * k` (dropout) and `(ensemble_idx + 1) * 10 * k`
    # (kernel initializer).
    hidden_units = [400, 400, 300, 300, 200, 100]
    layers = [
        tf.keras.layers.InputLayer(
            input_shape=(features.shape[1],),
            dtype=tf.float32,
            name=f'input_nn{ensemble_idx}'
        )
    ]
    for layer_no, n_units in enumerate(hidden_units + [1], start=1):
        is_output = (layer_no > len(hidden_units))
        layers.append(
            tf.keras.layers.AlphaDropout(
                rate=0.1, seed=ensemble_idx * 10 * layer_no,
                name=f'dropout{layer_no}_nn{ensemble_idx}'
            )
        )
        layers.append(
            tf.keras.layers.Dense(
                units=n_units, activation=(None if is_output else 'selu'),
                kernel_initializer=tf.keras.initializers.LecunNormal(
                    seed=(ensemble_idx + 1) * 10 * layer_no
                ),
                name=f'dense{layer_no}_nn{ensemble_idx}'
            )
        )
    regressor = tf.keras.Sequential(layers=layers, name=f'FinalRegressor{ensemble_idx}')
    # "Ranger" = Rectified Adam wrapped into Lookahead.
    radam = tfa.optimizers.RectifiedAdam(learning_rate=1e-3)
    ranger = tfa.optimizers.Lookahead(radam, sync_period=6, slow_step_size=0.5)
    regressor.compile(optimizer=ranger, loss=tf.keras.losses.MeanSquaredError(),
                      metrics=[tf.keras.metrics.RootMeanSquaredError()])
    print(f'Estimator {ensemble_idx}')
    print('====================')
    print('')
    regressor.summary()
    # Replace the real training samples with 50000 mixed-up synthetic ones.
    X_train, y_train = mixup(X_train, y_train_mean, y_train_std, 0.1, 50000)
    # One "epoch" covers at most five validation sets' worth of samples, so
    # early stopping is evaluated several times per pass over the synthetic
    # set. Both counters are kept at 1 or above to avoid a division by zero and
    # a zero-epoch training run when the validation set is tiny.
    steps_per_epoch = max(1, min(5 * X_val.shape[0], X_train.shape[0]) // batch_size)
    steps_per_trainset = X_train.shape[0] // batch_size
    n_epochs = max(1, steps_per_trainset // steps_per_epoch)
    print('')
    print(f'steps_per_trainset = {steps_per_trainset}')
    print(f'steps_per_epoch = {steps_per_epoch}')
    print(f'n_epochs = {n_epochs}')
    print('')
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor="val_root_mean_squared_error",
            patience=3,
            verbose=True,
            restore_best_weights=True
        )
    ]
    tf_trainset = tf.data.Dataset.from_tensor_slices(
        (X_train, y_train)
    ).repeat().shuffle(50000).batch(batch_size)
    tf_validset = tf.data.Dataset.from_tensor_slices(
        (X_val, y_val)
    ).batch(batch_size)
    # The datasets are already batched, so `batch_size` is not passed to `fit`.
    history = regressor.fit(tf_trainset, validation_data=tf_validset,
                            callbacks=callbacks, epochs=n_epochs,
                            steps_per_epoch=steps_per_epoch)
    show_training_process(history, "root_mean_squared_error",
                          ensemble_idx, f'of regressor {ensemble_idx}')
    return regressor

In [None]:
def show_training_process(history: tf.keras.callbacks.History, metric_name: str,
                          figure_id: int, additional_info: str):
    """Plot the learning curves of one metric.

    Args:
        history: object whose ``history`` attribute maps metric names to the
            lists of their per-epoch values.
        metric_name: name of the training metric to plot; the curve of
            ``'val_' + metric_name`` is added when it is available.
        figure_id: number of the matplotlib figure to draw on.
        additional_info: text appended to the plot title (may be empty).

    Raises:
        ValueError: if ``metric_name`` is not among the logged metrics.
    """
    if metric_name not in history.history:
        known_metrics = list(history.history.keys())
        raise ValueError(
            f'The metric "{metric_name}" is not found!' +
            f' Available metrics are: {known_metrics}.'
        )
    plt.figure(figure_id, figsize=(7, 7))
    train_curve = history.history[metric_name]
    plt.plot(list(range(len(train_curve))), train_curve,
             label='Training {0}'.format(metric_name))
    val_metric_name = 'val_' + metric_name
    if val_metric_name in history.history:
        val_curve = history.history[val_metric_name]
        assert len(train_curve) == len(val_curve)
        plt.plot(list(range(len(val_curve))), val_curve,
                 label='Validation {0}'.format(metric_name))
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    title_suffix = additional_info.strip()
    if len(title_suffix) > 0:
        plt.title('Training process ' + title_suffix)
    else:
        plt.title('Training process')
    plt.legend(loc='best')
    plt.show()

In [None]:
def do_predictions(regressor: List[tf.keras.Model], batch_size: int, features: np.ndarray,
                   data: Union[Dict[str, int], Dict[str, Tuple[int, float, float]]],
                   identifiers: Union[List[str], None]=None) -> Dict[str, float]:
    """Average the predictions of all ensemble members for the given samples.

    Args:
        regressor: non-empty list of models with a Keras-like ``predict``.
        batch_size: batch size passed to every ``predict`` call.
        features: matrix of feature vectors, one row per sample.
        data: maps a sample ID either to its row in ``features`` or to a tuple
            whose first item is that row.
        identifiers: IDs to predict for; all keys of ``data`` when omitted.

    Returns:
        A dictionary that maps every requested ID (in sorted order) to the mean
        of the members' predictions.
    """
    ordered_ids = sorted(list(data.keys())) if identifiers is None else sorted(identifiers)
    rows = []
    for sample_id in ordered_ids:
        entry = data[sample_id]
        rows.append(entry if isinstance(entry, int) else entry[0])
    n_rows = len(rows)
    X = features[rows]
    first_member, other_members = regressor[0], regressor[1:]
    averaged = first_member.predict(X, batch_size=batch_size).reshape((n_rows,))
    for member in other_members:
        averaged += member.predict(X, batch_size=batch_size).reshape((n_rows,))
    averaged /= float(len(regressor))
    return {ordered_ids[pos]: averaged[pos] for pos in range(n_rows)}

In [None]:
# Seed the global generators of `random`, NumPy and TensorFlow. `mixup` draws
# from the first two, and the train/validation/test split uses `random.shuffle`.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Directory the tokenizer and the feature extractor are loaded from;
# the printed flag tells whether it exists.
PRETRAINED_MODEL_DIR = '/kaggle/input/tf-readability-distilbert'
print(f'{PRETRAINED_MODEL_DIR} {os.path.isdir(PRETRAINED_MODEL_DIR)}')

In [None]:
# Load the tokenizer and rebuild the feature extractor together with its
# feature scaler, fixed sequence length and fixed batch size.
pretrained_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_DIR)
feature_extractor, input_scaler, max_text_len, minibatch_size = load_feature_extractor(
    PRETRAINED_MODEL_DIR
)

In [None]:
# Input data directory and output directory; the printed flags tell whether
# each of them exists.
DATA_DIR = '/kaggle/input/commonlitreadabilityprize'
MODEL_DIR = '/kaggle/working'
print(f'{DATA_DIR} {os.path.isdir(DATA_DIR)}')
print(f'{MODEL_DIR} {os.path.isdir(MODEL_DIR)}')

In [None]:
# Path to the labelled training CSV; the printed flag tells whether it exists.
trainset_name = os.path.join(DATA_DIR, 'train.csv')
print(f'{trainset_name} {os.path.isfile(trainset_name)}')

In [None]:
# Path to the unlabelled test CSV; the printed flag tells whether it exists.
testset_name = os.path.join(DATA_DIR, 'test.csv')
print(f'{testset_name} {os.path.isfile(testset_name)}')

In [None]:
# Path of the submission file inside MODEL_DIR; the printed flag tells whether
# such a file already exists.
submission_name = os.path.join(MODEL_DIR, 'submission.csv')
print(f'{submission_name} {os.path.isfile(submission_name)}')

In [None]:
# Read and tokenize the labelled excerpts; the assertion is a sanity check that
# more than 100 samples were loaded.
data_for_training = load_data_for_training(trainset_name,
                                           pretrained_tokenizer)
assert len(data_for_training) > 100

In [None]:
# Extract a feature vector for every training sample.
# NOTE(review): `data_for_training` is rebound here from the loaded samples to
# {id: (row index, target, standard error)}, so this cell is not idempotent:
# re-running it requires re-running the loading cell first.
data_for_training, features_for_training = vectorize_data(
    data=data_for_training,
    tokenizer=pretrained_tokenizer,
    fe=feature_extractor,
    scaler=input_scaler,
    max_seq_len=max_text_len,
    batch_size=minibatch_size
)

In [None]:
# Shuffle the sample IDs and split them 70% / 15% / remainder into the
# training, validation and held-out testing subsets.
all_IDs = list(data_for_training.keys())
random.shuffle(all_IDs)
n_train_size = int(round(len(all_IDs) * 0.7))
n_val_size = int(round(len(all_IDs) * 0.15))
identifiers_for_training = all_IDs[:n_train_size]
identifiers_for_validation = all_IDs[n_train_size:(n_train_size + n_val_size)]
identifiers_for_final_testing = all_IDs[(n_train_size + n_val_size):]
del all_IDs

In [None]:
# Trained regressors are collected here, one per ensemble member.
ensemble = []

In [None]:
# Train ensemble member 1 (the training batch is four extractor batches).
# NOTE(review): this cell and the two after it are repeated verbatim for every
# member with only the index changed; a single loop would remove the copies.
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=1
    )
)

In [None]:
# Predict for the held-out IDs with the newest member only (`ensemble[-1:]`).
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
# RMSE of member 1 on the held-out IDs; index 1 of a label tuple is the target.
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 1 = {error}')

In [None]:
# Train ensemble member 2 on the same split (seeds depend on `ensemble_idx`).
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=2
    )
)

In [None]:
# Predict for the held-out IDs with the newest member only (`ensemble[-1:]`).
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
# RMSE of member 2 on the held-out IDs; index 1 of a label tuple is the target.
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 2 = {error}')

In [None]:
# Train ensemble member 3 on the same split (seeds depend on `ensemble_idx`).
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=3
    )
)

In [None]:
# Predict for the held-out IDs with the newest member only (`ensemble[-1:]`).
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
# RMSE of member 3 on the held-out IDs; index 1 of a label tuple is the target.
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 3 = {error}')

In [None]:
# Train ensemble member 4 on the same split (seeds depend on `ensemble_idx`).
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=4
    )
)

In [None]:
# Predict for the held-out IDs with the newest member only (`ensemble[-1:]`).
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
# RMSE of member 4 on the held-out IDs; index 1 of a label tuple is the target.
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 4 = {error}')

In [None]:
# Train ensemble member 5 on the same split (seeds depend on `ensemble_idx`).
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=5
    )
)

In [None]:
# Predict for the held-out IDs with the newest member only (`ensemble[-1:]`).
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
# RMSE of member 5 on the held-out IDs; index 1 of a label tuple is the target.
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 5 = {error}')

In [None]:
# Train ensemble member 6 on the same split (seeds depend on `ensemble_idx`).
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=6
    )
)

In [None]:
# Predict for the held-out IDs with the newest member only (`ensemble[-1:]`).
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
# RMSE of member 6 on the held-out IDs; index 1 of a label tuple is the target.
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 6 = {error}')

In [None]:
# Train ensemble member 7 on the same split (seeds depend on `ensemble_idx`).
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=7
    )
)

In [None]:
# Predict for the held-out IDs with the newest member only (`ensemble[-1:]`).
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 7 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=8
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 8 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=9
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 9 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=10
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 10 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=11
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 11 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=12
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 12 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=13
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 13 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=14
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 14 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=15
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 15 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=16
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 16 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=17
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 17 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=18
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 18 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=19
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 19 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=20
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 20 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=21
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 21 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=22
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 22 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=23
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 23 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=24
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 24 = {error}')

In [None]:
ensemble.append(
    build_regressor(
        labels=data_for_training,
        features=features_for_training,
        IDs_for_training=identifiers_for_training,
        IDs_for_validation=identifiers_for_validation,
        batch_size=minibatch_size * 4,
        ensemble_idx=25
    )
)

In [None]:
predictions_for_testing = do_predictions(
    regressor=ensemble[-1:], batch_size=minibatch_size * 4,
    data=data_for_training, features=features_for_training,
    identifiers=identifiers_for_final_testing
)

In [None]:
error = 0.0
for cur_id in identifiers_for_final_testing:
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(identifiers_for_final_testing))
error = np.sqrt(error)
print(f'RMSE of item 25 = {error}')

In [None]:
# Predict the final test identifiers with the whole `ensemble` passed as the
# regressor; the result replaces the previous `predictions_for_testing`.
predictions_for_testing = do_predictions(
    identifiers=identifiers_for_final_testing,
    data=data_for_training,
    features=features_for_training,
    regressor=ensemble,
    batch_size=4 * minibatch_size
)

In [None]:
# Root-mean-square error of the ensemble predictions on the final test
# identifiers. Squared differences are accumulated one at a time, in
# iteration order, before being averaged and square-rooted.
error = 0.0
for cur_id in identifiers_for_final_testing:
    # Element [1] of the `data_for_training` entry is the value the
    # prediction is compared against.
    difference = predictions_for_testing[cur_id] - data_for_training[cur_id][1]
    error += difference * difference
error = np.sqrt(error / float(len(identifiers_for_final_testing)))
print(f'RMSE of ensemble = {error}')

In [None]:
# Run a full garbage-collection pass; as the last expression of the cell, the
# value returned by `gc.collect()` is shown as the cell output.
gc.collect()

In [None]:
# Write the submission CSV: a header row, then one `id,target` row for every
# prediction. The test set is processed part by part, as produced by
# `load_data_for_testing`; each part is vectorized, scored by the whole
# ensemble, written out, and then released before the next part is handled.
#
# The file is opened with the built-in `open` and `newline=''`, which is the
# form the `csv` module documents for writer targets: the writer emits its own
# line terminators and they reach the file untranslated, just as they did with
# the binary-backed `codecs.open` used previously.
with open(submission_name, mode='w', encoding='utf-8', newline='') as fp:
    data_writer = csv.writer(fp, quotechar='"', delimiter=',')
    data_writer.writerow(['id', 'target'])
    # NOTE(review): the third argument (`minibatch_size * 32`) looks like the
    # number of samples per part — confirm against `load_data_for_testing`.
    for data_part in load_data_for_testing(testset_name, pretrained_tokenizer,
                                           minibatch_size * 32):
        data_for_submission, features_for_submission = vectorize_data(
            data=data_part,
            tokenizer=pretrained_tokenizer,
            fe=feature_extractor,
            scaler=input_scaler,
            max_seq_len=max_text_len,
            batch_size=minibatch_size
        )
        # No `identifiers` argument is passed here, unlike the evaluation
        # calls; the result is iterated by identifier below.
        predictions_for_submission = do_predictions(
            regressor=ensemble, batch_size=minibatch_size * 4,
            data=data_for_submission, features=features_for_submission
        )
        for cur_id in predictions_for_submission:
            data_writer.writerow([cur_id, f'{predictions_for_submission[cur_id]}'])
        # Drop the per-part objects explicitly and collect garbage so they do
        # not stay referenced while the next part is loaded.
        del predictions_for_submission
        del data_for_submission, features_for_submission
        gc.collect()