In [None]:
import codecs
import copy
import csv
import gc
from itertools import chain
import os
import pickle
import random
import time
from typing import Dict, List, Tuple, Union
import warnings

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import nltk
from nltk.corpus import wordnet
import numpy as np
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
import tensorflow as tf
import tensorflow_addons as tfa
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

In [None]:
# Print the installed TensorFlow version so it is visible in the notebook output.
print(tf.__version__)

In [None]:
class MaskCalculator(tf.keras.layers.Layer):
    """Expand a 2-D mask into a 3-D float mask that can be multiplied with a sequence.

    The input is a tensor of shape ``(batch, seq_len)``. Every element greater
    than zero becomes ``1.0`` and every other element becomes ``0.0``; the result
    is then repeated ``output_dim`` times along a new last axis, giving a
    ``float32`` tensor of shape ``(batch, seq_len, output_dim)``.

    Args:
        output_dim: size of the new last axis.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, output_dim, **kwargs):
        super(MaskCalculator, self).__init__(**kwargs)
        self.output_dim = output_dim

    def build(self, input_shape):
        # The layer has no weights; only the base-class bookkeeping is needed.
        super(MaskCalculator, self).build(input_shape)

    def call(self, inputs, **kwargs):
        # (batch, seq_len) -> 1.0 where the mask is set, 0.0 elsewhere.
        binary_mask = tf.keras.backend.cast(
            x=tf.keras.backend.greater(x=inputs, y=0),
            dtype='float32'
        )
        # backend.repeat turns (batch, seq_len) into (batch, output_dim, seq_len).
        repeated_mask = tf.keras.backend.repeat(x=binary_mask, n=self.output_dim)
        # Swap the last two axes to obtain (batch, seq_len, output_dim).
        return tf.keras.backend.permute_dimensions(
            x=repeated_mask,
            pattern=(0, 2, 1)
        )

    def compute_output_shape(self, input_shape):
        # The input must be 2-D (batch, seq_len): backend.repeat only accepts
        # 2-D tensors, so a rank of 1 could never describe a valid input.
        assert len(input_shape) == 2
        shape = list(input_shape)
        shape.append(self.output_dim)
        return tuple(shape)

    def get_config(self):
        # Expose ``output_dim`` so the layer can be re-created from its config.
        config = super(MaskCalculator, self).get_config()
        config['output_dim'] = self.output_dim
        return config

In [None]:
class DatasetGen(tf.keras.utils.Sequence):
    """Mini-batch generator of ordered text pairs for the twin (siamese) regressor.

    Every sample is an ordered pair ``(left, right)`` of two different sample IDs;
    the regression target of the pair is ``target(left) - target(right)``.

    ``data[sample_id]`` is read as a 4-tuple:
      * ``[0]`` - list of row indices into ``token_indices``, one per text variant
        (variant 0 is the one used whenever augmentation is off);
      * ``[1]`` - target value;
      * ``[2]`` - used as the standard deviation of the noise added to the target
        when augmentation is on;
      * ``[3]`` - 2-D feature matrix with one row per text variant.

    Two working modes exist:
      * ``batches_per_epoch is None`` - batches walk through the shuffled pair
        list in a fixed order, never augment, and the last batch is padded by
        repeating its final sample so every batch has ``batch_size`` rows;
      * otherwise - every batch is made of ``batch_size`` pairs drawn at random
        (with replacement) and augmentation is applied if ``apply_augmentation``.

    Args:
        data: mapping described above (deep-copied on construction).
        data_IDs: sample IDs from which the pairs are built.
        apply_augmentation: enables variant sampling and target noise in the
            random mode.
        feature_scaler: fitted transformer applied to the feature rows.
        token_indices: 2-D matrix of token IDs, one row per text variant.
        pad_token_id: token ID that marks padding (used for the attention mask).
        batch_size: number of pairs per batch.
        batches_per_epoch: number of random batches per epoch, or ``None`` for
            the fixed-order mode.
    """

    def __init__(self, data: Dict[str, Tuple[List[int], float, float, np.ndarray]],
                 data_IDs: List[str], apply_augmentation: bool,
                 feature_scaler: Pipeline,
                 token_indices: np.ndarray, pad_token_id: int,
                 batch_size: int, batches_per_epoch: Union[int, None] = None):
        self.data = copy.deepcopy(data)
        self.token_indices = token_indices
        self.pad_token_id = pad_token_id
        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch
        self.feature_scaler = feature_scaler
        self.apply_augmentation = apply_augmentation
        # Build the pairs from an order-preserving de-duplication of the IDs.
        # Going through a ``set`` would make the pre-shuffle order depend on
        # string hash randomisation, so a seeded ``random`` would still give a
        # different pair order in every Python process.
        unique_ids = list(dict.fromkeys(data_IDs))
        self.pairs = [
            (key1, key2)
            for key1 in unique_ids
            for key2 in unique_ids
            if key1 != key2
        ]
        random.shuffle(self.pairs)
        # In the fixed-order mode an epoch covers at most 4 pairs per sample ID.
        self.n_samples = min(len(self.pairs), len(data_IDs) * 4)

    def __len__(self):
        if self.batches_per_epoch is None:
            return int(np.ceil(self.n_samples / float(self.batch_size)))
        return self.batches_per_epoch

    def _pick_variant(self, key, augment):
        """Choose one text variant of sample ``key``.

        Returns a tuple ``(token_row_index, feature_row, target)`` where
        ``feature_row`` keeps its 2-D shape ``(1, n_features)``.
        """
        sample = self.data[key]
        n_variants = len(sample[0])
        if augment:
            # Variant 0 gets weight max(2, n_variants - 1), all others weight 1.
            p = np.ones((n_variants,), dtype=np.float64)
            p[0] = max(2.0, p.shape[0] - 1.0)
            p /= p.sum()
            variant = np.random.choice(n_variants, p=p)
            # Jitter the target with Gaussian noise scaled by sample[2].
            target = np.random.normal(loc=sample[1], scale=sample[2])
        else:
            variant = 0
            target = sample[1]
        return sample[0][variant], sample[3][variant:(variant + 1)], target

    def __getitem__(self, idx):
        """Return ``(batch_x, batch_y, None)`` for batch number ``idx``.

        ``batch_x`` is the list ``[left tokens, left attention mask, left
        features, right tokens, right attention mask, right features]`` and
        ``batch_y`` has shape ``(batch_size, 1)``. The trailing ``None`` is
        returned in the third position of the tuple.
        """
        seq_len = self.token_indices.shape[1]
        x_left = np.zeros(shape=(self.batch_size, seq_len), dtype=np.int32)
        x_right = np.zeros(shape=(self.batch_size, seq_len), dtype=np.int32)
        batch_y = np.zeros((self.batch_size, 1), dtype=np.float32)
        left_features = []
        right_features = []
        if self.batches_per_epoch is None:
            batch_start = idx * self.batch_size
            # NOTE: the upper bound is the full pair list, not ``n_samples``.
            batch_end = min(len(self.pairs), batch_start + self.batch_size)
            n_real = batch_end - batch_start
            if n_real <= 0:
                raise IndexError(f'Batch index {idx} is out of range!')
            batch_pairs = [self.pairs[batch_start + offset] for offset in range(n_real)]
            augment = False
        else:
            batch_pairs = [random.choice(self.pairs) for _ in range(self.batch_size)]
            augment = self.apply_augmentation
        for sample_idx, (left_key, right_key) in enumerate(batch_pairs):
            left_idx, left_row, left_target = self._pick_variant(left_key, augment)
            right_idx, right_row, right_target = self._pick_variant(right_key, augment)
            left_features.append(left_row)
            right_features.append(right_row)
            x_left[sample_idx] = self.token_indices[left_idx]
            x_right[sample_idx] = self.token_indices[right_idx]
            batch_y[sample_idx, 0] = left_target - right_target
        # Pad an incomplete (last) batch by repeating its final real sample.
        for sample_idx in range(len(batch_pairs), self.batch_size):
            x_left[sample_idx] = x_left[sample_idx - 1]
            x_right[sample_idx] = x_right[sample_idx - 1]
            left_features.append(left_features[-1])
            right_features.append(right_features[-1])
            batch_y[sample_idx, 0] = batch_y[sample_idx - 1, 0]
        batch_x = [
            x_left,
            generate_attention_mask(x_left, self.pad_token_id),
            self.feature_scaler.transform(np.vstack(left_features)),
            x_right,
            generate_attention_mask(x_right, self.pad_token_id),
            self.feature_scaler.transform(np.vstack(right_features))
        ]
        return batch_x, batch_y, None

In [None]:
def generate_attention_mask(token_indices: np.ndarray, padding_id: int) -> np.ndarray:
    """Build an attention mask for a matrix of token IDs.

    Args:
        token_indices: 2-D array of token IDs, one row per text.
        padding_id: token ID that marks padding.

    Returns:
        ``int32`` array of the same shape as ``token_indices``. In every row the
        positions before the first occurrence of ``padding_id`` are 1; that
        position and everything after it are 0.
    """
    is_padding = (token_indices == padding_id)
    # The running count of padding tokens stays at zero exactly until the first
    # padding token of the row is reached.
    return (np.cumsum(is_padding, axis=1) == 0).astype(np.int32)

In [None]:
def calc_text_features(texts: List[List[str]], tok: AutoTokenizer) -> np.ndarray:
    """Compute nine hand-crafted statistics for every text.

    Args:
        texts: one item per text; each item is the list of its sentences.
        tok: tokenizer whose ``tokenize`` method splits a word into sub-word pieces.

    Returns:
        ``float32`` array of shape ``(len(texts), 9)`` with the columns:
          0. number of sentences;
          1. mean number of word tokens per sentence;
          2. mean number of purely alphabetic words per sentence;
          3. length in characters of the sentences joined by single spaces;
          4. number of purely alphabetic words;
          5. mean length of an alphabetic word;
          6. total number of sub-word pieces over all alphabetic words;
          7. mean length of a sub-word piece;
          8. mean number of sub-word pieces per alphabetic word.
        A ratio whose denominator is zero (no sentences, no alphabetic words or
        no sub-word pieces) is reported as 0.0 instead of NaN/inf.
    """
    f = np.zeros((len(texts), 9), dtype=np.float32)
    for idx, sentences in enumerate(texts):
        words = []
        pure_words = []
        for cur_sent in sentences:
            words_in_sentence = nltk.word_tokenize(cur_sent)
            words += words_in_sentence
            pure_words += [it for it in words_in_sentence if it.isalpha()]
        n_sentences = len(sentences)
        n_pure_words = len(pure_words)
        f[idx, 0] = n_sentences
        if n_sentences > 0:
            f[idx, 1] = len(words) / n_sentences
            f[idx, 2] = n_pure_words / n_sentences
        f[idx, 3] = len(' '.join(sentences))
        f[idx, 4] = n_pure_words
        if n_pure_words > 0:
            f[idx, 5] = np.mean([len(w) for w in pure_words])
            n_pieces = 0
            n_piece_chars = 0
            for w in pure_words:
                pieces = tok.tokenize(w.lower())
                n_pieces += len(pieces)
                n_piece_chars += sum(len(it) for it in pieces)
            f[idx, 6] = n_pieces
            if n_pieces > 0:
                f[idx, 7] = n_piece_chars / n_pieces
            f[idx, 8] = n_pieces / n_pure_words
    return f

In [None]:
def load_data_for_training(
    fname: str,
    tok: AutoTokenizer
) -> Dict[str, Tuple[List[str], float, float, np.ndarray]]:
    """Load the labelled CSV file and build augmented text variants.

    The first non-empty row is the header; it must contain the fields
    ``excerpt``, ``id``, ``target`` and ``standard_error`` (in any position).
    Every text is split into sentences. For a sample with a valid standard
    error up to three extra variants are created by shuffling the order of its
    sentences; a variant is kept only if its lower-cased text has not been seen
    before in the file.

    Args:
        fname: path to the UTF-8 encoded CSV file.
        tok: tokenizer passed on to ``calc_text_features``.

    Returns:
        Mapping ``sample_id -> (texts, target, standard_error, features)``
        where ``texts[0]`` is the original text followed by the shuffled
        variants and ``features`` has one row per item of ``texts``.
        A sample whose standard error is not a positive number is reported with
        a warning and left out of the result.

    Raises:
        ValueError: a header field is missing, an ID is malformed or repeated,
            a text is empty or repeated, or a target is not a number.
    """
    required_fields = ('excerpt', 'id', 'target', 'standard_error')
    column_of = dict()
    line_idx = 1
    data = dict()
    set_of_texts = set()
    with codecs.open(fname, mode='r', encoding='utf-8') as fp:
        data_reader = csv.reader(fp, quotechar='"', delimiter=',')
        for row in data_reader:
            if len(row) > 0:
                err_msg = f'File {fname}: line {line_idx} is wrong!'
                if len(column_of) == 0:
                    # Header row. Column 0 is a legal position for any field.
                    for field in required_fields:
                        if field not in row:
                            raise ValueError(err_msg + f' Field "{field}" is not found!')
                    for field in required_fields:
                        column_of[field] = row.index(field)
                else:
                    sample_id = row[column_of['id']]
                    if sample_id != sample_id.strip():
                        raise ValueError(err_msg + f' {sample_id} is wrong sample ID!')
                    if sample_id in data:
                        err_msg2 = f'{err_msg} {sample_id} is not unique sample ID!'
                        raise ValueError(err_msg2)
                    text = row[column_of['excerpt']].replace('\r', '\n')
                    if len(text) == 0:
                        raise ValueError(err_msg + f' Text {sample_id} is empty!')
                    # Sentence splitting is done paragraph by paragraph.
                    sentences = []
                    for paragraph in map(lambda it: it.strip(), text.split('\n')):
                        if len(paragraph) > 0:
                            sentences += nltk.sent_tokenize(paragraph)
                    if len(sentences) == 0:
                        raise ValueError(err_msg + f' Text {sample_id} is empty!')
                    text = ' '.join(sentences)
                    if text.lower() in set_of_texts:
                        raise ValueError(err_msg + f' Text {sample_id} is not unique!')
                    set_of_texts.add(text.lower())
                    added_texts = [sentences]
                    raw_target = row[column_of['target']]
                    try:
                        target_val = float(raw_target)
                    except ValueError as err:
                        err_msg2 = err_msg
                        err_msg2 += f' {raw_target} is wrong target for ' \
                                    f'text {sample_id}.'
                        raise ValueError(err_msg2) from err
                    raw_std = row[column_of['standard_error']]
                    try:
                        std_val = float(raw_std)
                    except ValueError:
                        std_val = 0.0
                    # ``not (x > 0)`` also rejects NaN.
                    if not (std_val > 0.0):
                        err_msg2 = err_msg
                        err_msg2 += f' {raw_std} is wrong standard error' \
                                    f' for text {sample_id}.'
                        # The sample is skipped, not treated as a fatal error.
                        warnings.warn(err_msg2)
                    else:
                        clean_sentences = [it.strip() for it in sentences]
                        clean_sentences = [it for it in clean_sentences if len(it) > 0]
                        assert len(clean_sentences) > 0
                        for _ in range(3):
                            new_augmented_text = list(clean_sentences)
                            random.shuffle(new_augmented_text)
                            new_augmented_text_ = ' '.join(new_augmented_text)
                            if (len(new_augmented_text_) > 0) and \
                                    (new_augmented_text_.lower() not in set_of_texts):
                                set_of_texts.add(new_augmented_text_.lower())
                                added_texts.append(new_augmented_text)
                        data[sample_id] = (
                            [' '.join(it) for it in added_texts],
                            target_val, std_val,
                            calc_text_features(added_texts, tok)
                        )
            # Counts CSV records (including empty ones), starting from 1.
            line_idx += 1
    return data

In [None]:
def load_data_for_testing(fname: str, tok: AutoTokenizer, batch_size: int):
    """Lazily read the unlabelled CSV file in chunks of ``batch_size`` samples.

    The first non-empty row is the header; it must contain the fields
    ``excerpt`` and ``id`` (in any position). All whitespace runs of a text,
    line breaks included, are collapsed to single spaces.

    Args:
        fname: path to the UTF-8 encoded CSV file.
        tok: tokenizer passed on to ``calc_text_features``.
        batch_size: number of samples after which a chunk is yielded.

    Yields:
        Dictionaries ``sample_id -> (text, features)``; every chunk except
        possibly the last one holds ``batch_size`` samples. ID uniqueness is
        checked only within the current chunk.

    Raises:
        ValueError: a header field is missing, an ID is malformed or repeated
            within a chunk, or a text is empty. Because this is a generator the
            error is raised while iterating, not when the function is called.
    """
    required_fields = ('excerpt', 'id')
    column_of = dict()
    line_idx = 1
    data = dict()
    with codecs.open(fname, mode='r', encoding='utf-8') as fp:
        data_reader = csv.reader(fp, quotechar='"', delimiter=',')
        for row in data_reader:
            if len(row) > 0:
                err_msg = f'File {fname}: line {line_idx} is wrong!'
                if len(column_of) == 0:
                    # Header row. Column 0 is a legal position for any field.
                    for field in required_fields:
                        if field not in row:
                            raise ValueError(err_msg + f' Field "{field}" is not found!')
                    for field in required_fields:
                        column_of[field] = row.index(field)
                else:
                    sample_id = row[column_of['id']]
                    if sample_id != sample_id.strip():
                        raise ValueError(err_msg + f' {sample_id} is wrong sample ID!')
                    if sample_id in data:
                        err_msg2 = f'{err_msg} {sample_id} is not unique sample ID!'
                        raise ValueError(err_msg2)
                    text = row[column_of['excerpt']].replace('\n', ' ').replace('\r', ' ')
                    text = ' '.join(text.split()).strip()
                    if len(text) == 0:
                        raise ValueError(err_msg + f' Text {sample_id} is empty!')
                    features = calc_text_features([nltk.sent_tokenize(text)], tok)
                    data[sample_id] = (text, features)
                    if len(data) >= batch_size:
                        yield data
                        # Start a fresh dict so the yielded chunk is not mutated.
                        data = dict()
            # Counts CSV records (including empty ones), starting from 1.
            line_idx += 1
    if len(data) > 0:
        yield data

In [None]:
def train_feature_scaler(data: Dict[str, Tuple[List[int], float, float,
                                               np.ndarray]]) -> Pipeline:
    """Fit the feature normalisation pipeline on all feature rows of ``data``.

    The feature matrices stored at position 3 of every sample are stacked
    row-wise and used to fit a ``StandardScaler`` followed by a
    ``PowerTransformer``.

    Args:
        data: mapping ``sample_id -> (..., ..., ..., feature_matrix)``.

    Returns:
        The fitted ``Pipeline``.
    """
    stacked_features = np.vstack([sample[3] for sample in data.values()])
    normalizer = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('transformer', PowerTransformer())
    ])
    normalizer.fit(stacked_features)
    return normalizer

In [None]:
def _encode_text(text: str, tokenizer: AutoTokenizer, max_seq_len: int) -> List[int]:
    """Convert one text into exactly ``max_seq_len`` token IDs.

    The text tokens are cut to ``max_seq_len - 2`` items *before* the CLS and
    SEP tokens are added, so a long text still ends with the SEP token. Short
    sequences are right-padded with ``tokenizer.pad_token_id``.
    """
    tokens = tokenizer.tokenize(text)[:max(0, max_seq_len - 2)]
    token_ids = tokenizer.convert_tokens_to_ids(
        [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    )
    # Only relevant when max_seq_len < 2; guarantees the fixed row length.
    token_ids = list(token_ids[:max_seq_len])
    token_ids += [tokenizer.pad_token_id] * (max_seq_len - len(token_ids))
    return token_ids


def tokenize_data(
    data: Union[Dict[str, Tuple[str, np.ndarray]],
                Dict[str, Tuple[List[str], float, float, np.ndarray]]],
    tokenizer: AutoTokenizer, max_seq_len: int
) -> Tuple[Union[Dict[str, Tuple[int, np.ndarray]],
                 Dict[str, Tuple[List[int], float, float, np.ndarray]]],
           np.ndarray]:
    """Tokenize every text of ``data`` into one shared matrix of token IDs.

    Two sample layouts are recognised by the tuple length:
      * 2 items ``(text, features)`` - the text is replaced by the index of its
        row in the token matrix;
      * otherwise ``(texts, target, std, features)`` - the list of texts is
        replaced by the list of their row indices.

    Samples are processed in the sorted order of their IDs.

    Args:
        data: mapping ``sample_id -> sample`` in one of the layouts above.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id``.
        max_seq_len: length of every row of the token matrix.

    Returns:
        ``(tokenized_data, token_matrix)`` where ``token_matrix`` is an ``int32``
        array with one row of ``max_seq_len`` token IDs per text.
    """
    tokenized_data = dict()
    all_tokens_matrix = []
    for cur_ID in sorted(data.keys()):
        sample = data[cur_ID]
        if len(sample) == 2:
            tokenized_data[cur_ID] = (len(all_tokens_matrix), sample[1])
            all_tokens_matrix.append(_encode_text(sample[0], tokenizer, max_seq_len))
        else:
            text_idx_list = []
            for cur_text in sample[0]:
                text_idx_list.append(len(all_tokens_matrix))
                all_tokens_matrix.append(_encode_text(cur_text, tokenizer, max_seq_len))
            tokenized_data[cur_ID] = (text_idx_list, sample[1], sample[2], sample[3])
    return tokenized_data, np.array(all_tokens_matrix, dtype=np.int32)

In [None]:
def print_info_about_data(
    data: Union[List[Dict[str, Tuple[str, np.ndarray]]],
                List[Dict[str, Tuple[List[str], float, float, np.ndarray]]]],
    identifiers: List[str]
):
    """Print the size of ``data`` and the content of the selected samples.

    The layout is detected from the first identifier: a sample with 4 items is
    treated as training data (texts, target, std, features), anything else as
    submission data (text, features).

    Args:
        data: mapping ``sample_id -> sample``.
        identifiers: IDs of the samples to print (must not be empty).
    """
    is_training_data = len(data[identifiers[0]]) == 4
    if not is_training_data:
        print(f'Number of samples for submission is {len(data)}.')
    else:
        print(f'Number of samples for training is {len(data)}.')
    print('')
    print(f'{len(identifiers)} random samples:')
    for sample_id in identifiers:
        sample = data[sample_id]
        print('')
        print(f'  Sample {sample_id}')
        if not is_training_data:
            print(' Text:')
            print(f'    {sample[0]}')
            print(' Features:')
            print(f'    {sample[1].tolist()[0]}')
            continue
        variants = sample[0]
        n_augmented = len(variants) - 1
        print('  Text:')
        print(f'    {variants[0]}')
        print(f'  Number of augmented texts is {n_augmented}.')
        # At most two augmented variants are shown.
        if n_augmented > 1:
            print('  2 augmented texts:')
            shown_variants = variants[1:3]
        elif n_augmented > 0:
            print('  Augmented text:')
            shown_variants = variants[1:2]
        else:
            shown_variants = []
        for augmented in shown_variants:
            print(f'    {augmented}')
        print('  Target:')
        print(f'    {sample[1]} +- {sample[2]}')
        print('  Features:')
        for feature_row in sample[3].tolist():
            print(f'    {feature_row}')

In [None]:
def print_info_about_tokenized_data(
    data: Union[Dict[str, Tuple[int, np.ndarray]],
                Dict[str, Tuple[List[int], float, float, np.ndarray]]],
    matrix: np.ndarray,
    identifiers: List[str]
):
    """Print the size of the tokenized ``data`` and the selected samples.

    For every identifier the token IDs of its first text and the first row of
    its feature matrix are printed. Both sample layouts are supported:
    ``(row_index, features)`` for submission data and
    ``(row_indices, target, std, features)`` for training data.

    Args:
        data: mapping ``sample_id -> sample``.
        matrix: token matrix indexed by the row indices stored in ``data``.
        identifiers: IDs of the samples to print (must not be empty).
    """
    for_training = (len(data[identifiers[0]]) == 4)
    if for_training:
        print(f'Number of tokenized samples for training is {len(data)}.')
    else:
        print(f'Number of tokenized samples for submission is {len(data)}.')
    print('')
    print(f'{len(identifiers)} random samples:')
    for cur_id in identifiers:
        sample = data[cur_id]
        print('')
        print(f'Sample {cur_id}')
        print('')
        # A 2-item sample stores a single row index, otherwise a list of them.
        sample_idx = sample[0] if len(sample) == 2 else sample[0][0]
        print(matrix[sample_idx].tolist())
        print('')
        print(sample[-1][0].tolist())
        print('')

In [None]:
def build_feature_extractor(bert_name: str, max_seq_len: int, feature_vector_size: int,
                            batch_size: int) -> Tuple[tf.keras.Model, int]:
    """Build the shared text encoder used by both branches of the twin network.

    The model takes three inputs (token IDs, attention mask, hand-crafted
    features) and produces one embedding vector per text.

    Args:
        bert_name: name or path given to ``TFAutoModel.from_pretrained`` and
            ``AutoConfig.from_pretrained``.
        max_seq_len: length of the token-ID and attention-mask inputs.
        feature_vector_size: length of the hand-crafted feature vector.
        batch_size: fixed batch size baked into all input layers.

    Returns:
        ``(model, embedding_size)`` where ``embedding_size`` is the number of
        units of the final dense layer (256).
    """
    transformer_model = TFAutoModel.from_pretrained(
        pretrained_model_name_or_path=bert_name,
        name='BaseTransformer'
    )
    united_embedding_size = 256
    transformer_config = AutoConfig.from_pretrained(bert_name)
    # Dense projection applied to [pooled transformer output ; extra features].
    united_emb_layer = tf.keras.layers.Dense(
        units=united_embedding_size, input_dim=transformer_config.hidden_size,
        activation='elu',
        kernel_initializer=tf.keras.initializers.HeNormal(seed=42),
        bias_initializer='zeros',
        name='UnitedEmbeddingLayer'
    )
    print('Transformer Configuration')
    print('=========================')
    print(transformer_config)
    tokens_input = tf.keras.layers.Input(shape=(max_seq_len,), batch_size=batch_size,
                                         dtype=tf.int32, name='word_ids_base')
    attention_input = tf.keras.layers.Input(shape=(max_seq_len,), batch_size=batch_size,
                                            dtype=tf.int32, name='attention_mask_base')
    features_input = tf.keras.layers.Input(shape=(feature_vector_size,), dtype=tf.float32,
                                           batch_size=batch_size, name='features_base')
    # First output of the transformer; presumably the per-token hidden states of
    # shape (batch, max_seq_len, hidden_size) - TODO confirm for the chosen model.
    sequence_output = transformer_model([tokens_input, attention_input])[0]
    # Attention mask expanded along a new last axis of size hidden_size.
    output_mask = MaskCalculator(
        output_dim=transformer_config.hidden_size, trainable=False,
        name='OutMaskCalculator'
    )(attention_input)
    # Element-wise product of the expanded mask and the transformer output.
    masked_output = tf.keras.layers.Multiply(
        name='OutMaskMultiplicator'
    )([output_mask, sequence_output])
    # Masking layer with default arguments, followed by average pooling over
    # the sequence axis.
    masked_output = tf.keras.layers.Masking(name='OutMasking')(masked_output)
    final_output = tf.keras.layers.GlobalAvgPool1D(name='AvePool')(masked_output)
    final_output = tf.keras.layers.LayerNormalization(
        name='LayerNorm1'
    )(final_output)
    # Append the hand-crafted features to the pooled text representation.
    final_output = tf.keras.layers.Concatenate(
        name='Concat'
    )([final_output, features_input])
    final_output = tf.keras.layers.Dropout(
        rate=0.3, seed=42, name='EmbeddingDropout' 
    )(final_output)
    final_output = united_emb_layer(final_output)
    final_output = tf.keras.layers.LayerNormalization(
        name='LayerNorm2'
    )(final_output) 
    fe_model = tf.keras.Model(
        inputs=[tokens_input, attention_input, features_input],
        outputs=final_output,
        name='FeatureExtractionModel'
    )
    fe_model.build(input_shape=[(batch_size, max_seq_len),
                                (batch_size, max_seq_len),
                                (batch_size, feature_vector_size)])
    return fe_model, united_embedding_size

In [None]:
def build_twin_regressor(feature_vector_size: int, batch_size: int) -> tf.keras.Model:
    """Build the regression head that compares two embedding vectors.

    The two inputs are concatenated, passed through dropout (rate 0.3) and
    mapped to a single linear output.

    Args:
        feature_vector_size: length of each of the two input vectors.
        batch_size: fixed batch size baked into the input layers.

    Returns:
        Keras model named ``TwinRegressionModel`` with inputs
        ``[features_left, features_right]`` and one output unit.
    """
    vector_shape = (feature_vector_size,)
    input_for_left = tf.keras.layers.Input(
        shape=vector_shape, dtype=tf.float32, batch_size=batch_size,
        name='features_left'
    )
    input_for_right = tf.keras.layers.Input(
        shape=vector_shape, dtype=tf.float32, batch_size=batch_size,
        name='features_right'
    )
    hidden = tf.keras.layers.Concatenate(name='ConcatFeatures')(
        [input_for_left, input_for_right]
    )
    hidden = tf.keras.layers.Dropout(rate=0.3, name='RegressionDropout')(hidden)
    # Linear layer without activation: the output is an unbounded real number.
    output_layer = tf.keras.layers.Dense(
        units=1, input_dim=feature_vector_size * 2, activation=None,
        kernel_initializer=tf.keras.initializers.GlorotNormal(seed=42),
        bias_initializer='zeros',
        name='RegressionLayer'
    )
    prediction = output_layer(hidden)
    model = tf.keras.Model(
        inputs=[input_for_left, input_for_right],
        outputs=prediction,
        name='TwinRegressionModel'
    )
    model.build(input_shape=[(batch_size, feature_vector_size),
                             (batch_size, feature_vector_size)])
    return model

In [None]:
def build_neural_network(bert_name: str, max_seq_len: int, feature_vector_size: int,
                         batch_size: int) -> Tuple[tf.keras.Model, tf.keras.Model,
                                                   tf.keras.Model]:
    """Assemble and compile the siamese model.

    The same feature extractor is applied to the left and to the right text;
    the two resulting embeddings are fed to the twin regressor. The model is
    compiled with the mean squared error loss and a Lookahead-wrapped
    RectifiedAdam optimizer.

    Args:
        bert_name: name or path of the pretrained transformer.
        max_seq_len: length of the token-ID and attention-mask inputs.
        feature_vector_size: length of the hand-crafted feature vector.
        batch_size: fixed batch size baked into all input layers.

    Returns:
        ``(siamese_model, feature_extractor, twin_regressor)``.
    """
    extractor, embedding_size = build_feature_extractor(
        bert_name, max_seq_len, feature_vector_size, batch_size
    )
    # Settings shared by the integer (token / mask) and float (feature) inputs.
    sequence_spec = dict(shape=(max_seq_len,), batch_size=batch_size, dtype=tf.int32)
    vector_spec = dict(shape=(feature_vector_size,), dtype=tf.float32,
                       batch_size=batch_size)
    inputs_of_left = [
        tf.keras.layers.Input(name='word_ids', **sequence_spec),
        tf.keras.layers.Input(name='attention_mask', **sequence_spec),
        tf.keras.layers.Input(name='features', **vector_spec)
    ]
    inputs_of_right = [
        tf.keras.layers.Input(name='right_word_ids', **sequence_spec),
        tf.keras.layers.Input(name='right_attention_mask', **sequence_spec),
        tf.keras.layers.Input(name='right_features', **vector_spec)
    ]
    # The extractor is shared: both branches use the same weights.
    embedding_of_left = extractor(inputs_of_left)
    embedding_of_right = extractor(inputs_of_right)
    regressor = build_twin_regressor(embedding_size, batch_size)
    predicted_difference = regressor([embedding_of_left, embedding_of_right])
    model = tf.keras.Model(
        inputs=inputs_of_left + inputs_of_right,
        outputs=predicted_difference,
        name='SiameseModel'
    )
    inner_optimizer = tfa.optimizers.RectifiedAdam(learning_rate=1e-6)
    optimizer = tfa.optimizers.Lookahead(inner_optimizer, sync_period=6,
                                         slow_step_size=0.5)
    model.compile(optimizer=optimizer, loss=tf.keras.losses.MeanSquaredError())
    return model, extractor, regressor

In [None]:
def show_minibatch(X: List[np.ndarray], y: np.ndarray):
    """Print the six input arrays and the target array of one mini-batch.

    Every array is printed row by row under its own heading (``X1`` ... ``X6``
    and ``y``); each heading is preceded by an empty line.

    Args:
        X: list of exactly six arrays.
        y: array of targets.
    """
    assert len(X) == 6
    for position, block in enumerate(X, start=1):
        print('')
        print(f'X{position}')
        for it in block.tolist():
            print(it)
    print('')
    print('y')
    for it in y.tolist():
        print(it)

In [None]:
def show_tsne(fe: tf.keras.Model, batch_size: int,
              data: Dict[str, Tuple[List[int], float, float, np.ndarray]],
              feature_scaler: Pipeline,
              token_matrix: np.ndarray,
              identifiers: List[str], pad_id: int, title: str, figure_id: int):
    """Plot a 2-D t-SNE projection of the embeddings of the selected texts.

    Only the first text variant of every sample is used. The point colour
    encodes ``data[id][1]`` (target) and the point area grows with
    ``data[id][2]`` (standard error).

    Args:
        fe: feature extraction model called with
            ``[tokens, attention mask, scaled features]``.
        batch_size: batch size expected by ``fe``; the inputs are padded with
            copies of the last sample up to a multiple of it.
        data: mapping ``sample_id -> (row_indices, target, std, features)``.
        feature_scaler: fitted transformer applied to the feature rows.
        token_matrix: matrix of token IDs indexed by ``row_indices``.
        identifiers: IDs of the samples to show.
        pad_id: ID of the padding token (used for the attention mask).
        title: text appended to the figure title.
        figure_id: Matplotlib figure number.
    """
    indices = [data[it][0][0] for it in identifiers]
    colors = np.array([data[it][1] for it in identifiers], dtype=np.float64)
    area = np.array([data[it][2] for it in identifiers], dtype=np.float64)
    # Scale marker areas to (0, 100]; skip the division if every value is zero.
    max_area = np.max(area)
    if max_area > 0.0:
        area /= max_area
    area *= 10.0
    area = np.power(area, 2)
    texts = token_matrix[indices]
    src_features = np.vstack([data[it][3][0:1] for it in identifiers])
    assert src_features.shape[0] == texts.shape[0]
    ndiff = texts.shape[0] % batch_size
    if ndiff > 0:
        # Pad with copies of the last sample up to a multiple of batch_size.
        n_pad = batch_size - ndiff
        texts = np.vstack([texts, np.repeat(texts[-1:], n_pad, axis=0)])
        src_features = np.vstack(
            [src_features, np.repeat(src_features[-1:], n_pad, axis=0)]
        )
    attentions = generate_attention_mask(texts, pad_id)
    assert texts.shape[0] % batch_size == 0, f'{texts.shape[0] % batch_size}'
    features = fe.predict(
        [texts, attentions, feature_scaler.transform(src_features)],
        batch_size=batch_size
    )
    # Drop the embeddings of the padding samples.
    features = features[:len(indices)]
    projected_features = TSNE(n_components=2, n_jobs=-1).fit_transform(features)
    plt.figure(figure_id, figsize=(11, 11))
    # The colormap is given by name: plt.cm.get_cmap() is deprecated since
    # Matplotlib 3.7 and removed in 3.9.
    plt.scatter(x=projected_features[:, 0], y=projected_features[:, 1],
                marker='o', cmap='jet', s=area,
                c=colors, norm=Normalize(vmin=np.min(colors), vmax=np.max(colors)))
    plt.title('t-SNE projections of texts ' + title)
    plt.colorbar()
    plt.show()

In [None]:
def show_training_process(history: tf.keras.callbacks.History, metric_name: str,
                          figure_id: int):
    """Plot the per-epoch curve of a metric recorded during training.

    The validation curve (``val_<metric_name>``) is added when it is present
    in the history.

    Args:
        history: object returned by ``Model.fit``.
        metric_name: key of ``history.history`` to plot.
        figure_id: Matplotlib figure number.

    Raises:
        ValueError: ``metric_name`` is not recorded in the history.
    """
    recorded = history.history
    possible_metrics = list(recorded.keys())
    if metric_name not in recorded:
        err_msg = f'The metric "{metric_name}" is not found!'
        err_msg += f' Available metrics are: {possible_metrics}.'
        raise ValueError(err_msg)
    val_metric_name = 'val_' + metric_name
    fig = plt.figure(figure_id, figsize=(7, 7))
    train_curve = recorded[metric_name]
    train_epochs = list(range(len(train_curve)))
    plt.plot(train_epochs, train_curve, label='Training {0}'.format(metric_name))
    if val_metric_name in recorded:
        val_curve = recorded[val_metric_name]
        # Both curves must cover the same number of epochs.
        assert len(train_curve) == len(val_curve)
        val_epochs = list(range(len(val_curve)))
        plt.plot(val_epochs, val_curve, label='Validation {0}'.format(metric_name))
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.title('Training process')
    plt.legend(loc='best')
    plt.show()

In [None]:
def generate_new_trainset(fe: tf.keras.Model, feature_scaler: Pipeline,
                          batch_size: int,
                          data: Dict[str, Tuple[List[int], float, float]],
                          token_matrix: np.ndarray, pad_id: int,
                          identifiers: List[str]) -> Tuple[np.ndarray, np.ndarray]:
    """Compute embeddings and collect targets for the selected samples.

    Only the first text variant of every sample is used. The inputs are padded
    with copies of the last sample up to a multiple of ``batch_size`` and the
    padding rows are removed from the result.

    Args:
        fe: feature extraction model called with
            ``[tokens, attention mask, scaled features]``.
        feature_scaler: fitted transformer applied to the feature rows.
        batch_size: batch size expected by ``fe``.
        data: mapping ``sample_id -> sample``; items 0, 1 and 3 of a sample are
            read (row indices, target, feature matrix).
        token_matrix: matrix of token IDs indexed by the row indices.
        pad_id: ID of the padding token (used for the attention mask).
        identifiers: IDs of the samples to process.

    Returns:
        ``(embeddings, targets)`` with one row / one value per identifier;
        ``targets`` is a ``float64`` vector.
    """
    first_rows = [data[sample_id][0][0] for sample_id in identifiers]
    targets = np.array([data[sample_id][1] for sample_id in identifiers],
                       dtype=np.float64)
    src_features = np.vstack([data[sample_id][3][0:1] for sample_id in identifiers])
    texts = token_matrix[first_rows]
    assert texts.shape[0] == src_features.shape[0]
    remainder = texts.shape[0] % batch_size
    if remainder > 0:
        # Complete the last batch with copies of the last sample.
        n_missing = batch_size - remainder
        texts = np.vstack([texts, np.repeat(texts[-1:], n_missing, axis=0)])
        src_features = np.vstack(
            [src_features, np.repeat(src_features[-1:], n_missing, axis=0)]
        )
    attentions = generate_attention_mask(texts, pad_id)
    assert texts.shape[0] % batch_size == 0, f'{texts.shape[0] % batch_size}'
    scaled_features = feature_scaler.transform(src_features)
    target_features = fe.predict(
        [texts, attentions, scaled_features],
        batch_size=batch_size
    )
    assert target_features.shape[1] > 1
    return target_features[:len(identifiers)], targets

In [None]:
def calculate_dist_matrix(y: np.ndarray) -> np.ndarray:
    """Build the matrix of absolute pairwise differences between values of `y`.

    Element ``[i, j]`` of the result equals ``|y[i] - y[j]|``, so the matrix
    is symmetric with a zero diagonal.

    :param y: one-dimensional array with at least two values.
    :return: float32 array of shape ``(len(y), len(y))``.
    :raises AssertionError: if `y` is not one-dimensional or has fewer than
        two items.
    """
    assert len(y.shape) == 1
    assert y.shape[0] > 1
    values = np.asarray(y, dtype=np.float64)
    # Broadcasting a column against a row yields all pairwise differences at
    # once; this replaces n * n Python-level iterations and the roundabout
    # sqrt(diff * diff) form of the absolute value.
    differences = values[:, np.newaxis] - values[np.newaxis, :]
    return np.abs(differences).astype(np.float32)

In [None]:
def select_train_samples(y: np.ndarray, dist_matrix: np.ndarray,
                         n: int) -> List[int]:
    """Randomly pick sample indices, favouring ones far from those already picked.

    The first index is drawn uniformly. Every following index is drawn from
    the not-yet-selected ones with probability proportional to its mean
    distance (taken from `dist_matrix`) to the indices selected so far. The
    draws use NumPy's global random state (``np.random.choice``).

    :param y: one-dimensional array; only its length is used.
    :param dist_matrix: square matrix of pairwise distances with
        ``len(y)`` rows and columns.
    :param n: requested number of indices; must be less than ``len(y)``.
        At least one index is always returned, even for ``n < 1``.
    :return: sorted list of the selected indices as plain Python ints.
    :raises AssertionError: if the shapes are inconsistent or
        ``n >= len(y)``.
    """
    assert len(y.shape) == 1
    assert len(dist_matrix.shape) == 2
    assert dist_matrix.shape[0] == y.shape[0]
    assert dist_matrix.shape[1] == dist_matrix.shape[0]
    assert n < y.shape[0]
    remaining = list(range(y.shape[0]))
    selected = {np.random.choice(remaining)}
    for _ in range(n - 1):
        remaining = sorted(set(remaining) - selected)
        # The selected set does not change while the weights are computed,
        # so its list form is built once per iteration.
        chosen = list(selected)
        weights = np.array(
            [dist_matrix[idx, chosen].mean() for idx in remaining],
            dtype=np.float64
        )
        total = weights.sum()
        if np.isfinite(total) and (total > 0.0):
            weights /= total
            selected.add(np.random.choice(remaining, p=weights))
        else:
            # All candidates are at zero distance from the selected ones
            # (e.g. duplicated targets): normalising would give NaN
            # probabilities and np.random.choice would raise, so fall back
            # to a uniform draw.
            selected.add(np.random.choice(remaining))
    return sorted(int(idx) for idx in selected)

In [None]:
def do_predictions(fe: tf.keras.Model, regressor: tf.keras.Model,
                   feature_scaler: Pipeline, batch_size: int,
                   data_for_anchors: Tuple[np.ndarray, np.ndarray],
                   dist_matrix: np.ndarray,
                   data: Union[Dict[str, int],
                               Dict[str, Tuple[List[int], float, float]]],
                   token_matrix: np.ndarray, pad_id: int,
                   identifiers: List[str]=None) -> Dict[str, Tuple[float, float]]:
    """Predict a target (mean and spread) for every requested sample.

    Each sample is passed through `fe`, then compared by `regressor` with a
    randomly drawn set of anchors (see ``select_train_samples``). For every
    (anchor, sample) pair the regressor output is subtracted from the anchor
    target; the mean and standard deviation of these estimates form the
    prediction.

    Records in `data` are read in one of two layouts: a record of length 2
    is used as ``(token row index, features)``, any other record as
    ``record[0][0]`` (token row index) and ``record[3][0:1]`` (features).

    :param fe: feature extractor; ``predict`` receives
        ``[tokens, attention mask, scaled features]``.
    :param regressor: model whose ``predict`` receives
        ``[anchor features, sample features]``.
    :param feature_scaler: fitted object whose ``transform`` is applied to
        the gathered feature rows.
    :param batch_size: batch size for both ``predict`` calls.
    :param data_for_anchors: ``(anchor features, anchor targets)``.
    :param dist_matrix: pairwise distances between anchor targets.
    :param data: per-ID records (see the layouts above).
    :param token_matrix: array indexed by the stored token row indices.
    :param pad_id: padding token ID forwarded to ``generate_attention_mask``.
    :param identifiers: IDs to predict; all keys of `data` when None.
        They are processed in sorted order.
    :return: mapping from ID to ``(mean, standard deviation)`` of the
        per-anchor estimates.
    """
    ordered_ids = sorted(data.keys()) if identifiers is None else sorted(identifiers)

    # Gather the token row index and the raw feature row of every sample.
    row_indices = []
    feature_rows = []
    for cur_id in ordered_ids:
        record = data[cur_id]
        if len(record) == 2:
            row_indices.append(record[0])
            feature_rows.append(record[1])
        else:
            row_indices.append(record[0][0])
            feature_rows.append(record[3][0:1])
    token_rows = token_matrix[row_indices]
    raw_features = np.vstack(feature_rows)
    assert token_rows.shape[0] == raw_features.shape[0]

    # Pad with copies of the last sample so that the number of rows becomes
    # a multiple of the batch size.
    remainder = token_rows.shape[0] % batch_size
    if remainder > 0:
        n_padding = batch_size - remainder
        token_rows = np.vstack(
            [token_rows, np.repeat(token_rows[-1:], n_padding, axis=0)]
        )
        raw_features = np.vstack(
            [raw_features, np.repeat(raw_features[-1:], n_padding, axis=0)]
        )

    attention_mask = generate_attention_mask(token_rows, pad_id)
    assert token_rows.shape[0] % batch_size == 0, f'{token_rows.shape[0] % batch_size}'
    sample_features = fe.predict(
        [token_rows, attention_mask, feature_scaler.transform(raw_features)],
        batch_size=batch_size
    )
    assert sample_features.shape[1] > 1
    assert sample_features.shape[1] == data_for_anchors[0].shape[1]
    assert sample_features.shape[0] >= len(row_indices)
    # Drop the rows that were produced for the padding samples.
    sample_features = sample_features[0:len(row_indices)]

    # Number of anchors per sample: a multiple of the batch size, at least 8.
    n_selected = batch_size
    while n_selected < 8:
        n_selected += batch_size

    anchor_features = data_for_anchors[0]
    anchor_targets = data_for_anchors[1]
    anchor_input_parts = []
    anchor_target_parts = []
    for _ in ordered_ids:
        # An independent random anchor set is drawn for every sample.
        anchor_rows = select_train_samples(
            y=anchor_targets,
            dist_matrix=dist_matrix,
            n=n_selected
        )
        anchor_input_parts.append(anchor_features[anchor_rows])
        anchor_target_parts.append(anchor_targets[anchor_rows])
    selected_inputs = np.vstack(anchor_input_parts)
    # Every sample's feature vector is repeated once per anchor, so that row k
    # of `selected_inputs` is paired with the sample that anchor was drawn for.
    predicted_features = np.repeat(sample_features, n_selected, axis=0)
    selected_targets = np.concatenate(anchor_target_parts)

    prediction_diff = regressor.predict(
        [selected_inputs, predicted_features],
        batch_size=batch_size
    ).reshape(selected_targets.shape)

    predictions = dict()
    for sample_idx, cur_id in enumerate(ordered_ids):
        window = slice(sample_idx * n_selected, (sample_idx + 1) * n_selected)
        estimates = selected_targets[window] - prediction_diff[window]
        predictions[cur_id] = (np.mean(estimates), np.std(estimates))
    return predictions

In [None]:
# Seed the Python, NumPy and TensorFlow random number generators.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# MAX_TEXT_LEN is passed below as `max_seq_len`, PRETRAINED_BERT as the
# tokenizer/model location, MINIBATCH_SIZE as the batch size everywhere.
MAX_TEXT_LEN = 128
PRETRAINED_BERT = '/kaggle/input/tf-distilroberta-base'
MINIBATCH_SIZE = 32

In [None]:
# Input and output directories; the prints show whether each one exists.
DATA_DIR = '/kaggle/input/commonlitreadabilityprize'
MODEL_DIR = '/kaggle/working'
print(f'{DATA_DIR} {os.path.isdir(DATA_DIR)}')
print(f'{MODEL_DIR} {os.path.isdir(MODEL_DIR)}')

In [None]:
trainset_name = os.path.join(DATA_DIR, 'train.csv')
print(f'{trainset_name} {os.path.isfile(trainset_name)}')

In [None]:
testset_name = os.path.join(DATA_DIR, 'test.csv')
print(f'{testset_name} {os.path.isfile(testset_name)}')

In [None]:
# The submission file is written at the end of the notebook, so this check
# only reports whether a file with that name already exists.
submission_name = os.path.join(MODEL_DIR, 'submission.csv')
print(f'{submission_name} {os.path.isfile(submission_name)}')

In [None]:
# Output paths for the model weights and the pickled feature scaler;
# `figure_identifier` is incremented after every figure drawn below.
fe_model_name = os.path.join(MODEL_DIR, 'fe_nn.h5')
regression_model_name = os.path.join(MODEL_DIR, 'regression_nn.h5')
scaler_name = os.path.join(MODEL_DIR, 'feature_scaler.pkl')
figure_identifier = 1

In [None]:
# Load the tokenizer stored in the pretrained model directory.
pretrained_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_BERT)
print(f'Vocabulary size is {pretrained_tokenizer.vocab_size}.')

In [None]:
# Load the training set; the assert is a sanity check that more than
# 100 records were read.
data_for_training = load_data_for_training(trainset_name,
                                           pretrained_tokenizer)
assert len(data_for_training) > 100

In [None]:
# Sorting makes the ID order independent of the dictionary order; three
# randomly drawn IDs are used for the previews printed below.
all_IDs = sorted(list(data_for_training.keys()))
selected_IDs_for_training = random.sample(
    population=all_IDs,
    k=3
)
print_info_about_data(data_for_training, selected_IDs_for_training)

In [None]:
# Tokenize the texts; the result is a per-ID record dictionary and a token
# matrix, both consumed by the generators and prediction functions below.
labels_for_training, tokens_for_training = tokenize_data(
    data=data_for_training,
    tokenizer=pretrained_tokenizer,
    max_seq_len=MAX_TEXT_LEN
)
print_info_about_tokenized_data(
    data=labels_for_training,
    matrix=tokens_for_training,
    identifiers=selected_IDs_for_training
)

In [None]:
# Fit the feature scaler and pickle it next to the model files.
text_feature_scaler = train_feature_scaler(labels_for_training) 
with open(scaler_name, 'wb') as scaler_fp:
    pickle.dump(text_feature_scaler, scaler_fp)

In [None]:
# Shuffle the IDs in place and split them: 90% for training, 5% for
# validation, and the remainder for the final test.
random.shuffle(all_IDs)
n_train_size = int(round(len(all_IDs) * 0.9))
n_val_size = int(round(len(all_IDs) * 0.05))
IDs_for_training = all_IDs[:n_train_size]
IDs_for_validation = all_IDs[n_train_size:(n_train_size + n_val_size)]
IDs_for_final_testing = all_IDs[(n_train_size + n_val_size):]

In [None]:
# Validation generator: augmentation is switched off and no
# `batches_per_epoch` is given, so the class default (None) applies.
datagen_for_validation = DatasetGen(
    data=labels_for_training,
    data_IDs=IDs_for_validation,
    token_indices=tokens_for_training,
    pad_token_id=pretrained_tokenizer.pad_token_id,
    batch_size=MINIBATCH_SIZE,
    apply_augmentation=False,
    feature_scaler=text_feature_scaler
)
n_batches_per_validset = len(datagen_for_validation)
print(f'Mini-batches per validation set is {n_batches_per_validset}.')

In [None]:
# Preview the first validation mini-batch.
X_, y_, _ = datagen_for_validation[0]
show_minibatch(X_, y_)

In [None]:
# Training generator: augmentation is switched on and an epoch is fixed to
# ten times the number of validation mini-batches.
n_batches_per_epoch = n_batches_per_validset * 10
datagen_for_training = DatasetGen(
    data=labels_for_training,
    data_IDs=IDs_for_training,
    token_indices=tokens_for_training,
    pad_token_id=pretrained_tokenizer.pad_token_id,
    batch_size=MINIBATCH_SIZE,
    batches_per_epoch=n_batches_per_epoch,
    apply_augmentation=True,
    feature_scaler=text_feature_scaler
)

In [None]:
# Preview the first training mini-batch.
X_, y_, _ = datagen_for_training[0] 
show_minibatch(X_, y_)

In [None]:
# Build the three models returned by `build_neural_network`: the one that is
# fitted, the feature extractor and the regressor used for prediction.
# The feature vector size is read from the fitted 'scaler' pipeline step.
model_for_training, fe_model, model_for_regression = build_neural_network(
    bert_name=PRETRAINED_BERT,
    max_seq_len=MAX_TEXT_LEN,
    feature_vector_size=text_feature_scaler.named_steps['scaler'].scale_.shape[0],
    batch_size=MINIBATCH_SIZE
)

In [None]:
model_for_training.summary()

In [None]:
model_for_regression.summary()

In [None]:
fe_model.summary()

In [None]:
# t-SNE picture of the held-out samples (validation + final test) before
# `model_for_training.fit` is called.
show_tsne(fe=fe_model, batch_size=MINIBATCH_SIZE,
          feature_scaler=text_feature_scaler,
          data=labels_for_training, token_matrix=tokens_for_training,
          identifiers=IDs_for_validation + IDs_for_final_testing,
          pad_id=pretrained_tokenizer.pad_token_id,
          title='before training', figure_id=figure_identifier)
figure_identifier += 1

In [None]:
# Anchors for the pre-training evaluation: feature-extractor outputs and
# targets of the training samples, plus the pairwise target distances.
anchor_data = generate_new_trainset(
    fe=fe_model, feature_scaler=text_feature_scaler,
    batch_size=MINIBATCH_SIZE,
    data=labels_for_training, token_matrix=tokens_for_training,
    pad_id=pretrained_tokenizer.pad_token_id,
    identifiers=IDs_for_training
)
anchor_distances = calculate_dist_matrix(anchor_data[1])

In [None]:
# Predict the validation samples and measure the wall-clock time per sample.
start_time = time.time()
predictions_for_validation = do_predictions(
    fe=fe_model, regressor=model_for_regression,
    feature_scaler=text_feature_scaler, batch_size=MINIBATCH_SIZE,
    data_for_anchors=anchor_data, dist_matrix=anchor_distances,
    data=labels_for_training, token_matrix=tokens_for_training,
    pad_id=pretrained_tokenizer.pad_token_id,
    identifiers=IDs_for_validation
)
predict_duration = (time.time() - start_time) / float(len(IDs_for_validation))

In [None]:
# RMSE between the predicted means and the stored targets (record item 1).
error = 0.0
for cur_id in IDs_for_validation:
    difference = predictions_for_validation[cur_id][0] - labels_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(IDs_for_validation))
error = np.sqrt(error)
print(f'RMSE on validation set before training = {error}')
print(f'Prediction duration per sample = {predict_duration} seconds.')

In [None]:
del predictions_for_validation, error

In [None]:
# The same measurement on the final test split.
start_time = time.time()
predictions_for_testing = do_predictions(
    fe=fe_model, regressor=model_for_regression,
    feature_scaler=text_feature_scaler, batch_size=MINIBATCH_SIZE,
    data_for_anchors=anchor_data, dist_matrix=anchor_distances,
    data=labels_for_training, token_matrix=tokens_for_training,
    pad_id=pretrained_tokenizer.pad_token_id,
    identifiers=IDs_for_final_testing
)
predict_duration = (time.time() - start_time) / float(len(IDs_for_final_testing))

In [None]:
error = 0.0
for cur_id in IDs_for_final_testing:
    difference = predictions_for_testing[cur_id][0] - labels_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(IDs_for_final_testing))
error = np.sqrt(error)
print(f'RMSE on test set before training = {error}')
print(f'Prediction duration per sample = {predict_duration} seconds.')

In [None]:
del predictions_for_testing, error

In [None]:
# These anchors were computed before fitting; they are rebuilt after
# training, so the current ones are released here.
del anchor_data, anchor_distances

In [None]:
# Stop when `val_loss` has not improved for 10 epochs (restoring the best
# weights), or when 1.9 hours (6840 seconds) of training have elapsed.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=10,
        verbose=True,
        restore_best_weights=True
    ),
    tfa.callbacks.TimeStopping(
        seconds=int(round(3600 * 1.9)),
        verbose=True
    )
]

In [None]:
# `epochs=1000` is only an upper bound; the callbacks above end the training.
history = model_for_training.fit(datagen_for_training,
                                 validation_data=datagen_for_validation,
                                 epochs=1000, callbacks=callbacks)

In [None]:
# Only the regressor and the feature extractor are saved, as weight files.
model_for_regression.save_weights(regression_model_name)
fe_model.save_weights(fe_model_name)

In [None]:
show_training_process(history, "loss", figure_identifier)
figure_identifier += 1

In [None]:
# t-SNE picture of the same held-out samples as before, now after training.
show_tsne(fe=fe_model, batch_size=MINIBATCH_SIZE,
          feature_scaler=text_feature_scaler,
          data=labels_for_training, token_matrix=tokens_for_training,
          identifiers=IDs_for_validation + IDs_for_final_testing,
          pad_id=pretrained_tokenizer.pad_token_id,
          title='after training', figure_id=figure_identifier)
figure_identifier += 1

In [None]:
# Rebuild the anchors with the trained feature extractor. These anchors are
# kept until the end of the notebook and reused for the submission.
anchor_data = generate_new_trainset(
    fe=fe_model, feature_scaler=text_feature_scaler,
    batch_size=MINIBATCH_SIZE,
    data=labels_for_training, token_matrix=tokens_for_training,
    pad_id=pretrained_tokenizer.pad_token_id,
    identifiers=IDs_for_training
)
anchor_distances = calculate_dist_matrix(anchor_data[1])

In [None]:
# Predict the validation samples and measure the wall-clock time per sample.
start_time = time.time()
predictions_for_validation = do_predictions(
    fe=fe_model, regressor=model_for_regression,
    feature_scaler=text_feature_scaler, batch_size=MINIBATCH_SIZE,
    data_for_anchors=anchor_data, dist_matrix=anchor_distances,
    data=labels_for_training, token_matrix=tokens_for_training,
    pad_id=pretrained_tokenizer.pad_token_id,
    identifiers=IDs_for_validation
)
predict_duration = (time.time() - start_time) / float(len(IDs_for_validation))

In [None]:
# RMSE between the predicted means and the stored targets (record item 1).
error = 0.0
for cur_id in IDs_for_validation:
    difference = predictions_for_validation[cur_id][0] - labels_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(IDs_for_validation))
error = np.sqrt(error)
print(f'RMSE on validation set after training = {error}')
print(f'Prediction duration per sample = {predict_duration} seconds.')

In [None]:
del predictions_for_validation, error

In [None]:
# The same measurement on the final test split; these predictions are
# inspected in the cells that follow.
start_time = time.time()
predictions_for_testing = do_predictions(
    fe=fe_model, regressor=model_for_regression,
    feature_scaler=text_feature_scaler, batch_size=MINIBATCH_SIZE,
    data_for_anchors=anchor_data, dist_matrix=anchor_distances,
    data=labels_for_training, token_matrix=tokens_for_training,
    pad_id=pretrained_tokenizer.pad_token_id,
    identifiers=IDs_for_final_testing
)
predict_duration = (time.time() - start_time) / float(len(IDs_for_final_testing))

In [None]:
error = 0.0
for cur_id in IDs_for_final_testing:
    difference = predictions_for_testing[cur_id][0] - labels_for_training[cur_id][1]
    error += (difference * difference)
error /= float(len(IDs_for_final_testing))
error = np.sqrt(error)
print(f'RMSE on test set after training = {error}')
print(f'Prediction duration per sample = {predict_duration} seconds.')

In [None]:
# (ID, predicted mean, predicted std) triples in ascending order of the
# predicted standard deviation; ties are broken by the mean and then the ID.
pred = sorted(
    [(cur_id, predictions_for_testing[cur_id][0], predictions_for_testing[cur_id][1])
     for cur_id in predictions_for_testing],
    key=lambda it: (it[2], it[1], it[0])
)

In [None]:
# The five predictions with the smallest standard deviation.
# NOTE(review): items 1 and 2 of a `data_for_training` record are printed as
# "true value +- error" and item [0][0] presumably is the source text -
# confirm against `load_data_for_training`.
print('Top-5 most certain predictions:')
print('')
for cur_id, pred_mean, pred_std in pred[0:5]:
    print('True:      {0:.6f} +- {1:.6f}'.format(data_for_training[cur_id][1],
                                                 data_for_training[cur_id][2]))
    print('Predicted: {0:.6f} +- {1:.6f}'.format(pred_mean, pred_std))
    print(data_for_training[cur_id][0][0])
    print('')

In [None]:
# The five predictions with the largest standard deviation.
print('Top-5 most uncertain predictions:')
print('')
for cur_id, pred_mean, pred_std in pred[-5:]:
    print('True:      {0:.6f} +- {1:.6f}'.format(data_for_training[cur_id][1],
                                                 data_for_training[cur_id][2]))
    print('Predicted: {0:.6f} +- {1:.6f}'.format(pred_mean, pred_std))
    print(data_for_training[cur_id][0][0])
    print('')

In [None]:
del predictions_for_testing, error, pred

In [None]:
# Release everything that the submission step does not use. `fe_model`,
# `model_for_regression`, the scaler, the tokenizer and the anchors are kept.
del datagen_for_training, datagen_for_validation
del labels_for_training, tokens_for_training
del data_for_training
del IDs_for_training, IDs_for_validation, IDs_for_final_testing
del model_for_training
gc.collect()

In [None]:
# Write the submission CSV: a header row, then one `id,target` row per test
# sample, where the target is the mean of the per-anchor estimates.
with codecs.open(submission_name, mode='w', encoding='utf-8') as fp:
    data_writer = csv.writer(fp, quotechar='"', delimiter=',')
    data_writer.writerow(['id', 'target'])
    # The test set is processed part by part.
    # NOTE(review): the third argument (MINIBATCH_SIZE * 8) presumably is the
    # number of samples per part - confirm against `load_data_for_testing`.
    for data_part in load_data_for_testing(testset_name, pretrained_tokenizer,
                                           MINIBATCH_SIZE * 8):
        labels_for_submission, tokens_for_submission = tokenize_data(
            data=data_part,
            tokenizer=pretrained_tokenizer,
            max_seq_len=MAX_TEXT_LEN
        )
        del data_part
        # No `identifiers` are passed, so every ID of the part is predicted.
        predictions_for_submission = do_predictions(
            fe=fe_model, regressor=model_for_regression,
            feature_scaler=text_feature_scaler, batch_size=MINIBATCH_SIZE,
            data_for_anchors=anchor_data, dist_matrix=anchor_distances,
            data=labels_for_submission, token_matrix=tokens_for_submission,
            pad_id=pretrained_tokenizer.pad_token_id
        )
        for cur_id in predictions_for_submission:
            predicted = predictions_for_submission[cur_id][0]
            data_writer.writerow([cur_id, f'{predicted}'])
        # Free the per-part objects before the next part is loaded.
        del predictions_for_submission
        del labels_for_submission, tokens_for_submission
        gc.collect()