In [None]:
import codecs
import copy
import csv
import gc
from itertools import chain
import os
import pickle
import random
import time
from typing import Dict, List, Tuple, Union
import warnings

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import nltk
from nltk.corpus import wordnet
import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
import tensorflow as tf
from tensorflow.python.framework import ops, tensor_util
from tensorflow.python.keras.utils import losses_utils, tf_utils
from tensorflow.python.ops import math_ops
from tensorflow.python.ops.losses import util as tf_losses_util
import tensorflow_addons as tfa
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

In [None]:
# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

In [None]:
# Copyright (C) 2020
# Author: Joseph Sefara
# URL: <https://github.com/dsfsi/textaugment/>

class Wordnet:
    """
    Text augmentation that replaces verbs and/or nouns with WordNet synonyms.
    Typical usage: ::
        >>> import nltk
        >>> nltk.download('punkt')
        >>> nltk.download('wordnet')
        >>> nltk.download('averaged_perceptron_tagger')
        >>> from textaugment import Wordnet
        >>> t = Wordnet(v=True,n=True,p=0.5)
        >>> t.augment('I love school')
        i adore school
    """

    def __init__(self, **kwargs):
        """
        A method to initialize parameters
        :type random_state: int
        :param random_state: seed passed to ``np.random.seed`` (global NumPy RNG)
        :type v: bool
        :param v: Replace verbs. Default is True when neither ``v`` nor ``n``
            is given, and False when only ``n`` is given.
        :type n: bool
        :param n: Replace nouns. Default is False.
        :type runs: int
        :param runs: Number of repetition on single text, default is 1
        :type p: float, optional
        :param p: The probability of success of an individual trial (0.0<p<=1.0), default is 0.5
        :raises TypeError: if ``random_state`` is not an int or ``p`` is not a float
        :raises ValueError: if ``p`` is outside of (0.0, 1.0]
        :rtype:   None
        :return:  Constructer do not return.
        """

        # Set random state (the attribute always exists, None means "not seeded")
        self.random_state = kwargs.get('random_state')
        if 'random_state' in kwargs:
            if not isinstance(self.random_state, int):
                raise TypeError("random_state must have type int")
            np.random.seed(self.random_state)

        # Explicitly passed flags are respected as given; only the missing
        # ones are filled with defaults (verbs are the default target).
        if 'v' in kwargs and 'n' in kwargs:
            use_verbs, use_nouns = kwargs['v'], kwargs['n']
        elif 'n' in kwargs:
            use_verbs, use_nouns = False, kwargs['n']
        else:
            use_verbs, use_nouns = kwargs.get('v', True), False

        p = kwargs.get('p', 0.5)
        if not isinstance(p, float):
            raise TypeError("p represent probability of success and " \
                            "must be a float from 0.1 to 0.9. E.g p=0.5")
        # Fail early: np.random.geometric rejects such values only at augmentation time.
        if not 0.0 < p <= 1.0:
            raise ValueError("p represent probability of success and " \
                             "must be greater than 0.0 and not greater than 1.0")

        self.p = p
        self.v = use_verbs
        self.n = use_nouns
        self.runs = kwargs.get('runs', 1)

    def geometric(self, data):
        """
        Randomly select items for which the first geometric trial succeeds.

        :type data: list
        :param data: Input data
        :rtype:   ndarray
        :return:  Items of ``data`` (converted to an array) whose sample drawn
            from the Geometric distribution with parameter ``p`` is equal to 1.
        """

        data = np.array(data)
        first_trial = np.random.geometric(p=self.p, size=data.shape[0]) == 1
        return data[first_trial]

    def _replace_by_pos(self, data, data_tokens, tag_prefix, wordnet_pos):
        """
        Replace randomly selected words of one part of speech, in place.

        :type data: list
        :param data: tokens of the sentence, modified in place
        :type data_tokens: list
        :param data_tokens: [position, word, POS tag] triples for ``data``
        :type tag_prefix: str
        :param tag_prefix: first letter of the POS tags to process ('V' or 'N')
        :param wordnet_pos: WordNet part-of-speech constant used for the lookup
        :rtype:   None
        """
        for _ in range(self.runs):
            candidates = [[i, x] for i, x, y in data_tokens if y.startswith(tag_prefix)]
            # np.array() turns the [position, word] pairs into strings,
            # that is why the position is converted back with int() below.
            for word in self.geometric(data=candidates):
                synsets = wordnet.synsets(word[1], wordnet_pos)
                # Multi-word lemmas (with underscores) are skipped. Sorting makes
                # the candidate order independent of string hash randomization,
                # so a fixed random_state gives the same result in every process.
                synonyms = sorted({
                    lemma for syn in synsets for lemma in syn.lemma_names()
                    if '_' not in lemma
                })
                if synonyms:
                    selected = self.geometric(data=synonyms).tolist()
                    if selected:  # There is a synonym
                        data[int(word[0])] = selected[0].lower()

    def replace(self, data):
        """
        The method to replace words with synonyms

        :type data: str
        :param data: sentence used for data augmentation
        :rtype:   str
        :return:  The augmented data (lower-cased, tokens joined by single spaces)
        """
        data = data.lower().split()
        data_tokens = [[i, x, y] for i, (x, y) in enumerate(nltk.pos_tag(data))]
        if self.v:
            self._replace_by_pos(data, data_tokens, 'V', wordnet.VERB)
        if self.n:
            self._replace_by_pos(data, data_tokens, 'N', wordnet.NOUN)
        return " ".join(data)

    def augment(self, data):
        """
        Data augmentation for text. Generate new dataset based on verb/nouns synonyms.

        :type data: str
        :param data: sentence used for data augmentation
        :raises TypeError: if ``data`` is not a string
        :rtype:   str
        :return:  The augmented data
        """
        # Error handling
        if not isinstance(data, str):
            raise TypeError("Only strings are supported")
        return self.replace(data)

In [None]:
class LossFunctionWrapper(tf.keras.losses.Loss):
    """Keras loss object that delegates the computation to a plain function.

    Args:
        fn: callable invoked as ``fn(y_true, y_pred, **kwargs)``.
        reduction: reduction mode passed to ``tf.keras.losses.Loss``.
        name: optional name of the loss.
        **kwargs: extra keyword arguments forwarded to ``fn`` on every call.
    """

    def __init__(self,
                 fn,
                 reduction=losses_utils.ReductionV2.AUTO,
                 name=None,
                 **kwargs):
        super(LossFunctionWrapper, self).__init__(reduction=reduction, name=name)
        self.fn = fn
        self._fn_kwargs = kwargs

    def call(self, y_true, y_pred):
        """Align the shapes of tensor arguments and apply the wrapped function."""
        if tensor_util.is_tensor(y_pred) and tensor_util.is_tensor(y_true):
            y_pred, y_true = tf_losses_util.squeeze_or_expand_dimensions(y_pred, y_true)
        return self.fn(y_true, y_pred, **self._fn_kwargs)

    def get_config(self):
        """Return the base loss config extended with the wrapped function's kwargs.

        Tensor or variable values are evaluated so that they are stored as
        plain values.
        """
        config = {}
        # dict.items() is used instead of six.iteritems(): ``six`` is not
        # imported in this file, so the former call raised NameError.
        for k, v in self._fn_kwargs.items():
            config[k] = tf.keras.backend.eval(v) if tf_utils.is_tensor_or_variable(v) \
                else v
        base_config = super(LossFunctionWrapper, self).get_config()
        return {**base_config, **config}

In [None]:
def distance_based_log_loss(y_true, y_pred):
    """Binary cross-entropy on a probability derived from a predicted value.

    ``y_pred`` is mapped to ``p = (1 + exp(-m)) / (1 + exp(y_pred - m))`` with
    margin ``m = 1``, which equals 1 at ``y_pred == 0`` and decreases as
    ``y_pred`` grows. The cross-entropy between ``y_true`` and ``p`` is
    computed with label smoothing of 0.05.
    """
    predicted = ops.convert_to_tensor(y_pred)
    target = math_ops.cast(y_true, predicted.dtype)
    margin = 1.0
    numerator = 1.0 + tf.math.exp(-margin)
    denominator = 1.0 + tf.math.exp(predicted - margin)
    return tf.keras.losses.binary_crossentropy(
        target,
        numerator / denominator,
        from_logits=False,
        label_smoothing=0.05
    )

In [None]:
class DBLLogLoss(LossFunctionWrapper):
    """Loss object wrapping ``distance_based_log_loss``."""

    def __init__(self, reduction=losses_utils.ReductionV2.AUTO,
                 name='distance_based_log_loss'):
        super().__init__(
            fn=distance_based_log_loss,
            reduction=reduction,
            name=name
        )

In [None]:
class MaskCalculator(tf.keras.layers.Layer):
    """Expand a 2D integer input into a 3D float mask.

    Every input element greater than 0 becomes 1.0 and every other element
    becomes 0.0; the result is then repeated ``output_dim`` times along a new
    last axis, so an input of shape ``(d0, d1)`` yields ``(d0, d1, output_dim)``.
    """

    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(MaskCalculator, self).__init__(**kwargs)

    def build(self, input_shape):
        super(MaskCalculator, self).build(input_shape)

    def call(self, inputs, **kwargs):
        # repeat() turns (d0, d1) into (d0, output_dim, d1); the permutation
        # moves the repeated axis to the end.
        return tf.keras.backend.permute_dimensions(
            x=tf.keras.backend.repeat(
                x=tf.keras.backend.cast(
                    x=tf.keras.backend.greater(
                        x=inputs,
                        y=0
                    ),
                    dtype='float32'
                ),
                n=self.output_dim
            ),
            pattern=(0, 2, 1)
        )

    def compute_output_shape(self, input_shape):
        # backend.repeat() in call() accepts 2D tensors only, so the input
        # shape has exactly two entries (the former check for 1 contradicted it).
        assert len(input_shape) == 2
        shape = list(input_shape)
        shape.append(self.output_dim)
        return tuple(shape)

    def get_config(self):
        """Return the layer config including ``output_dim``."""
        config = super(MaskCalculator, self).get_config()
        config['output_dim'] = self.output_dim
        return config

In [None]:
class DatasetGen(tf.keras.utils.Sequence):
    """Batch generator that yields pairs of samples.

    Each value of `data` is read here as a tuple of four items:
      [0] list of row indices into `token_indices` (item 0 of that list is
          the one used in the deterministic mode),
      [1] target value,
      [2] spread used as `scale` of the normal noise added to the target,
      [3] feature matrix whose rows are sliced with the same positions as [0].

    Two modes are selected by `batches_per_epoch`:
      * None - batches walk sequentially through the shuffled list of pairs;
      * int  - every sample of every batch is drawn at random.
    """

    def __init__(self, data: Dict[str, Tuple[List[int], float, float, np.ndarray]],
                 data_IDs: List[str],
                 feature_scaler: Pipeline,
                 output_scaler: StandardScaler,
                 token_indices: np.ndarray, pad_token_id: int,
                 median_distance_between_pairs: float,
                 batch_size: int, batches_per_epoch: Union[int, None] = None):
        # The data dictionary is deep-copied, the other arguments are stored as is.
        self.data = copy.deepcopy(data)
        self.token_indices = token_indices
        self.pad_token_id = pad_token_id
        self.batch_size = batch_size
        self.median_distance_between_pairs = median_distance_between_pairs
        self.batches_per_epoch = batches_per_epoch
        self.output_scaler = output_scaler
        self.feature_scaler = feature_scaler
        # Collect every unordered pair of distinct IDs exactly once.
        self.pairs = set()
        for key1 in data_IDs:
            for key2 in data_IDs:
                if key1 == key2:
                    continue
                if ((key1, key2) not in self.pairs) and ((key2, key1) not in self.pairs):
                    self.pairs.add((key1, key2))
        self.pairs = list(self.pairs)
        random.shuffle(self.pairs)
        self.n_samples = len(data_IDs)

    def __len__(self):
        # Without an explicit number of batches, an epoch covers at most
        # 5 * n_samples pairs (or all pairs if there are fewer of them).
        if self.batches_per_epoch is None:
            return int(np.ceil(min(len(self.pairs), 5 * self.n_samples) / float(self.batch_size)))
        return self.batches_per_epoch

    def __getitem__(self, idx):
        """Build batch number `idx`.

        Returns a tuple `(batch_x, batch_y, None)` where
          batch_x = [left token IDs, left attention mask, left scaled features,
                     right token IDs, right attention mask, right scaled features],
          batch_y = [closeness label, scaled left target, scaled right target].
        The closeness label is 1 when the absolute difference of the two
        targets is below `median_distance_between_pairs`, otherwise 0.
        """
        x_left = np.zeros(
            shape=(self.batch_size, self.token_indices.shape[1]),
            dtype=np.int32
        )
        left_features = []
        x_right = np.zeros(
            shape=(self.batch_size, self.token_indices.shape[1]),
            dtype=np.int32
        )
        right_features = []
        batch_y = [
            np.zeros(
                (self.batch_size, 1),
                dtype=np.int32
            ),
            np.zeros(
                (self.batch_size, 1),
                dtype=np.float32
            ),
            np.zeros(
                (self.batch_size, 1),
                dtype=np.float32
            )
        ]
        if self.batches_per_epoch is None:
            # Deterministic mode: consecutive pairs, item 0 of each index list
            # and the exact target values.
            batch_start = idx * self.batch_size
            batch_end = min(len(self.pairs), batch_start + self.batch_size)
            for sample_idx in range(batch_end - batch_start):
                left_key, right_key = self.pairs[sample_idx + batch_start]
                left_idx = self.data[left_key][0][0]
                left_features.append(self.data[left_key][3][0:1])
                left_target = self.data[left_key][1]
                right_idx = self.data[right_key][0][0]
                right_target = self.data[right_key][1]
                right_features.append(self.data[right_key][3][0:1])
                x_left[sample_idx] = self.token_indices[left_idx]
                x_right[sample_idx] = self.token_indices[right_idx]
                if abs(left_target - right_target) < self.median_distance_between_pairs:
                    batch_y[0][sample_idx, 0] = 1
                else:
                    batch_y[0][sample_idx, 0] = 0
                batch_y[1][sample_idx, 0] = left_target
                batch_y[2][sample_idx, 0] = right_target
            # An incomplete last batch is filled up by repeating its last sample,
            # so that every batch has exactly `batch_size` rows.
            n_pad = self.batch_size - (batch_end - batch_start)
            if n_pad > 0:
                for sample_idx in range(batch_end - batch_start, self.batch_size):
                    x_left[sample_idx] = x_left[sample_idx - 1]
                    x_right[sample_idx] = x_right[sample_idx - 1]
                    left_features.append(left_features[-1])
                    right_features.append(right_features[-1])
                    batch_y[0][sample_idx, 0] = batch_y[0][sample_idx - 1, 0]
                    batch_y[1][sample_idx, 0] = batch_y[1][sample_idx - 1, 0]
                    batch_y[2][sample_idx, 0] = batch_y[2][sample_idx - 1, 0]
        else:
            # Random mode: a random pair, a random item of each index list and
            # targets perturbed with normal noise.
            for sample_idx in range(self.batch_size):
                left_key, right_key = random.choice(self.pairs)
                # Item 0 gets the weight max(2, n - 1), all other items get 1.
                p = np.ones((len(self.data[left_key][0]),),
                            dtype=np.float64)
                p[0] = max(2.0, p.shape[0] - 1.0)
                p /= p.sum()
                left_idx_ = np.random.choice(list(range(len(self.data[left_key][0]))), p=p)
                left_idx = self.data[left_key][0][left_idx_]
                left_target = np.random.normal(
                    loc=self.data[left_key][1],
                    scale=self.data[left_key][2]
                )
                left_features.append(self.data[left_key][3][left_idx_:(left_idx_ + 1)])
                # The same sampling scheme is applied to the right element.
                p = np.ones((len(self.data[right_key][0]),),
                            dtype=np.float64)
                p[0] = max(2.0, p.shape[0] - 1.0)
                p /= p.sum()
                right_idx_ = np.random.choice(list(range(len(self.data[right_key][0]))), p=p)
                right_idx = self.data[right_key][0][right_idx_]
                right_target = np.random.normal(
                    loc=self.data[right_key][1],
                    scale=self.data[right_key][2]
                )
                right_features.append(self.data[right_key][3][right_idx_:(right_idx_ + 1)])
                x_left[sample_idx] = self.token_indices[left_idx]
                x_right[sample_idx] = self.token_indices[right_idx]
                if abs(left_target - right_target) < self.median_distance_between_pairs:
                    batch_y[0][sample_idx, 0] = 1
                else:
                    batch_y[0][sample_idx, 0] = 0
                batch_y[1][sample_idx, 0] = left_target
                batch_y[2][sample_idx, 0] = right_target
        batch_x = [
            x_left,
            generate_attention_mask(x_left, self.pad_token_id),
            self.feature_scaler.transform(np.vstack(left_features)),
            x_right,
            generate_attention_mask(x_right, self.pad_token_id),
            self.feature_scaler.transform(np.vstack(right_features))
        ]
        del x_left, x_right
        # Targets are scaled only after the closeness label has been computed
        # from the unscaled values.
        batch_y[1] = self.output_scaler.transform(batch_y[1])
        batch_y[2] = self.output_scaler.transform(batch_y[2])
        # NOTE(review): the trailing None is presumably the "sample weights"
        # slot of a Keras (x, y, sample_weight) batch - confirm against the caller.
        return batch_x, batch_y, None

In [None]:
def generate_attention_mask(token_indices: np.ndarray, padding_id: int) -> np.ndarray:
    """Build an attention mask for a matrix of token IDs.

    Args:
        token_indices: 2D array of token IDs, one row per sample.
        padding_id: ID of the padding token.

    Returns:
        An int32 array of the same shape. In every row the positions before
        the first occurrence of `padding_id` are 1, and all positions starting
        from that occurrence are 0.
    """
    is_not_padding = (token_indices != padding_id).astype(np.int32)
    # The running product along a row stays 1 until the first padding token
    # and is 0 from there on, which replaces the per-element Python loop.
    return np.cumprod(is_not_padding, axis=1, dtype=np.int32)

In [None]:
def calc_text_features(texts: List[List[str]], tok: AutoTokenizer) -> np.ndarray:
    """Calculate nine surface features for every text.

    Args:
        texts: list of texts, each text is a list of sentences.
        tok: tokenizer whose `tokenize` method splits a word into sub-tokens.

    Returns:
        A float32 matrix with one row per text and the following columns:
          0 - number of sentences;
          1 - mean number of tokens per sentence;
          2 - mean number of alphabetic words per sentence;
          3 - number of characters in the space-joined text;
          4 - number of alphabetic words;
          5 - mean length of an alphabetic word;
          6 - total number of sub-tokens of the alphabetic words;
          7 - mean length of a sub-token;
          8 - mean number of sub-tokens per alphabetic word.
        A ratio whose denominator is zero is left as 0 instead of NaN/inf,
        so degenerate texts do not poison the feature matrix.
    """
    f = np.zeros((len(texts), 9), dtype=np.float32)
    for idx, sentences in enumerate(texts):
        f[idx, 0] = len(sentences)
        words = []
        pure_words = []
        for cur_sent in sentences:
            words_in_sentence = nltk.word_tokenize(cur_sent)
            words += words_in_sentence
            pure_words += [it for it in words_in_sentence if it.isalpha()]
        f[idx, 3] = len(' '.join(sentences))
        f[idx, 4] = len(pure_words)
        if f[idx, 0] > 0:
            f[idx, 1] = len(words) / f[idx, 0]
            f[idx, 2] = len(pure_words) / f[idx, 0]
        if len(pure_words) == 0:
            # All word-based statistics keep their zero values.
            continue
        f[idx, 5] = np.mean([len(w) for w in pure_words])
        for w in pure_words:
            syllables = tok.tokenize(w.lower())
            f[idx, 6] += len(syllables)
            f[idx, 7] += sum(map(len, syllables))
        if f[idx, 6] > 0:
            f[idx, 7] /= f[idx, 6]
        f[idx, 8] = f[idx, 6] / f[idx, 4]
    return f

In [None]:
def load_data_for_training(
    fname: str,
    tok: AutoTokenizer
) -> Dict[str, Tuple[List[str], float, float, np.ndarray]]:
    """Load labeled texts from a CSV file and augment them with synonyms.

    The first non-empty row is the header; it must contain the fields
    "excerpt", "id", "target" and "standard_error" (in any order).

    Args:
        fname: path to the UTF-8 encoded CSV file.
        tok: tokenizer passed to `calc_text_features`.

    Returns:
        A dictionary that maps a sample ID to the tuple
        (texts, target, standard error, features), where `texts` holds
        the lower-cased source text followed by up to three unique augmented
        variants, and `features` has one row per item of `texts`.
        Samples without a positive standard error are skipped with a warning.

    Raises:
        ValueError: a header field is missing, a sample ID is duplicated or
            has surrounding whitespace, a text is empty or duplicated, or
            a target is not a number.
    """
    required_fields = ('excerpt', 'id', 'target', 'standard_error')
    number_of_augmentations = 3
    column_indices = None
    line_idx = 1
    data = dict()
    set_of_texts = set()
    t = Wordnet(v=True, n=True, p=0.5)
    with codecs.open(fname, mode='r', encoding='utf-8') as fp:
        data_reader = csv.reader(fp, quotechar='"', delimiter=',')
        for row in data_reader:
            if len(row) > 0:
                err_msg = f'File {fname}: line {line_idx} is wrong!'
                if column_indices is None:
                    # Any position is valid for any field, including the
                    # first column (index 0).
                    column_indices = dict()
                    for field in required_fields:
                        if field not in row:
                            raise ValueError(err_msg + f' Field "{field}" is not found!')
                        column_indices[field] = row.index(field)
                else:
                    sample_id = row[column_indices['id']]
                    if sample_id != sample_id.strip():
                        raise ValueError(err_msg + f' {sample_id} is wrong sample ID!')
                    if sample_id in data:
                        err_msg2 = f'{err_msg} {sample_id} is not unique sample ID!'
                        raise ValueError(err_msg2)
                    text = row[column_indices['excerpt']].replace('\r', '\n')
                    if len(text) == 0:
                        raise ValueError(err_msg + f' Text {sample_id} is empty!')
                    sentences = []
                    for paragraph in map(str.strip, text.split('\n')):
                        if len(paragraph) > 0:
                            sentences += nltk.sent_tokenize(paragraph)
                    if len(sentences) == 0:
                        raise ValueError(err_msg + f' Text {sample_id} is empty!')
                    lowered_sentences = [cur_sent.lower() for cur_sent in sentences]
                    text = ' '.join(lowered_sentences)
                    if text in set_of_texts:
                        raise ValueError(err_msg + f' Text {sample_id} is not unique!')
                    set_of_texts.add(text)
                    added_texts = [lowered_sentences]
                    target_source = row[column_indices['target']]
                    try:
                        target_val = float(target_source)
                    except ValueError:
                        err_msg2 = err_msg
                        err_msg2 += f' {target_source} is wrong target for ' \
                                    f'text {sample_id}.'
                        raise ValueError(err_msg2) from None
                    std_source = row[column_indices['standard_error']]
                    try:
                        std_val = float(std_source)
                    except ValueError:
                        std_val = 0.0
                    if std_val > 0.0:
                        for _ in range(number_of_augmentations):
                            new_augmented_text = []
                            for cur_sent in lowered_sentences:
                                new_sent = t.augment(cur_sent).strip().lower()
                                if len(new_sent) > 0:
                                    new_augmented_text.append(new_sent)
                            assert len(new_augmented_text) > 0
                            # The order of sentences is shuffled as a part
                            # of the augmentation.
                            random.shuffle(new_augmented_text)
                            new_augmented_text_ = ' '.join(new_augmented_text)
                            if (len(new_augmented_text_) > 0) and \
                                    (new_augmented_text_ not in set_of_texts):
                                set_of_texts.add(new_augmented_text_)
                                added_texts.append(new_augmented_text)
                        data[sample_id] = (
                            [' '.join(it) for it in added_texts],
                            target_val, std_val,
                            calc_text_features(added_texts, tok)
                        )
                    else:
                        # Missing, non-numeric or non-positive standard error:
                        # the sample is reported and skipped.
                        err_msg2 = err_msg
                        err_msg2 += f' {std_source} is wrong standard error' \
                                    f' for text {sample_id}.'
                        warnings.warn(err_msg2)
            line_idx += 1
    return data

In [None]:
def load_data_for_testing(fname: str, tok: AutoTokenizer, batch_size: int):
    """Lazily load unlabeled texts from a CSV file in portions.

    The first non-empty row is the header; it must contain the fields
    "excerpt" and "id" (in any order).

    Args:
        fname: path to the UTF-8 encoded CSV file.
        tok: tokenizer passed to `calc_text_features`.
        batch_size: number of samples after which a portion is yielded.

    Yields:
        Dictionaries that map a sample ID to the tuple (text, features),
        where the text has all whitespace runs collapsed to single spaces.
        Every dictionary except the last one contains `batch_size` samples.

    Raises:
        ValueError: a header field is missing, a sample ID is duplicated
            within the current portion or has surrounding whitespace,
            or a text is empty.
    """
    header_is_loaded = False
    id_col_idx = -1
    text_col_idx = -1
    line_idx = 1
    data = dict()
    with codecs.open(fname, mode='r', encoding='utf-8') as fp:
        data_reader = csv.reader(fp, quotechar='"', delimiter=',')
        for row in data_reader:
            if len(row) > 0:
                err_msg = f'File {fname}: line {line_idx} is wrong!'
                if not header_is_loaded:
                    # Any position is valid for a field, including the first
                    # column (index 0).
                    if 'excerpt' not in row:
                        raise ValueError(err_msg + ' Field "excerpt" is not found!')
                    if 'id' not in row:
                        raise ValueError(err_msg + ' Field "id" is not found!')
                    text_col_idx = row.index('excerpt')
                    id_col_idx = row.index('id')
                    header_is_loaded = True
                else:
                    sample_id = row[id_col_idx]
                    if sample_id != sample_id.strip():
                        raise ValueError(err_msg + f' {sample_id} is wrong sample ID!')
                    if sample_id in data:
                        err_msg2 = f'{err_msg} {sample_id} is not unique sample ID!'
                        raise ValueError(err_msg2)
                    # split() without arguments also removes line breaks.
                    text = ' '.join(row[text_col_idx].split())
                    if len(text) == 0:
                        raise ValueError(err_msg + f' Text {sample_id} is empty!')
                    features = calc_text_features([nltk.sent_tokenize(text)], tok)
                    data[sample_id] = (text, features)
                    if len(data) >= batch_size:
                        yield data
                        data = dict()
            line_idx += 1
    if len(data) > 0:
        yield data

In [None]:
def train_output_scaler(data: Dict[str, Tuple[List[int], float, float,
                                              np.ndarray]]) -> StandardScaler:
    """Fit a standard scaler on the target values (item 1) of all samples."""
    targets = np.array(
        [sample[1] for sample in data.values()],
        dtype=np.float64
    )
    # The scaler expects a column: one row per sample, a single feature.
    return StandardScaler().fit(targets.reshape((len(data), 1)))

In [None]:
def train_feature_scaler(data: Dict[str, Tuple[List[int], float, float,
                                               np.ndarray]]) -> Pipeline:
    """Fit a feature scaling pipeline on the feature rows of all samples.

    The feature matrices (item 3 of every sample) are stacked vertically and
    used to fit a standard scaler followed by a power transformer.
    """
    stacked_features = np.vstack([sample[3] for sample in data.values()])
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('transformer', PowerTransformer())
    ])
    return pipeline.fit(stacked_features)

In [None]:
def tokenize_data(
    data: Union[List[Dict[str, Tuple[str, np.ndarray]]],
                List[Dict[str, Tuple[List[str], float, float, np.ndarray]]]],
    tokenizer: AutoTokenizer, max_seq_len: int
) -> Tuple[Union[Dict[str, Tuple[int, np.ndarray]],
                 Dict[str, Tuple[List[int], float, float, np.ndarray]]],
           np.ndarray]:
    """Convert texts into fixed-length rows of token IDs.

    `data` is accessed as a dictionary keyed by sample ID; the IDs are
    processed in sorted order. A two-item entry (text, features) becomes
    (row index, features). Any other entry is treated as
    (texts, target, std, features) and becomes
    (list of row indices, target, std, features).

    Every text is lower-cased, tokenized, wrapped into the CLS and SEP tokens,
    and then padded with the padding ID or cut to exactly `max_seq_len` IDs.

    Returns the re-indexed dictionary and the int32 matrix of all rows.
    """
    rows = []

    def encode(text):
        """Append the ID row of the text to `rows` and return its index."""
        pieces = [tokenizer.cls_token] + tokenizer.tokenize(text.lower())
        ids = tokenizer.convert_tokens_to_ids(pieces + [tokenizer.sep_token])
        missing = max_seq_len - len(ids)
        if missing > 0:
            ids = ids + [tokenizer.pad_token_id] * missing
        elif missing < 0:
            ids = ids[:max_seq_len]
        rows.append(ids)
        return len(rows) - 1

    result = dict()
    for sample_id in sorted(data.keys()):
        sample = data[sample_id]
        if len(sample) == 2:
            result[sample_id] = (encode(sample[0]), sample[1])
        else:
            row_indices = [encode(variant) for variant in sample[0]]
            result[sample_id] = (row_indices, sample[1], sample[2], sample[3])
    return result, np.array(rows, dtype=np.int32)

In [None]:
def print_info_about_data(
    data: Union[List[Dict[str, Tuple[str, np.ndarray]]],
                List[Dict[str, Tuple[List[str], float, float, np.ndarray]]]],
    identifiers: List[str]
):
    """Print the size of the loaded data and the selected samples.

    The kind of data is detected by the first identifier: a four-item entry
    means training data (texts, target, std, features), anything else is
    printed as submission data (text, features).
    """
    for_training = (len(data[identifiers[0]]) == 4)
    purpose = 'training' if for_training else 'submission'
    print(f'Number of samples for {purpose} is {len(data)}.')
    print('')
    print(f'{len(identifiers)} random samples:')
    for cur_id in identifiers:
        sample = data[cur_id]
        print('')
        print(f'  Sample {cur_id}')
        if not for_training:
            print(' Text:')
            print(f'    {sample[0]}')
            print(' Features:')
            print(f'    {sample[1].tolist()[0]}')
            continue
        texts = sample[0]
        n_augmented = len(texts) - 1
        print('  Text:')
        print(f'    {texts[0]}')
        print(f'  Number of augmented texts is {n_augmented}.')
        # At most two augmented variants are shown.
        if n_augmented > 1:
            print('  2 augmented texts:')
            for augmented in texts[1:3]:
                print(f'    {augmented}')
        elif n_augmented > 0:
            print('  Augmented text:')
            for augmented in texts[1:2]:
                print(f'    {augmented}')
        print('  Target:')
        print(f'    {sample[1]} +- {sample[2]}')
        print('  Features:')
        for feature_row in sample[3].tolist():
            print(f'    {feature_row}')

In [None]:
def print_info_about_tokenized_data(
    data: Union[Dict[str, Tuple[int, np.ndarray]],
                Dict[str, Tuple[List[int], float, float, np.ndarray]]],
    matrix: np.ndarray,
    identifiers: List[str]
):
    """Print token IDs and features of the selected tokenized samples.

    Args:
        data: tokenized samples; a four-item entry
            (row indices, target, std, features) is a training sample,
            a two-item entry (row index, features) is a submission sample.
        matrix: matrix of token IDs, the rows are addressed by the indices
            stored in `data`.
        identifiers: IDs of the samples to print; the first of them decides
            whether the data is reported as training or submission data.
    """
    for_training = (len(data[identifiers[0]]) == 4)
    if for_training:
        print(f'Number of tokenized samples for training is {len(data)}.')
    else:
        print(f'Number of tokenized samples for submission is {len(data)}.')
    print('')
    print(f'{len(identifiers)} random samples:')
    for cur_id in identifiers:
        sample = data[cur_id]
        print('')
        print(f'Sample {cur_id}')
        print('')
        # A training sample stores a list of row indices (the first one is
        # shown), whereas a submission sample stores a single integer index,
        # which must not be subscripted.
        sample_idx = sample[0][0] if len(sample) == 4 else sample[0]
        print(matrix[sample_idx].tolist())
        print('')
        print(sample[-1][0].tolist())
        print('')

In [None]:
def find_median_distance_between_pairs(data: Dict[str, Tuple[List[int], float, float,
                                                             np.ndarray]],
                                       identifiers: List[str]) -> float:
    """Report statistics of target distances and return their median.

    The absolute difference of targets (item 1 of a sample) is calculated for
    every unordered pair of the given identifiers. Mean, minimum, maximum and
    median are printed; the median is the element at position (n - 1) // 2 of
    the sorted float32 distances, i.e. the lower one for an even n.
    """
    assert len(identifiers) == len(set(identifiers))
    targets = np.array([data[cur_id][1] for cur_id in identifiers],
                       dtype=np.float64)
    # Index pairs (i, j) with i < j enumerate every unordered pair once.
    first_indices, second_indices = np.triu_indices(len(identifiers), k=1)
    differences = np.abs(targets[first_indices] - targets[second_indices])
    distances = np.sort(differences).astype(np.float32)
    median_distance_pos = (distances.shape[0] - 1) // 2
    report = (
        ('Mean', np.mean),
        ('Minimal', np.min),
        ('Maximal', np.max),
        ('Median', lambda values: values[median_distance_pos])
    )
    for title, statistic in report:
        print('{0} distance between training pairs is {1:.5f}.'.format(
            title, statistic(distances)
        ))
    return distances[median_distance_pos]

In [None]:
def tf_euclidean_distance(vects):
    """Return the Euclidean distance between two batches of vectors.

    `vects` is a pair of tensors; squared differences are summed over axis 1
    with the axis kept. The sum is clipped from below by the backend epsilon
    before the square root is taken.
    """
    first, second = vects
    backend = tf.keras.backend
    squared_difference = backend.square(first - second)
    sum_square = backend.sum(squared_difference, axis=1, keepdims=True)
    clipped = backend.maximum(sum_square, backend.epsilon())
    return backend.sqrt(clipped)

In [None]:
def tf_eucl_dist_output_shape(shapes):
    """Return the output shape (first dimension of the first shape, 1)."""
    first_shape, _ = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

In [None]:
def build_neural_network(bert_name: str, max_seq_len: int, feature_vector_size: int,
                         batch_size: int) -> Tuple[tf.keras.Model, tf.keras.Model]:
    """Build a two-branch (siamese) network and a single-branch regressor.

    Both branches reuse the same transformer, the same ``UnitedEmbeddingLayer``
    and the same ``RegressionLayer`` objects; the remaining layers (masking,
    pooling, normalization, concatenation, dropout) are created separately per
    branch, the right-hand ones carrying a ``_right`` name suffix.

    :param bert_name: name or path handed to ``TFAutoModel.from_pretrained``
        and ``AutoConfig.from_pretrained``.
    :param max_seq_len: length of the token and attention-mask inputs.
    :param feature_vector_size: length of the additional feature input.
    :param batch_size: fixed batch size baked into every input layer.
    :return: ``(siamese_model, regression_model)``. The siamese model is
        compiled and outputs the distance between the two embeddings plus one
        regression value per branch. The regression model is not compiled here;
        it takes the left-branch inputs only and outputs the left regression
        value together with the left embedding.
    """
    transformer_model = TFAutoModel.from_pretrained(
        pretrained_model_name_or_path=bert_name,
        name='BaseTransformer'
    )
    united_embedding_size = 512
    transformer_config = AutoConfig.from_pretrained(bert_name)
    # Projection shared by both branches (the same object is called twice below).
    united_emb_layer = tf.keras.layers.Dense(
        units=united_embedding_size, input_dim=transformer_config.hidden_size,
        activation='tanh',
        kernel_initializer=tf.keras.initializers.GlorotNormal(seed=42),
        bias_initializer='zeros',
        name='UnitedEmbeddingLayer'
    )
    print('Transformer Configuration')
    print('=========================')
    print(transformer_config)
    # Inputs of the left branch; these are also the inputs of the regression model.
    left_tokens = tf.keras.layers.Input(shape=(max_seq_len,), batch_size=batch_size,
                                        dtype=tf.int32, name='word_ids')
    left_attention = tf.keras.layers.Input(shape=(max_seq_len,), batch_size=batch_size,
                                           dtype=tf.int32, name='attention_mask')
    left_features = tf.keras.layers.Input(shape=(feature_vector_size,), dtype=tf.float32,
                                          batch_size=batch_size, name='features')
    # Inputs of the right branch (used by the siamese model only).
    right_tokens = tf.keras.layers.Input(shape=(max_seq_len,), batch_size=batch_size,
                                         dtype=tf.int32, name='right_word_ids')
    right_attention = tf.keras.layers.Input(shape=(max_seq_len,), batch_size=batch_size,
                                            dtype=tf.int32, name='right_attention_mask')
    right_features = tf.keras.layers.Input(shape=(feature_vector_size,), dtype=tf.float32,
                                           batch_size=batch_size, name='right_features')
    # Left branch. Element 0 of the transformer output is used;
    # presumably the per-token hidden states - TODO confirm.
    left_sequence_output = transformer_model([left_tokens, left_attention])[0]
    # MaskCalculator is defined elsewhere in the file; presumably it expands the
    # attention mask to the hidden size so it can be multiplied with the
    # sequence output - TODO confirm.
    left_output_mask = MaskCalculator(
        output_dim=transformer_config.hidden_size, trainable=False,
        name='OutMaskCalculator'
    )(left_attention)
    left_masked_output = tf.keras.layers.Multiply(
        name='OutMaskMultiplicator'
    )([left_output_mask, left_sequence_output])
    left_masked_output = tf.keras.layers.Masking(
        name='OutMasking'
    )(left_masked_output)
    # Average pooling over the sequence axis, then layer normalization.
    left_output = tf.keras.layers.GlobalAvgPool1D(name='AvePool')(left_masked_output)
    left_output = tf.keras.layers.LayerNormalization(
        name='Emdedding'
    )(left_output)
    # Append the additional feature vector to the pooled text embedding.
    left_output = tf.keras.layers.Concatenate(
        name='Concat'
    )([left_output, left_features])
    left_output = tf.keras.layers.Dropout(
        rate=0.3, seed=42, name='Dropout1' 
    )(left_output)
    left_output = united_emb_layer(left_output)
    # Right branch: same sequence of operations, with the transformer and the
    # united embedding layer reused from the left branch.
    right_sequence_output = transformer_model([right_tokens, right_attention])[0]
    right_output_mask = MaskCalculator(
        output_dim=transformer_config.hidden_size, trainable=False,
        name='OutMaskCalculator_right'
    )(right_attention)
    right_masked_output = tf.keras.layers.Multiply(
        name='OutMaskMultiplicator_right'
    )([right_output_mask, right_sequence_output])
    right_masked_output = tf.keras.layers.Masking(
        name='OutMasking_right'
    )(right_masked_output)
    right_output = tf.keras.layers.GlobalAvgPool1D(
        name='AvePool_right'
    )(right_masked_output)
    right_output = tf.keras.layers.LayerNormalization(
       name='Emdedding_right'
    )(right_output)
    right_output = tf.keras.layers.Concatenate(
        name='Concat_right'
    )([right_output, right_features])
    right_output = tf.keras.layers.Dropout(
        rate=0.3, seed=42, name='Dropout1_right'
    )(right_output)
    right_output = united_emb_layer(right_output)
    # Euclidean distance between the two united embeddings.
    distance_output = tf.keras.layers.Lambda(
        function=tf_euclidean_distance,
        output_shape=tf_eucl_dist_output_shape,
        name='L2DistLayer'
    )([left_output, right_output])
    # Linear regression head shared by both branches.
    regression_layer = tf.keras.layers.Dense(
        units=1, input_dim=united_embedding_size, activation=None,
        kernel_initializer=tf.keras.initializers.GlorotNormal(seed=42),
        bias_initializer='zeros',
        name='RegressionLayer'
    )
    left_regression_output = tf.keras.layers.Dropout(
        rate=0.3, seed=42, name='Dropout2'
    )(left_output)
    left_regression_output = regression_layer(left_regression_output)
    right_regression_output = tf.keras.layers.Dropout(
        rate=0.3, seed=42, name='Dropout2_right'
    )(right_output)
    right_regression_output = regression_layer(right_regression_output)
    # Single-branch model: regression value plus the embedding it is computed from
    # (the embedding is taken before the 'Dropout2' layer).
    regression_model = tf.keras.Model(
        inputs=[left_tokens, left_attention, left_features],
        outputs=[left_regression_output, left_output],
        name='RegressionModel'
    )
    regression_model.build(input_shape=[(batch_size, max_seq_len),
                                        (batch_size, max_seq_len),
                                        (batch_size, feature_vector_size)])
    siamese_model = tf.keras.Model(
        inputs=[left_tokens, left_attention, left_features,
                right_tokens, right_attention, right_features],
        outputs=[distance_output, left_regression_output, right_regression_output],
        name='SiameseModel'
    )
    # Rectified Adam wrapped into Lookahead.
    radam = tfa.optimizers.RectifiedAdam(learning_rate=1e-4)
    ranger = tfa.optimizers.Lookahead(radam, sync_period=6, slow_step_size=0.5)
    # One loss per siamese output, in output order: DBLLogLoss (defined elsewhere
    # in the file) for the distance, mean squared error for each regression value.
    losses = [
        DBLLogLoss(),
        tf.keras.losses.MeanSquaredError(),
        tf.keras.losses.MeanSquaredError()
    ]
    # The distance loss is weighted twice as much as each regression loss.
    loss_weights = [2.0, 1.0, 1.0]
    siamese_model.compile(optimizer=ranger, loss=losses,
                          loss_weights=loss_weights)
    return siamese_model, regression_model

In [None]:
def show_minibatch(X: List[np.ndarray], y: List[np.ndarray]):
    """Print every array of a mini-batch row by row.

    Each array is preceded by an empty line and a header (``X1`` ... ``X6``
    for the inputs, ``y1`` ... ``y3`` for the targets). The empty line is
    printed before every header, including ``X6``.

    :param X: exactly six input arrays.
    :param y: exactly three target arrays.
    :raises AssertionError: if ``X`` or ``y`` has an unexpected length.
    """
    assert len(X) == 6
    assert len(y) == 3
    sections = [(f'X{number}', array) for number, array in enumerate(X, start=1)]
    sections += [(f'y{number}', array) for number, array in enumerate(y, start=1)]
    for header, array in sections:
        print('')
        print(header)
        for row in array.tolist():
            print(row)

In [None]:
def show_tsne(regressor: tf.keras.Model, batch_size: int,
              data: Dict[str, Tuple[List[int], float, float, np.ndarray]],
              feature_scaler: Pipeline,
              token_matrix: np.ndarray,
              identifiers: List[str], pad_id: int, title: str, figure_id: int):
    """Plot a 2-D t-SNE projection of the embeddings produced by ``regressor``.

    :param regressor: model whose ``predict`` returns a pair; the second
        element is projected.
    :param batch_size: batch size used for prediction; the inputs are padded
        so that their number is a multiple of it.
    :param data: mapping from identifier to its record. Element ``[0][0]`` is a
        row index into ``token_matrix``, element 1 colours the point,
        element 2 controls its size and the first row of element 3 is the
        additional feature vector.
    :param feature_scaler: fitted transformer applied to the feature vectors.
    :param token_matrix: matrix of token indices, one row per text.
    :param identifiers: identifiers of the samples to be shown.
    :param pad_id: padding token id handed to ``generate_attention_mask``.
    :param title: suffix of the plot title.
    :param figure_id: Matplotlib figure number.
    """
    indices = [data[cur_id][0][0] for cur_id in identifiers]
    colors = np.array([data[cur_id][1] for cur_id in identifiers],
                      dtype=np.float64)
    # Marker area: element 2 normalized by its maximum, scaled to [.., 10]
    # and squared.
    area = np.array([data[cur_id][2] for cur_id in identifiers],
                    dtype=np.float64)
    area /= np.max(area)
    area *= 10.0
    area = np.power(area, 2)
    texts = token_matrix[indices]
    src_features = np.vstack([data[cur_id][3][0:1] for cur_id in identifiers])
    assert src_features.shape[0] == texts.shape[0]
    # The model is built with a fixed batch size, so the last incomplete batch
    # is filled with copies of the last sample; the copies are cut off below.
    ndiff = texts.shape[0] % batch_size
    if ndiff > 0:
        n_padding = batch_size - ndiff
        texts = np.vstack([texts, np.repeat(texts[-1:], n_padding, axis=0)])
        src_features = np.vstack(
            [src_features, np.repeat(src_features[-1:], n_padding, axis=0)]
        )
    attentions = generate_attention_mask(texts, pad_id)
    assert texts.shape[0] % batch_size == 0, f'{texts.shape[0] % batch_size}'
    _, features = regressor.predict(
        [texts, attentions, feature_scaler.transform(src_features)],
        batch_size=batch_size
    )
    features = features[:len(indices)]
    projected_features = TSNE(n_components=2, n_jobs=-1).fit_transform(features)
    plt.figure(figure_id, figsize=(11, 11))
    # The colormap is given by name: ``plt.cm.get_cmap`` is deprecated and has
    # been removed from recent Matplotlib releases.
    plt.scatter(x=projected_features[:, 0], y=projected_features[:, 1],
                marker='o', cmap='jet', s=area,
                c=colors, norm=Normalize(vmin=np.min(colors), vmax=np.max(colors)))
    plt.title('t-SNE projections of texts ' + title)
    plt.colorbar()
    plt.show()

In [None]:
def show_training_process(history: tf.keras.callbacks.History, metric_name: str,
                          figure_id: int):
    """Plot the per-epoch values of a metric logged during training.

    The validation curve (key ``'val_' + metric_name``) is added to the same
    plot when the history contains it.

    :param history: object whose ``history`` attribute maps metric names to
        lists of per-epoch values.
    :param metric_name: name of the metric to plot.
    :param figure_id: Matplotlib figure number.
    :raises ValueError: if ``metric_name`` is not among the logged metrics.
    """
    if metric_name not in history.history:
        raise ValueError(
            f'The metric "{metric_name}" is not found!'
            f' Available metrics are: {list(history.history.keys())}.'
        )
    plt.figure(figure_id, figsize=(7, 7))
    train_curve = history.history[metric_name]
    plt.plot(list(range(len(train_curve))), train_curve,
             label='Training {0}'.format(metric_name))
    validation_key = 'val_' + metric_name
    if validation_key in history.history:
        validation_curve = history.history[validation_key]
        assert len(train_curve) == len(validation_curve)
        plt.plot(list(range(len(validation_curve))), validation_curve,
                 label='Validation {0}'.format(metric_name))
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.title('Training process')
    plt.legend(loc='best')
    plt.show()

In [None]:
def generate_new_trainset(regressor: tf.keras.Model, feature_scaler: Pipeline,
                          output_scaler: StandardScaler, batch_size: int,
                          data: Dict[str, Tuple[List[int], float, float, np.ndarray]],
                          token_matrix: np.ndarray, pad_id: int,
                          identifiers: List[str]) -> Tuple[np.ndarray, np.ndarray,
                                                           np.ndarray]:
    """Turn samples into embeddings, true targets and model predictions.

    :param regressor: model whose ``predict`` returns the pair
        ``(predictions, embeddings)``.
    :param feature_scaler: fitted transformer applied to the feature vectors.
    :param output_scaler: fitted scaler whose ``inverse_transform`` maps the
        predictions back to the original target scale.
    :param batch_size: batch size used for prediction; the inputs are padded
        so that their number is a multiple of it.
    :param data: mapping from identifier to its record. Element ``[0][0]`` is a
        row index into ``token_matrix``, element 1 is the target and the first
        row of element 3 is the additional feature vector.
    :param token_matrix: matrix of token indices, one row per text.
    :param pad_id: padding token id handed to ``generate_attention_mask``.
    :param identifiers: identifiers of the samples to process, in output order.
    :return: ``(embeddings, targets, predictions)`` with one row/element per
        identifier; ``targets`` and ``predictions`` are one-dimensional.
    """
    indices = [data[cur_id][0][0] for cur_id in identifiers]
    texts = token_matrix[indices]
    src_features = np.vstack([data[cur_id][3][0:1] for cur_id in identifiers])
    targets = np.array([data[cur_id][1] for cur_id in identifiers],
                       dtype=np.float64)
    assert texts.shape[0] == src_features.shape[0]
    # The model is built with a fixed batch size, so the last incomplete batch
    # is filled with copies of the last sample; the copies are cut off below.
    ndiff = texts.shape[0] % batch_size
    if ndiff > 0:
        n_padding = batch_size - ndiff
        texts = np.vstack([texts, np.repeat(texts[-1:], n_padding, axis=0)])
        src_features = np.vstack(
            [src_features, np.repeat(src_features[-1:], n_padding, axis=0)]
        )
    attentions = generate_attention_mask(texts, pad_id)
    assert texts.shape[0] % batch_size == 0, f'{texts.shape[0] % batch_size}'
    predictions, target_features = regressor.predict(
        [texts, attentions, feature_scaler.transform(src_features)],
        batch_size=batch_size
    )
    assert predictions.shape[0] == target_features.shape[0]
    assert target_features.shape[1] > 1
    target_features = target_features[:len(identifiers)]
    predictions = output_scaler.inverse_transform(
        predictions[:len(identifiers)]
    ).reshape((len(identifiers),))
    return target_features, targets, predictions

In [None]:
def do_predictions(regressor: tf.keras.Model, feature_scaler: Pipeline,
                   output_scaler: StandardScaler,
                   batch_size: int,
                   data: Union[Dict[str, int], Dict[str, Tuple[List[int], float, float]]],
                   token_matrix: np.ndarray, pad_id: int,
                   identifiers: List[str]=None) -> Dict[str, Tuple[float, np.ndarray]]:
    """Predict a target value and an embedding for every requested sample.

    Two record layouts are supported. A record of length 2 holds the row index
    into ``token_matrix`` at position 0 and the feature rows at position 1.
    Any other record holds the row index at ``[0][0]`` and the feature rows at
    position 3, of which only the first row is used.

    :param regressor: model whose ``predict`` returns the pair
        ``(predictions, embeddings)``.
    :param feature_scaler: fitted transformer applied to the feature vectors.
    :param output_scaler: fitted scaler whose ``inverse_transform`` maps the
        predictions back to the original target scale.
    :param batch_size: batch size used for prediction; the inputs are padded
        so that their number is a multiple of it.
    :param data: mapping from identifier to its record (see above).
    :param token_matrix: matrix of token indices, one row per text.
    :param pad_id: padding token id handed to ``generate_attention_mask``.
    :param identifiers: identifiers to process; all keys of ``data`` if None.
        They are processed in sorted order.
    :return: mapping from identifier to ``(prediction, embedding_row)`` where
        ``embedding_row`` keeps a leading axis of size 1.
    """
    if identifiers is None:
        ordered_ids = sorted(list(data.keys()))
    else:
        ordered_ids = sorted(identifiers)
    indices = []
    for cur_id in ordered_ids:
        record = data[cur_id]
        indices.append(record[0] if len(record) == 2 else record[0][0])
    texts = token_matrix[indices]
    feature_rows = []
    for cur_id in ordered_ids:
        record = data[cur_id]
        feature_rows.append(record[1] if len(record) == 2 else record[3][0:1])
    src_features = np.vstack(feature_rows)
    assert texts.shape[0] == src_features.shape[0]
    # Fill the last incomplete batch with copies of the last sample; only the
    # first len(indices) results are returned.
    remainder = texts.shape[0] % batch_size
    if remainder > 0:
        n_padding = batch_size - remainder
        texts = np.vstack([texts, np.repeat(texts[-1:], n_padding, axis=0)])
        src_features = np.vstack(
            [src_features, np.repeat(src_features[-1:], n_padding, axis=0)]
        )
    attentions = generate_attention_mask(texts, pad_id)
    assert texts.shape[0] % batch_size == 0, f'{texts.shape[0] % batch_size}'
    predictions, target_features = regressor.predict(
        [texts, attentions, feature_scaler.transform(src_features)],
        batch_size=batch_size
    )
    assert predictions.shape[0] == target_features.shape[0]
    assert target_features.shape[1] > 1
    predictions = np.reshape(predictions, newshape=(predictions.shape[0], 1))
    predictions = output_scaler.inverse_transform(predictions)
    return {
        ordered_ids[idx]: (predictions[idx, 0], target_features[idx:(idx + 1)])
        for idx in range(len(indices))
    }

In [None]:
def mixup(X: np.ndarray, y: np.ndarray, mixup_coeff: float,
          n_samples: int) -> Tuple[np.ndarray, np.ndarray]:
    """Create synthetic samples as fixed-weight blends of random sample pairs.

    Every new sample is ``(1 - mixup_coeff) * first + mixup_coeff * second``
    where ``first`` and ``second`` are drawn independently (with replacement)
    from the source samples using the ``random`` module. The targets are
    blended with the same weights.

    :param X: two-dimensional matrix of source samples.
    :param y: one-dimensional vector of source targets, one per row of ``X``.
    :param mixup_coeff: weight of the second sample, strictly between 0 and 1.
    :param n_samples: number of synthetic samples to create.
    :return: ``(X_new, y_new)`` as float64 arrays with ``n_samples`` rows.
    """
    assert 0.0 < mixup_coeff < 1.0
    assert X.ndim == 2
    assert y.ndim == 1
    assert X.shape[0] == y.shape[0]
    last_row = X.shape[0] - 1
    mixed_X = np.empty((n_samples, X.shape[1]), dtype=np.float64)
    mixed_y = np.empty((n_samples,), dtype=np.float64)
    for row in range(n_samples):
        # Two draws per synthetic sample, always in this order.
        base = random.randint(0, last_row)
        partner = random.randint(0, last_row)
        mixed_X[row] = (1.0 - mixup_coeff) * X[base] + mixup_coeff * X[partner]
        mixed_y[row] = (1.0 - mixup_coeff) * y[base] + mixup_coeff * y[partner]
    return mixed_X, mixed_y

In [None]:
def build_regressor(trainset: Tuple[np.ndarray, np.ndarray],
                    validset: Tuple[np.ndarray, np.ndarray],
                    batch_size: int, ensemble_idx: int) -> tf.keras.Model:
    """Build and train one self-normalizing feed-forward regressor.

    The network is a stack of (AlphaDropout, Dense) pairs: six hidden SELU
    layers followed by a linear single-unit output layer. All layer names and
    random seeds are derived from ``ensemble_idx``.

    :param trainset: ``(features, targets)`` used for training.
    :param validset: ``(features, targets)`` used for early stopping.
    :param batch_size: mini-batch size.
    :param ensemble_idx: number of the regressor within the ensemble.
    :return: the trained model with the best weights restored.
    """
    hidden_units = (400, 400, 300, 300, 200, 100)
    n_levels = len(hidden_units) + 1
    stack = [
        tf.keras.layers.InputLayer(
            input_shape=(trainset[0].shape[1],),
            dtype=tf.float32,
            name=f'input_nn{ensemble_idx}'
        )
    ]
    for level, n_units in enumerate(hidden_units + (1,), start=1):
        # Level k uses seed ensemble_idx * 10k for the dropout and
        # (ensemble_idx + 1) * 10k for the kernel initializer.
        stack.append(
            tf.keras.layers.AlphaDropout(
                rate=0.1, seed=ensemble_idx * (10 * level),
                name=f'dropout{level}_nn{ensemble_idx}'
            )
        )
        stack.append(
            tf.keras.layers.Dense(
                units=n_units,
                # Only the last (output) level is linear.
                activation=('selu' if level < n_levels else None),
                kernel_initializer=tf.keras.initializers.LecunNormal(
                    seed=(ensemble_idx + 1) * (10 * level)
                ),
                name=f'dense{level}_nn{ensemble_idx}'
            )
        )
    regressor = tf.keras.Sequential(
        layers=stack,
        name=f'FinalRegressor{ensemble_idx}'
    )
    base_optimizer = tfa.optimizers.RectifiedAdam(learning_rate=1e-3)
    optimizer = tfa.optimizers.Lookahead(base_optimizer, sync_period=6,
                                         slow_step_size=0.5)
    regressor.compile(optimizer=optimizer, loss=tf.keras.losses.MeanSquaredError(),
                      metrics=[tf.keras.metrics.RootMeanSquaredError()])
    regressor.summary()
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_root_mean_squared_error",
        patience=10,
        verbose=True,
        restore_best_weights=True
    )
    # The training stream repeats endlessly, so the epoch length is set
    # explicitly; it is derived from the validation set size.
    steps_per_epoch = (3 * validset[0].shape[0]) // batch_size
    tf_trainset = tf.data.Dataset.from_tensor_slices(trainset)
    tf_trainset = tf_trainset.repeat()
    tf_trainset = tf_trainset.shuffle(trainset[0].shape[0])
    tf_trainset = tf_trainset.batch(batch_size)
    tf_validset = tf.data.Dataset.from_tensor_slices(validset).batch(batch_size)
    regressor.fit(tf_trainset, validation_data=tf_validset,
                  callbacks=[early_stopping], epochs=1000,
                  steps_per_epoch=steps_per_epoch)
    return regressor

In [None]:
# Seed the Python, NumPy and TensorFlow random generators.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# MAX_TEXT_LEN is passed below as the maximal sequence length, PRETRAINED_BERT
# as the location of the pretrained tokenizer/transformer and MINIBATCH_SIZE
# as the batch size of the data generators and of the networks.
MAX_TEXT_LEN = 256
PRETRAINED_BERT = '/kaggle/input/tfdistilbertbaseuncased'
MINIBATCH_SIZE = 32

In [None]:
# Input and output directories; each print shows whether the directory exists.
DATA_DIR = '/kaggle/input/commonlitreadabilityprize'
MODEL_DIR = '/kaggle/working'
print(f'{DATA_DIR} {os.path.isdir(DATA_DIR)}')
print(f'{MODEL_DIR} {os.path.isdir(MODEL_DIR)}')

In [None]:
# Training data file (the print shows whether it exists).
trainset_name = os.path.join(DATA_DIR, 'train.csv')
print(f'{trainset_name} {os.path.isfile(trainset_name)}')

In [None]:
# Test data file (the print shows whether it exists).
testset_name = os.path.join(DATA_DIR, 'test.csv')
print(f'{testset_name} {os.path.isfile(testset_name)}')

In [None]:
# Submission file to be written (the print shows whether it already exists).
submission_name = os.path.join(MODEL_DIR, 'submission.csv')
print(f'{submission_name} {os.path.isfile(submission_name)}')

In [None]:
# Paths of the saved artifacts; ensemble_name is a prefix to which the number
# of the ensemble member and '.h5' are appended when the weights are saved.
regression_model_name = os.path.join(MODEL_DIR, 'regression_nn.h5')
ensemble_name = os.path.join(MODEL_DIR, 'ensemble')
scaler_name = os.path.join(MODEL_DIR, 'output_scaler.pkl')
# Number of the next Matplotlib figure; incremented after every plot.
figure_identifier = 1

In [None]:
# Tokenizer loaded from the same location as the pretrained transformer.
pretrained_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_BERT)
print(f'Vocabulary size is {pretrained_tokenizer.vocab_size}.')

In [None]:
# load_data_for_training is defined earlier in the file; its result is a
# mapping keyed by sample identifier (see the use of .keys() below).
data_for_training = load_data_for_training(trainset_name,
                                           pretrained_tokenizer)
# Sanity check: more than 100 samples are expected.
assert len(data_for_training) > 100

In [None]:
# Three randomly chosen samples are printed for visual inspection.
all_IDs = sorted(list(data_for_training.keys()))
selected_IDs_for_training = random.sample(
    population=all_IDs,
    k=3
)
print_info_about_data(data_for_training, selected_IDs_for_training)

In [None]:
# tokenize_data is defined earlier in the file; it returns the per-sample
# records and the token matrix that all later steps work with.
labels_for_training, tokens_for_training = tokenize_data(
    data=data_for_training,
    tokenizer=pretrained_tokenizer,
    max_seq_len=MAX_TEXT_LEN
)
print_info_about_tokenized_data(
    data=labels_for_training,
    matrix=tokens_for_training,
    identifiers=selected_IDs_for_training
)

In [None]:
# Fit the feature and target scalers and pickle both of them as one tuple.
text_feature_scaler = train_feature_scaler(labels_for_training) 
label_scaler = train_output_scaler(labels_for_training)
with open(scaler_name, 'wb') as scaler_fp:
    pickle.dump((text_feature_scaler, label_scaler), scaler_fp)

In [None]:
# Random split: 80% for training, 10% for validation, the rest for final testing.
random.shuffle(all_IDs)
n_train_size = int(round(len(all_IDs) * 0.8))
n_val_size = int(round(len(all_IDs) * 0.1))
IDs_for_training = all_IDs[:n_train_size]
IDs_for_validation = all_IDs[n_train_size:(n_train_size + n_val_size)]
IDs_for_final_testing = all_IDs[(n_train_size + n_val_size):]

In [None]:
# Median distance between the targets of training pairs; it is handed to both
# data generators below.
median_dist = find_median_distance_between_pairs(
    data=labels_for_training,
    identifiers=IDs_for_training
)

In [None]:
# Validation generator. DatasetGen is defined earlier in the file; no
# batches_per_epoch is given here, so its default is used.
datagen_for_validation = DatasetGen(
    data=labels_for_training,
    data_IDs=IDs_for_validation,
    token_indices=tokens_for_training,
    median_distance_between_pairs=median_dist,
    pad_token_id=pretrained_tokenizer.pad_token_id,
    batch_size=MINIBATCH_SIZE,
    output_scaler=label_scaler,
    feature_scaler=text_feature_scaler
)
n_batches_per_validset = len(datagen_for_validation)
print(f'Mini-batches per validation set is {n_batches_per_validset}.')

In [None]:
# Print the first validation mini-batch (the third element of the item is ignored).
X_, y_, _ = datagen_for_validation[0]
show_minibatch(X_, y_)

In [None]:
# A training epoch is ten times as long as one pass over the validation generator.
n_batches_per_epoch = 10 * n_batches_per_validset
datagen_for_training = DatasetGen(
    data=labels_for_training,
    data_IDs=IDs_for_training,
    token_indices=tokens_for_training,
    median_distance_between_pairs=median_dist,
    pad_token_id=pretrained_tokenizer.pad_token_id,
    batch_size=MINIBATCH_SIZE,
    batches_per_epoch=n_batches_per_epoch,
    output_scaler=label_scaler, 
    feature_scaler=text_feature_scaler
)

In [None]:
# Print the first training mini-batch.
X_, y_, _ = datagen_for_training[0] 
show_minibatch(X_, y_)

In [None]:
# Build the siamese model (for training) and the single-branch model (for
# inference). The feature vector size is taken from the fitted 'scaler' step
# of the feature pipeline.
model_for_training, model_for_inference = build_neural_network(
    bert_name=PRETRAINED_BERT,
    max_seq_len=MAX_TEXT_LEN,
    feature_vector_size=text_feature_scaler.named_steps['scaler'].scale_.shape[0],
    batch_size=MINIBATCH_SIZE
)

In [None]:
model_for_training.summary()

In [None]:
model_for_inference.summary()

In [None]:
# t-SNE of the held-out samples (validation + final testing) before training.
show_tsne(regressor=model_for_inference, batch_size=MINIBATCH_SIZE,
          feature_scaler=text_feature_scaler,
          data=labels_for_training, token_matrix=tokens_for_training,
          identifiers=IDs_for_validation + IDs_for_final_testing,
          pad_id=pretrained_tokenizer.pad_token_id,
          title='before training', figure_id=figure_identifier)
figure_identifier += 1

In [None]:
# RMSE of the untrained network on the validation and final-test splits.
# For each split: predict, accumulate the squared errors sample by sample,
# print the root of their mean and release the intermediate objects.
for split_name, split_IDs in (('validation', IDs_for_validation),
                              ('test', IDs_for_final_testing)):
    split_predictions = do_predictions(
        regressor=model_for_inference,
        feature_scaler=text_feature_scaler,
        output_scaler=label_scaler,
        batch_size=MINIBATCH_SIZE,
        data=labels_for_training,
        token_matrix=tokens_for_training,
        pad_id=pretrained_tokenizer.pad_token_id,
        identifiers=split_IDs
    )
    error = 0.0
    for cur_id in split_IDs:
        # Element 0 of a prediction is the predicted target,
        # element 1 of a record is the true target.
        difference = split_predictions[cur_id][0] - labels_for_training[cur_id][1]
        error += (difference * difference)
    error /= float(len(split_IDs))
    error = np.sqrt(error)
    print(f'RMSE on {split_name} set before training = {error}')
    del split_predictions, error

In [None]:
callbacks = [
    tfa.callbacks.TimeStopping(seconds=int(round(3600 * 1.7)), verbose=True),
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=7,
        verbose=True,
        restore_best_weights=True
    )
]

In [None]:
history = model_for_training.fit(datagen_for_training,
                                 validation_data=datagen_for_validation,
                                 epochs=1000, callbacks=callbacks)

In [None]:
model_for_inference.save_weights(regression_model_name)

In [None]:
show_training_process(history, "loss", figure_identifier)
figure_identifier += 1

In [None]:
show_tsne(regressor=model_for_inference, batch_size=MINIBATCH_SIZE,
          feature_scaler=text_feature_scaler,
          data=labels_for_training, token_matrix=tokens_for_training,
          identifiers=IDs_for_validation + IDs_for_final_testing,
          pad_id=pretrained_tokenizer.pad_token_id,
          title='after training', figure_id=figure_identifier)
figure_identifier += 1

In [None]:
# RMSE of the trained network on the validation and final-test splits.
# For each split: predict, accumulate the squared errors sample by sample,
# print the root of their mean and release the intermediate objects.
for split_name, split_IDs in (('validation', IDs_for_validation),
                              ('test', IDs_for_final_testing)):
    split_predictions = do_predictions(
        regressor=model_for_inference,
        feature_scaler=text_feature_scaler,
        output_scaler=label_scaler,
        batch_size=MINIBATCH_SIZE,
        data=labels_for_training,
        token_matrix=tokens_for_training,
        pad_id=pretrained_tokenizer.pad_token_id,
        identifiers=split_IDs
    )
    error = 0.0
    for cur_id in split_IDs:
        # Element 0 of a prediction is the predicted target,
        # element 1 of a record is the true target.
        difference = split_predictions[cur_id][0] - labels_for_training[cur_id][1]
        error += (difference * difference)
    error /= float(len(split_IDs))
    error = np.sqrt(error)
    print(f'RMSE on {split_name} set after training = {error}')
    del split_predictions, error

In [None]:
X_train, y_train, _ = generate_new_trainset(
    regressor=model_for_inference, feature_scaler=text_feature_scaler,
    output_scaler=label_scaler, batch_size=MINIBATCH_SIZE,
    data=labels_for_training, token_matrix=tokens_for_training,
    pad_id=pretrained_tokenizer.pad_token_id,
    identifiers=IDs_for_training
)

In [None]:
X_val, y_val, _ = generate_new_trainset(
    regressor=model_for_inference, feature_scaler=text_feature_scaler,
    output_scaler=label_scaler, batch_size=MINIBATCH_SIZE,
    data=labels_for_training, token_matrix=tokens_for_training,
    pad_id=pretrained_tokenizer.pad_token_id,
    identifiers=IDs_for_validation
)

In [None]:
X_test, y_test, ave_test_pred = generate_new_trainset(
    regressor=model_for_inference, feature_scaler=text_feature_scaler,
    output_scaler=label_scaler, batch_size=MINIBATCH_SIZE,
    data=labels_for_training, token_matrix=tokens_for_training,
    pad_id=pretrained_tokenizer.pad_token_id,
    identifiers=IDs_for_final_testing
)

In [None]:
del datagen_for_training, datagen_for_validation
del labels_for_training, tokens_for_training
del data_for_training
del IDs_for_training, IDs_for_validation, IDs_for_final_testing
del model_for_training
gc.collect()

In [None]:
ensemble = []

In [None]:
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(
                y_train.reshape((y_train.shape[0], 1))
            ).reshape((y_train.shape[0],)),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        validset=(
            X_val,
            label_scaler.transform(
                y_val.reshape((y_val.shape[0], 1))
            ).reshape((y_val.shape[0],))
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=1
    )
)

In [None]:
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(
                y_train.reshape((y_train.shape[0], 1))
            ).reshape((y_train.shape[0],)),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        validset=(
            X_val,
            label_scaler.transform(
                y_val.reshape((y_val.shape[0], 1))
            ).reshape((y_val.shape[0],))
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=2
    )
)

In [None]:
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(
                y_train.reshape((y_train.shape[0], 1))
            ).reshape((y_train.shape[0],)),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        validset=(
            X_val,
            label_scaler.transform(
                y_val.reshape((y_val.shape[0], 1))
            ).reshape((y_val.shape[0],))
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=3
    )
)

In [None]:
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(
                y_train.reshape((y_train.shape[0], 1))
            ).reshape((y_train.shape[0],)),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        validset=(
            X_val,
            label_scaler.transform(
                y_val.reshape((y_val.shape[0], 1))
            ).reshape((y_val.shape[0],))
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=4
    )
)

In [None]:
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(
                y_train.reshape((y_train.shape[0], 1))
            ).reshape((y_train.shape[0],)),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        validset=(
            X_val,
            label_scaler.transform(
                y_val.reshape((y_val.shape[0], 1))
            ).reshape((y_val.shape[0],))
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=5
    )
)

In [None]:
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(
                y_train.reshape((y_train.shape[0], 1))
            ).reshape((y_train.shape[0],)),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        validset=(
            X_val,
            label_scaler.transform(
                y_val.reshape((y_val.shape[0], 1))
            ).reshape((y_val.shape[0],))
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=6
    )
)

In [None]:
# Build ensemble member #7 and add it to `ensemble`.
# Targets go through `label_scaler` as an (N, 1) column and are flattened back
# to 1-D before being handed to `mixup` (training) or passed directly
# (validation).
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(y_train.reshape(-1, 1)).reshape(-1),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        # The validation pair is not routed through `mixup`; only its targets
        # are scaled.
        validset=(
            X_val,
            label_scaler.transform(y_val.reshape(-1, 1)).reshape(-1)
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=7
    )
)

In [None]:
# Build ensemble member #8 and add it to `ensemble`.
# Targets go through `label_scaler` as an (N, 1) column and are flattened back
# to 1-D before being handed to `mixup` (training) or passed directly
# (validation).
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(y_train.reshape(-1, 1)).reshape(-1),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        # The validation pair is not routed through `mixup`; only its targets
        # are scaled.
        validset=(
            X_val,
            label_scaler.transform(y_val.reshape(-1, 1)).reshape(-1)
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=8
    )
)

In [None]:
# Build ensemble member #9 and add it to `ensemble`.
# Targets go through `label_scaler` as an (N, 1) column and are flattened back
# to 1-D before being handed to `mixup` (training) or passed directly
# (validation).
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(y_train.reshape(-1, 1)).reshape(-1),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        # The validation pair is not routed through `mixup`; only its targets
        # are scaled.
        validset=(
            X_val,
            label_scaler.transform(y_val.reshape(-1, 1)).reshape(-1)
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=9
    )
)

In [None]:
# Build ensemble member #10 and add it to `ensemble`.
# Targets go through `label_scaler` as an (N, 1) column and are flattened back
# to 1-D before being handed to `mixup` (training) or passed directly
# (validation).
ensemble.append(
    build_regressor(
        trainset=mixup(
            X=X_train,
            y=label_scaler.transform(y_train.reshape(-1, 1)).reshape(-1),
            mixup_coeff=0.1,
            n_samples=40000
        ),
        # The validation pair is not routed through `mixup`; only its targets
        # are scaled.
        validset=(
            X_val,
            label_scaler.transform(y_val.reshape(-1, 1)).reshape(-1)
        ),
        batch_size=MINIBATCH_SIZE,
        ensemble_idx=10
    )
)

In [None]:
# Persist every ensemble member's weights to "<ensemble_name><k>.h5", where k
# is the member's 1-based position in `ensemble`.
for idx, cur in enumerate(ensemble):
    cur.save_weights('{0}{1}.h5'.format(ensemble_name, idx + 1))

In [None]:
# Write the submission CSV: a header row followed by one "id,target" row per
# test item. The test set is consumed part by part so that only a single part
# is tokenized and held in memory at any time.
with codecs.open(submission_name, mode='w', encoding='utf-8') as fp:
    data_writer = csv.writer(fp, quotechar='"', delimiter=',')
    data_writer.writerow(['id', 'target'])
    for data_part in load_data_for_testing(
        testset_name, pretrained_tokenizer, MINIBATCH_SIZE * 8
    ):
        labels_for_submission, tokens_for_submission = tokenize_data(
            data=data_part,
            tokenizer=pretrained_tokenizer,
            max_seq_len=MAX_TEXT_LEN
        )
        del data_part  # the raw part is no longer needed once tokenized

        # `do_predictions` yields, per identifier, a pair of
        # (base prediction, feature vector for the ensemble regressors).
        predictions_for_submission = do_predictions(
            regressor=model_for_inference,
            feature_scaler=text_feature_scaler,
            output_scaler=label_scaler,
            batch_size=MINIBATCH_SIZE,
            data=labels_for_submission,
            token_matrix=tokens_for_submission,
            pad_id=pretrained_tokenizer.pad_token_id
        )

        # Split the mapping into three parallel sequences, keeping its order.
        identifiers = []
        final_predictions = []
        features_for_regressor = []
        for cur_id, (predicted, nn_features) in predictions_for_submission.items():
            identifiers.append(cur_id)
            final_predictions.append(predicted)
            features_for_regressor.append(nn_features)
        final_predictions = np.array(final_predictions, dtype=np.float32)
        features_for_regressor = np.vstack(features_for_regressor)

        # Add every ensemble member's output (mapped back through
        # `label_scaler.inverse_transform`) onto the base prediction in place,
        # then average over the base model plus all members.
        for cur in ensemble:
            final_predictions += label_scaler.inverse_transform(
                cur.predict(features_for_regressor, batch_size=MINIBATCH_SIZE)
            ).reshape((final_predictions.shape[0],))
        final_predictions /= (len(ensemble) + 1.0)

        assert len(identifiers) == final_predictions.shape[0]
        data_writer.writerows(
            [cur_id, str(predicted)]
            for cur_id, predicted in zip(identifiers, final_predictions.tolist())
        )

        # Release per-part objects before the next part is loaded.
        del predictions_for_submission, labels_for_submission, tokens_for_submission
        gc.collect()