# **Import Libraries**

In [None]:
import os
import gc
import sys
import time
import shutil

import random
import pickle

from tqdm import tqdm as print_progress
from glob import glob

import dask.dataframe as dd
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
os.environ['TF_KERAS'] = '1'

import tensorflow as tf

from tensorflow.keras import backend as K
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.metrics import TopKCategoricalAccuracy
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [None]:
from tensorflow.keras.layers import (
    Layer, 
    Input, InputLayer, Embedding, 
    Dropout, Dense, 
    Dot, Concatenate, Average, Add,
    Bidirectional, LSTM,
    Lambda, Reshape
)
from tensorflow.keras.activations import softmax, sigmoid
from tensorflow.keras.initializers import Identity, GlorotNormal
from tensorflow.keras.utils import plot_model

In [None]:
# # detect and init the TPU
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(tpu)
# tf.tpu.experimental.initialize_tpu_system(tpu)

# # instantiate a distribution strategy
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
pip install stellargraph

In [None]:
from stellargraph.layer import GraphAttention
from stellargraph.utils import plot_history

In [None]:
pip install gradient-centralization-tf

# **Load data**

In [None]:
# Root folder of the dataset; every split is stored as one or more CSV shards.
datasets_path = '../input/hotel-comment'

# Split name -> pandas DataFrame holding that split.
sample_dfs = {}
for dataset in ('training', 'valuating', 'testing'):
    print(f'\n\n\nProcessing {dataset}-set ...')
    # The glob pattern picks up every shard of the split; `.compute()` turns
    # the lazy dask frame into an in-memory pandas DataFrame.
    shard_pattern = os.path.join(datasets_path, f'{dataset}_data*.csv')
    split_df = dd.read_csv(shard_pattern).compute()
    sample_dfs[dataset] = split_df
    print(f"{dataset}-set contains {len(split_df)} samples")
    # Quick visual sanity check on three random rows.
    print(split_df.sample(n=3))

In [None]:
# Restore the fitted label encoder that ships with the dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
filename = os.path.join(datasets_path, 'label_encoder.pkl')
# Use a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(filename, 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)
# Label names, in the order used by the encoder.
labels = list(label_encoder.classes_)

# **Pretrained BERT**

In [None]:
pip install keras-bert

In [None]:
from tensorflow.keras import backend as K

from keras_bert import (
    PretrainedList, 
    get_pretrained, 
    get_checkpoint_paths,
    load_trained_model_from_checkpoint, 
    load_vocabulary,
    extract_embeddings,
    Tokenizer
)

In [None]:
# Name of the pretrained checkpoint folder under ../input/bert-pretrained.
bert_version = 'uncased_L-4_H-512_A-8'
bert_model_path = os.path.join('../input/bert-pretrained', bert_version)
# Resolves the config / checkpoint / vocab file locations inside that folder.
paths = get_checkpoint_paths(bert_model_path)

# Rebuild the network from its config and load the checkpoint weights.
bert_model = load_trained_model_from_checkpoint(
    checkpoint_file=paths.checkpoint,
    config_file=paths.config,
    output_layer_num=1,
)

vocabs = load_vocabulary(paths.vocab)
# The tokenizer is case-sensitive only when the checkpoint name does not
# contain 'uncased'.
tokenizer = Tokenizer(vocabs, cased='uncased' not in bert_version)

In [None]:
def process_word_embeddings(sentences: list, tokenizer, bert_model,
                            seq_len: int=512, n_pads=10, reduce_output: bool=False):
    """Encode sentences with `tokenizer` and embed them with `bert_model`.

    Args:
        sentences: texts to embed.
        tokenizer: object exposing `encode(text, max_len=...)` that returns a
            `(token_ids, segment_ids)` pair of lists.
        bert_model: object exposing `predict([tokens, segments])`.
        seq_len: length every id sequence is zero-padded to.
        n_pads: extra rows kept after the non-zero tokens when the output
            is clipped.
        reduce_output: when False, return the raw prediction array; when
            True, return a list with one array per sentence, clipped to its
            first `min(seq_len, non-zero tokens + n_pads)` rows.

    Returns:
        Whatever `bert_model.predict` returns, or the list of clipped
        per-sentence arrays described above.
    """
    token_rows, segment_rows, kept_lengths = [], [], []
    for text in sentences:
        token_ids, segment_ids = tokenizer.encode(text, max_len=seq_len)
        # Rows to keep if clipping is requested: real tokens plus some slack.
        kept_lengths.append(min(seq_len, np.count_nonzero(token_ids) + n_pads))
        # Right-pad with zeros; a no-op for sequences already `seq_len` long
        # (or longer, since a negative repeat count yields an empty list).
        token_ids.extend([0] * (seq_len - len(token_ids)))
        segment_ids.extend([0] * (seq_len - len(segment_ids)))
        token_rows.append(token_ids)
        segment_rows.append(segment_ids)

    # Single forward pass over the whole list of sentences.
    predictions = bert_model.predict([np.array(token_rows), np.array(segment_rows)])
    if not reduce_output:
        return predictions

    # Clip each sentence to its useful prefix to save memory.
    return [
        sentence_pred[:length, :]
        for sentence_pred, length in zip(predictions, kept_lengths)
    ]

In [None]:
# One clipped embedding array per label name.
labels_vector = process_word_embeddings(labels, tokenizer, bert_model, reduce_output=True)
# Collapse every label to a single vector: the mean over its token rows,
# ignoring rows that are entirely zero.
labels_vector = [
    np.mean(token_vecs[np.any(token_vecs != 0, axis=1)], axis=0)
    for token_vecs in labels_vector
]
# Stack the label vectors row-wise, then add a leading axis of size 1.
labels_matrix = np.vstack(labels_vector)[np.newaxis, ...]
# np.save(os.path.join(datasets_path, 'labels_embeddings.npy'), labels_matrix)
labels_matrix.shape

# **Data Generator**

In [None]:
import sklearn
from ast import literal_eval
from tensorflow.keras.utils import Sequence, to_categorical

class DataGenerator(Sequence):
    """Keras `Sequence` that yields batches for the MAGNET model.

    Each item is `([embeddings, labels_fixed], targets)` where `embeddings`
    are the word embeddings of the batch comments (computed on the fly with
    `process_word_embeddings`), `labels_fixed` is the constant label-embedding
    tensor and `targets` are label-smoothed multi-hot vectors.
    """

    def __init__(self, data_df: pd.DataFrame,
                 tokenizer,
                 word_embedder,
                 labels_fixed: np.array,
                 batch_size: int = 64,
                 shuffle: bool = True):
        """
        Args:
            data_df: frame with a 'Comment' text column and a 'label_encoder'
                column holding the stringified list of class ids.
            tokenizer: tokenizer forwarded to `process_word_embeddings`.
            word_embedder: model forwarded to `process_word_embeddings`.
            labels_fixed: label embeddings, 2D or 3D; a 2D array gets a
                leading axis of size 1.
            batch_size: number of samples per batch.
            shuffle: reshuffle the sample order at the end of every epoch.

        Raises:
            ValueError: if `labels_fixed` is neither 2D nor 3D.
        """
        self.data_df = data_df
        if len(labels_fixed.shape) == 2:
            labels_fixed = np.expand_dims(labels_fixed, axis=0)
        elif len(labels_fixed.shape) != 3:
            raise ValueError("Shape of `labels_fixed` must be 2D or 3D")
        self.labels_fixed = labels_fixed
        self.tokenizer = tokenizer
        self.word_embedder = word_embedder
        self.max_seq_length, \
        self.embedding_dim = K.int_shape(self.word_embedder.outputs[0])[1:]
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Row *positions*, not index labels: a frame assembled from several
        # CSV shards can carry repeated index labels, and label-based `.loc`
        # selection would then return more rows than requested.
        self.indices = np.arange(len(self.data_df))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch.

        A trailing partial batch is dropped (floor division).
        """
        return int(len(self.data_df) // self.batch_size)

    def __getitem__(self, index):
        """Build batch number `index`."""
        # Row positions belonging to this batch
        start_index = self.batch_size * index
        end_index = self.batch_size * (index+1)
        batch_positions = self.indices[start_index:end_index]

        # Positional selection always yields exactly one row per position
        samples = self.data_df.iloc[batch_positions][['Comment', 'label_encoder']]
        labels = samples.label_encoder.values.tolist()
        texts = samples.Comment.values.tolist()
        embeddings = process_word_embeddings(texts, self.tokenizer, self.word_embedder, reduce_output=False)

        # Encoding multi-class labels
        n_classes = self.labels_fixed.shape[-2]
        mClss_labels = []
        for raw_label in labels:
            # Two textual layouts are handled: "[1, 2]" (comma separated,
            # parsed as a Python literal) and "[1 2]" (space separated).
            if ',' in raw_label:
                class_ids = literal_eval(raw_label)
            else:
                class_ids = [int(ch) for ch in raw_label[1:-1].split()]

            # Sum of one-hot rows -> multi-hot vector, then smooth it
            multi_hot = np.sum(to_categorical(class_ids, num_classes=n_classes), axis=0)
            mClss_labels.append(self.smooth_labels(multi_hot))
        mClss_labels = np.array(mClss_labels)

        # Drop the batch-local references and collect eagerly to keep the
        # memory footprint down between batches.
        del samples, labels, texts
        _ = gc.collect()
        return [embeddings, self.labels_fixed], mClss_labels

    def smooth_labels(self, labels, factor=0.1):
        """Return label-smoothed targets without modifying `labels`.

        Every entry is scaled by `1 - factor` and shifted by
        `factor / labels.shape[-1]`.
        """
        return labels * (1 - factor) + (factor / labels.shape[-1])

    def on_epoch_end(self):
        """Reshuffle the sample order after each epoch when `shuffle` is set."""
        if self.shuffle:
            self.indices = sklearn.utils.shuffle(self.indices)

In [None]:
# One batch generator per split. The frames already loaded into `sample_dfs`
# are reused: reading the same CSV shards a second time only costs extra I/O
# and keeps a second copy of every split in memory.
data_generator = dict()
for dataset in ['training', 'valuating', 'testing']:
    data_generator[dataset] = DataGenerator(sample_dfs[dataset],
                                            tokenizer=tokenizer,
                                            word_embedder=bert_model,
                                            labels_fixed=labels_matrix,
                                            batch_size=64,
                                            # only the training split is reshuffled between epochs
                                            shuffle=(dataset == 'training'))

In [None]:
# Number of full batches per epoch for the training and validation generators.
len(data_generator['training']), len(data_generator['valuating'])

# **Load Model**

In [None]:
class CyclicLR(Callback):
    """
    This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with some constant frequency, 
        as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or per-cycle basis.
    
    This class has three built-in policies, selected with `mode`:
    "original" (or any value other than the two below):
        A basic triangular cycle w/ no amplitude scaling.
    "halving":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exponential":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.

    For more detail, please read the paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='original')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {original, halving, exponential}.
            Default 'original'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in the 'exponential' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            When given, the mode parameter is ignored.
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
            Only used together with a custom scale_fn; the built-in
            modes set their own scale_mode.
    """
    def __init__(self, base_lr=0.001, max_lr=0.1, step_size=2000., mode='original',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            # Built-in policies: each one fixes both the scaling function
            # and the quantity it is evaluated on.
            if self.mode == 'halving':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exponential':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None, new_step_size=None):
        """
        Resets cycle iterations.
            Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Return the learning rate for the current cycle iteration."""
        # 1-based index of the current cycle (one cycle = 2 * step_size iterations)
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # Distance from the cycle peak: 1 at the boundaries, 0 at the peak
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        amplitude = (self.max_lr-self.base_lr)*np.maximum(0, (1-x))
        if self.scale_mode == 'cycle':
            return self.base_lr + amplitude*self.scale_fn(cycle)
        else:
            return self.base_lr + amplitude*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs=None):
        """Set the starting learning rate on the model's optimizer.

        A fresh run starts at `base_lr`; when iterations were already
        counted (the callback is reused across several fits) the schedule
        resumes where it stopped.
        """
        logs = logs or {}
        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Record the step in `history` and move the learning rate one step.

        NOTE: the first positional argument is named `epoch` for backward
        compatibility; its value is not used here.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        # Log the rate that was in effect for the batch that just finished
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        new_lr = self.clr()
        K.set_value(self.model.optimizer.lr, new_lr)

In [None]:
class Adjacency(Layer):
    """Layer holding a trainable `(1, nodes, nodes)` adjacency matrix.

    The layer ignores the values of its input; calling it simply returns the
    current matrix, so the matrix can be wired into a functional model and
    learned together with the other weights.

    Args:
        nodes: number of graph nodes.
        weights: optional `(nodes, nodes)` initial matrix; cast to float32.
        init_method: initialiser used when `weights` is None, one of
            'identity', 'xavier' or 'glorot' (case-insensitive).
        **kwargs: forwarded to `Layer.__init__` (e.g. `name`).

    Raises:
        AssertionError: if `weights` does not have shape `(nodes, nodes)`.
        ValueError: if `init_method` is not a supported name.
    """

    def __init__(self, nodes=1, weights=None, init_method='identity', **kwargs):
        super(Adjacency, self).__init__(**kwargs)

        self.shape = (1, nodes, nodes)

        if weights is not None:
            assert weights.shape==(nodes, nodes), \
                f'Adjacency Matrix must have shape ({nodes}, {nodes})' + \
                f' while its shape is {weights.shape}'
            # Cast explicitly: the variable below is float32, while e.g. a
            # default numpy array is float64.
            w_init = tf.cast(tf.convert_to_tensor(weights), tf.float32)
        else:
            init_method = init_method.lower()
            if init_method == 'identity':
                initializer = tf.initializers.Identity()
            elif init_method in ['xavier', 'glorot']:
                initializer = tf.initializers.GlorotNormal()
            else:
                # Previously fell through and failed later with an unbound
                # `initializer` name.
                raise ValueError(
                    "`init_method` must be 'identity', 'xavier' or 'glorot', "
                    f"got {init_method!r}")
            w_init = initializer(shape=(nodes, nodes))

        # Leading axis of size 1 so the matrix lines up with the
        # (1, n_labels, ...) label tensors.
        self.w = tf.Variable(
            initial_value=tf.expand_dims(w_init, axis=0), 
            dtype="float32", trainable=True
        )

    def call(self, inputs):
        """Return the adjacency matrix; `inputs` is not used."""
        return tf.convert_to_tensor(self.w)

    def compute_output_shape(self, input_shape=None):
        """Return the fixed output shape `(1, nodes, nodes)`.

        `input_shape` is accepted because Keras passes it, but it has no
        influence on the result.
        """
        return self.shape

In [None]:
def buil_MAGNET(n_labels,
                embedding_dim: int,
                sequence_length: int=512, 
                lstm_units: int=64,
                dropout_rates=[0.2, 0.3],
                attention_heads=[4, 2],
                adjacency_matrix=None,
                adjacency_generation='xavier', # 'identity' or 'xavier' or 'glorot'
                feed_text_embeddings=True, # if False, add additional Embedding layer
                text_embeddings_matrix=None, # initialized weights for text Embedding layer
                feed_label_embeddings=True, # if False, add additional Embedding layer
                label_embeddings_matrix=None, # initialized weights for label Embedding layer
                ) -> Model:
    """Assemble the MAGNET multi-label classifier as a Keras functional model.

    The graph has three parts: a sentence branch (dropout -> bidirectional
    LSTM -> mean over axis 1), a label branch (dense reduction -> two
    GraphAttention layers driven by a trainable `Adjacency` matrix, with a
    residual connection around the first one) and a prediction head (einsum
    of both representations followed by a sigmoid).

    Args:
        n_labels: number of labels, i.e. graph nodes.
        embedding_dim: width of the word and label embeddings.
        sequence_length: number of tokens per sample.
        lstm_units: units of each LSTM direction.
        dropout_rates: `[0]` is used for the embedding dropouts, `[1]` for the
            input and attention dropout of both GraphAttention layers.
        attention_heads: heads of the first and second GraphAttention layer;
            a single int is used for both.
        adjacency_matrix: optional initial weights handed to `Adjacency`.
        adjacency_generation: initialiser name handed to `Adjacency`.
        feed_text_embeddings: if True the model takes ready-made word
            embeddings of shape `(sequence_length, embedding_dim)`; otherwise
            it takes a `(sequence_length,)` input followed by an Embedding layer.
        text_embeddings_matrix: initial Embedding weights; only used when its
            shape is exactly `(sequence_length, embedding_dim)`, otherwise it
            is silently ignored.
        feed_label_embeddings: same switch as above for the label input.
        label_embeddings_matrix: initial label Embedding weights; only used
            when its shape is exactly `(n_labels, embedding_dim)`.

    Returns:
        A `Model` named 'MAGNET' with inputs `[word_inputs, label_inputs]`
        and the sigmoid prediction as output.

    Raises:
        ValueError: if `attention_heads` is not an int, list or tuple.
    """

    # Normalise `attention_heads` to a pair: one entry per GraphAttention layer
    if isinstance(attention_heads, int):
        attention_heads = [attention_heads, attention_heads]
    if not isinstance(attention_heads, (list, tuple)):
        raise ValueError('`attention_heads` must be INT, LIST or TUPLE')

    # 1. Sentence Representation
    if feed_text_embeddings:
        # Pre-computed embeddings are fed directly; only dropout is applied.
        sentence_model = Sequential(name='sentence_model')
        sentence_model.add(Dropout(dropout_rates[0], input_shape=(sequence_length, embedding_dim), name='word_embeddings'))
        word_inputs, word_embeddings = sentence_model.inputs, sentence_model.outputs
    else:
        word_inputs = Input(shape=(sequence_length, ), name='word_inputs')
        # NOTE(review): `input_dim` (and the expected shape of
        # `text_embeddings_matrix`) is `sequence_length`, whereas an
        # Embedding's `input_dim` is normally the vocabulary size -- confirm
        # this is intended.
        embedding_args = {
            'input_dim': sequence_length,
            'output_dim': embedding_dim,
            'name': 'word_embeddings'
        }
        if text_embeddings_matrix is not None \
            and text_embeddings_matrix.shape==(sequence_length, embedding_dim):
            embedding_args['weights'] = [text_embeddings_matrix]
        word_embeddings = Embedding(**embedding_args)(word_inputs)
        word_embeddings = Dropout(dropout_rates[0], name='WE_dropout')(word_embeddings)

    forward_rnn = LSTM(units=lstm_units, return_sequences=True, name='forward_rnn')
    backward_rnn = LSTM(units=lstm_units, return_sequences=True, name='backward_rnn', go_backwards=True)
    bidir_rnn = Bidirectional(layer=forward_rnn, backward_layer=backward_rnn, merge_mode="concat", name='bidir_rnn')
    
    sentence_repr = bidir_rnn(word_embeddings)
    # Average over axis 1 to get one vector per sample
    sentence_repr = K.mean(sentence_repr, axis=1)
    # print(f"sentence_repr: {K.int_shape(sentence_repr)}")

    # 2. Labels Representation
    if feed_label_embeddings:
        # Fixed batch size of 1: the same label set is shared by every sample
        label_inputs = Input(batch_shape=(1, n_labels, embedding_dim), name='label_embeddings')
        label_embeddings = label_inputs
    else:
        label_inputs = Input(batch_shape=(1, n_labels), name='label_inputs')
        embedding_args = {'input_dim': n_labels,
                          'output_dim': embedding_dim,
                          'name': 'label_embeddings'}
        if label_embeddings_matrix is not None \
            and label_embeddings_matrix.shape==(n_labels, embedding_dim):
            embedding_args['weights'] = [label_embeddings_matrix]
        label_embeddings = Embedding(**embedding_args)(label_inputs)
        label_embeddings = Dropout(rate=dropout_rates[0], name='LE_dropout')(label_embeddings)
    # Project the label embeddings down to a quarter of their width
    label_embeddings = Dense(units=embedding_dim//4, name='label_embeddings_reduced')(label_embeddings)
    # print(f"label_inputs: {K.int_shape(label_inputs)}")

    # `Adjacency.call` ignores its input; passing `label_inputs` only
    # connects the layer to the functional graph.
    label_correlation = Adjacency(nodes=n_labels, 
                                  weights=adjacency_matrix,
                                  init_method=adjacency_generation)(label_inputs)
    # print(f"label_correlation: {K.int_shape(label_correlation)}")

    # NOTE(review): the GraphAttention layers get no explicit name, while
    # `MAGNET.finetune` selects layers by the names 'graph_attention' and
    # 'graph_attention_1'; presumably these are the auto-generated names, which
    # depend on creation order -- keep the order of these two layers.
    # units * heads = embedding_dim//4 when divisible; presumably the heads are
    # concatenated so the output width matches the residual Add below -- TODO confirm.
    label_attention = GraphAttention(units=embedding_dim//4//attention_heads[0],
                                     activation='tanh',
                                     attn_heads=attention_heads[0],
                                     in_dropout_rate=dropout_rates[1],
                                     attn_dropout_rate=dropout_rates[1], )([label_embeddings, label_correlation])
    # print(f"label_attention: {K.int_shape(label_attention)}")

    label_residual = Add(name='label_residual')([label_attention, label_embeddings])
    # print(f"label_residual: {K.int_shape(label_residual)}")

    # Second attention layer: heads are averaged and the width is set to
    # 2*lstm_units, the same `k` dimension the einsum below contracts with
    # the sentence representation.
    label_repr = GraphAttention(units=2*lstm_units,
                                activation='tanh',
                                attn_heads_reduction='average',
                                attn_heads=attention_heads[1],
                                in_dropout_rate=dropout_rates[1],
                                attn_dropout_rate=dropout_rates[1], )([label_residual, label_correlation])

    # Collapse the leading axis (declared with size 1 on the label input)
    label_repr = K.sum(label_repr, axis=0, keepdims=False)
    # print(f"label_repr: {K.int_shape(label_repr)}")

    # 3. Prediction
    # Dot product of every sentence vector (B) with every label vector (N)
    prediction = tf.einsum('Bk,Nk->BN', sentence_repr, label_repr)
    prediction = sigmoid(prediction)
    # print(f"prediction: {K.int_shape(prediction)}")

    return Model(inputs=[word_inputs, label_inputs], outputs=prediction, name='MAGNET')

In [None]:
import gctf

class MAGNET:
    """Training and inference wrapper around the model built by `buil_MAGNET`."""

    def __init__(self, n_labels: int, embedding_dim: int):
        """Build the underlying Keras model and print its summary.

        Args:
            n_labels: number of labels to predict.
            embedding_dim: width of the word / label embeddings.
        """
        self.embedding_dim = embedding_dim

        # Build model(s)
        print(f"\n\n\nBuilding MAGNET ...\n\n\n")
        self.model = buil_MAGNET(n_labels, embedding_dim=embedding_dim, sequence_length=512, lstm_units=32)
        self.model.summary()

    def _compile_model(self):
        """(Re)compile the Keras model with a fresh optimizer and metrics."""
        # NOTE(review): the banner printed by `compile` announces a weighted
        # cross-entropy, but the loss in use is `categorical_crossentropy`
        # -- confirm which one is intended.
        self.model.compile(optimizer=gctf.optimizers.adam(learning_rate=0.001), 
                           # optimizer=Adam(learning_rate=0.001), 
                           metrics=["accuracy", TopKCategoricalAccuracy(k=3)],
                           loss=categorical_crossentropy)

    def compile(self, model_saved: str, logs_path: str, schedule_step: int, verbose: int=1):
        """Compile the model and return the training callbacks.

        Args:
            model_saved: checkpoint file-name pattern for `ModelCheckpoint`.
            logs_path: TensorBoard log directory.
            schedule_step: half-cycle length (in batches) of the cyclic
                learning-rate schedule.
            verbose: currently unused (only referenced by disabled callbacks).

        Returns:
            List of callbacks: TensorBoard, CyclicLR and ModelCheckpoint.
        """
        # Compile optimizer, loss & metric functions
        print(f"Compiling MAGNET using \n\tgrad-centralized ADAM, \n\ttop-k Accuracy, \n\tweighted Cross-Entropy \n...")
        self._compile_model()

        # Define Callbacks
        return [
            TensorBoard(log_dir=logs_path),
            # ReduceLROnPlateau(monitor='loss', factor=0.1, patience=3, verbose=verbose),
            CyclicLR(mode='exponential', base_lr=1e-7, max_lr=1e-3, step_size=schedule_step),
            ModelCheckpoint(filepath=model_saved, monitor='accuracy', save_weights_only=True, save_best_only=False, save_freq='epoch'),
            # LearningRateScheduler(noam_scheme),
            # EarlyStopping(monitor='val_accuracy', mode='max', restore_best_weights=True, min_delta=1e-7, patience=7, verbose=verbose),
        ]

    def finetune(self, train_generator, val_generator, model_saved: str, logs_path: str, n_loops: int=3, verbose: int=1):
        """Train with an alternating freeze schedule.

        Every loop runs four phases of 5 epochs each: all layers, the
        label-attention layers only, the sentence layers only, then all
        layers again.

        Args:
            train_generator: training batches; `len()` gives steps per epoch.
            val_generator: validation batches.
            model_saved: checkpoint file-name pattern.
            logs_path: TensorBoard log directory.
            n_loops: number of times the four-phase schedule is repeated.
            verbose: forwarded to `compile`.

        Returns:
            List with one Keras `History` per phase, in execution order.
        """
        # Compile; at least 1 so the cyclic schedule never divides by zero
        schedule_step = max(1, len(train_generator) // 2)
        custom_callbacks = self.compile(model_saved, logs_path, schedule_step, verbose)

        # Define part(s) of layers for fine-tuning
        label_layers = ['adjacency', 'graph_attention', 'graph_attention_1']
        sentence_layers = ['bidir_rnn', 'label_embeddings_reduced']
        train_histories = []

        # (message, names of the layers left trainable); None means ALL layers
        phases = [
            ("Step 1: Training ALL layers ...", None),
            ("Step 2: Training LABEL-ATTENTION layers ...", label_layers),
            ("Step 3: Training SENTENCE layers ...", sentence_layers),
            ("Step 4: Training ALL layers ...", None),
        ]
        epochs_per_phase = 5

        ######################################
        #             FINE-TUNING            #
        ######################################

        print(f"[Fine-tuning MAGNET]")
        # NOTE(review): `fit_generator` is deprecated in recent tf.keras in
        # favour of `fit`; kept to match the TF version this notebook targets.
        train_args = {
            'generator': train_generator,
            'steps_per_epoch': len(train_generator),
            'validation_data': val_generator,
            'validation_steps': len(val_generator),
            'callbacks': custom_callbacks
        }
        # Selection the model was last compiled with; the sentinel forces a
        # compile before the very first phase.
        compiled_selection = object()
        for loop_idx in range(n_loops):

            print(f"Training loop {loop_idx+1}")

            for phase_idx, (message, selection) in enumerate(phases):
                for layer in self.model.layers:
                    layer.trainable = selection is None or layer.name in selection

                # Keras only picks up changes to `trainable` on the next
                # compile(); without this the "frozen" layers keep training.
                # Recompiling creates a fresh optimizer, so it is done only
                # when the trainable set actually changed.
                if selection != compiled_selection:
                    self._compile_model()
                    compiled_selection = selection

                print(f"\t{message}")
                first_epoch = (loop_idx * len(phases) + phase_idx) * epochs_per_phase
                train_history = self.model.fit_generator(initial_epoch=first_epoch,
                                                         epochs=first_epoch + epochs_per_phase,
                                                         **train_args)
                train_histories.append(train_history)

            # Reduce learning rate
            # custom_callbacks[0].base_lr /= 1.69
            # custom_callbacks[0].max_lr /= 1.69

        return train_histories

    def train(self, train_generator, val_generator, 
                    model_saved: str, logs_path: str,
                    max_epochs: int=50, verbose: int=1):
        """Train all layers for `max_epochs` epochs.

        Args:
            train_generator: training batches; `len()` gives steps per epoch.
            val_generator: validation batches.
            model_saved: checkpoint file-name pattern.
            logs_path: TensorBoard log directory.
            max_epochs: number of epochs to run.
            verbose: forwarded to `compile`.

        Returns:
            The Keras `History` of the run.
        """
        # Compile; at least 1 so the cyclic schedule never divides by zero
        schedule_step = max(1, len(train_generator)*2)
        custom_callbacks = self.compile(model_saved, logs_path, schedule_step, verbose)

        # Training
        train_history = self.model.fit_generator(generator=train_generator,
                                                 steps_per_epoch=len(train_generator),
                                                 validation_data=val_generator,
                                                 validation_steps=len(val_generator),
                                                 callbacks=custom_callbacks, 
                                                 epochs=max_epochs,
                                                 initial_epoch=0)
        return train_history

    def load_weights(self, weight_path: str):
        """Load model weights from `weight_path`."""
        self.model.load_weights(weight_path)

    def predict(self, label_embeddings: np.array, sent_embeddings: np.array):
        """Score every label for a single sentence.

        Args:
            label_embeddings: label-embedding input of the model.
            sent_embeddings: word embeddings of one sentence; reshaped to
                `(1, 512, embedding_dim)`.

        Returns:
            List with the predicted score of each label.
        """
        sent_embeddings = np.reshape(sent_embeddings, (1, 512, self.embedding_dim))
        preds = self.model.predict([sent_embeddings, label_embeddings]).tolist()
        return preds[0]

In [None]:
def weighted_cross_entropy(y_true, y_pred, pos_weight=1.618):
    """Binary cross-entropy with a heavier weight on the positive class.

    Args:
        y_true: target values in [0, 1].
        y_pred: predicted probabilities.
        pos_weight: multiplier applied to the positive-class term.

    Returns:
        Mean of the element-wise losses, each capped at 9.7.
    """
    # Keep the probabilities strictly inside (0, 1): log(0) is -inf and
    # 0 * -inf is nan, which the clip below would not remove.
    y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())
    losses = y_true * -K.log(y_pred) * pos_weight + (1-y_true) * -K.log(1-y_pred)
    # Cap the per-element loss so a few confident mistakes cannot dominate
    losses = K.clip(losses, 0.0, 9.7)
    return K.mean(losses)

In [None]:
# Number of labels = size of axis 1 of the (1, n_labels, dim) label matrix.
N_LABELS = labels_matrix.shape[1]
# Sequence length and hidden size are read off BERT's output tensor shape
# (the batch axis is dropped by the `[1:]`).
max_seq_len, embedding_dim = K.int_shape(bert_model.outputs[0])[1:]

# Builds the Keras model and prints its summary.
model = MAGNET(n_labels=N_LABELS, embedding_dim=embedding_dim)

In [None]:
def noam_scheme(global_step, init_lr, warmup_steps=16):
    """
    Noam learning-rate schedule: linear warm-up, then inverse-sqrt decay.
        global_step: (scalar) current training step, counted from 0
        init_lr: (scalar) learning rate reached at the end of warm-up
        warmup_steps: (scalar) number of steps over which the rate grows to init_lr
    """
    step = tf.cast(global_step+1, dtype=tf.float32, name="global_step")
    # Grows linearly with the step; this is the smaller term during warm-up.
    warmup_term = step*(warmup_steps**-1.5)
    # Shrinks with the inverse square root; the smaller term afterwards.
    decay_term = step**-0.5
    peak_scale = init_lr * (warmup_steps**0.5)
    return peak_scale * tf.minimum(warmup_term, decay_term)

# **Train**

In [None]:
# Output locations inside the Kaggle working directory.
models_path = '/kaggle/working/models'
logs_path = '/kaggle/working/logs'
pred_dir = '/kaggle/working/predictions'

# Create whichever folders are missing (existing ones are left untouched).
for output_dir in (models_path, logs_path, pred_dir):
    os.makedirs(output_dir, exist_ok=True)

# Checkpoint file-name pattern; the placeholders are filled in by
# ModelCheckpoint with the epoch number and logged metrics.
model_format = 'ep={epoch:03d}_acc={accuracy:.3f}_val_acc={val_accuracy:.3f}_topk={top_k_categorical_accuracy:.3f}_val_topk={val_top_k_categorical_accuracy:.3f}.h5'
model_saved = os.path.join(models_path, model_format)

In [None]:
# Run the alternating fine-tuning schedule on the training / validation
# generators; `finetune` returns a list with one Keras History per phase.
train_history = model.finetune(data_generator['training'], data_generator['valuating'], 
                               model_saved=model_saved, logs_path=logs_path, n_loops=3, verbose=1)

In [None]:
# Zip the whole working directory into /kaggle/working/data.zip.
# NOTE(review): the archive is written inside the directory being archived --
# confirm it does not end up containing a partial copy of itself.
os.chdir(r'/kaggle/working')
dir_path = '/kaggle/working/'
shutil.make_archive(dir_path+"data", 'zip', dir_path)