In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import  LabelEncoder
from tqdm.auto import tqdm
import random
import os
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import dill
import tensorflow.keras.backend as K
from tqdm.auto import tqdm
from tensorflow.keras import mixed_precision
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import AutoTokenizer, AutoConfig,TFAutoModel
import json
from sklearn.model_selection import StratifiedKFold,KFold
import gc
import string
import tensorflow_addons as tfa
import re
import ast
#from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
%env TOKENIZERS_PARALLELISM=true

In [None]:
from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits
from sklearn.utils.validation import check_random_state, column_or_1d
from sklearn.utils.multiclass import type_of_target
from collections import defaultdict

class StratifiedGroupKFold(_BaseKFold):
    """Stratified K-Folds iterator variant with non-overlapping groups.
    This cross-validation object is a variation of StratifiedKFold that
    attempts to return stratified folds with non-overlapping groups. The folds
    are made by preserving the percentage of samples for each class.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    The difference between GroupKFold and StratifiedGroupKFold is that
    the former attempts to create balanced folds such that the number of
    distinct groups is approximately the same in each fold, whereas
    StratifiedGroupKFold attempts to create folds which preserve the
    percentage of samples for each class as much as possible given the
    constraint of non-overlapping groups between splits.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.
    shuffle : bool, default=False
        Whether to shuffle each class's samples before splitting into batches.
        Note that the samples within each split will not be shuffled.
        This implementation can only shuffle groups that have approximately the
        same y distribution, no global shuffle will be performed.
    random_state : int or RandomState instance, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold for each class.
        Otherwise, leave `random_state` as `None`.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import StratifiedGroupKFold
    >>> X = np.ones((17, 2))
    >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8])
    >>> cv = StratifiedGroupKFold(n_splits=3)
    >>> for train_idxs, test_idxs in cv.split(X, y, groups):
    ...     print("TRAIN:", groups[train_idxs])
    ...     print("      ", y[train_idxs])
    ...     print(" TEST:", groups[test_idxs])
    ...     print("      ", y[test_idxs])
    TRAIN: [1 1 2 2 4 5 5 5 5 8 8]
           [0 0 1 1 1 0 0 0 0 0 0]
     TEST: [3 3 3 6 6 7]
           [1 1 1 0 0 0]
    TRAIN: [3 3 3 4 5 5 5 5 6 6 7]
           [1 1 1 1 0 0 0 0 0 0 0]
     TEST: [1 1 2 2 8 8]
           [0 0 1 1 0 0]
    TRAIN: [1 1 2 2 3 3 3 6 6 7 8 8]
           [0 0 1 1 1 1 1 0 0 0 0 0]
     TEST: [4 5 5 5 5]
           [1 0 0 0 0]
    Notes
    -----
    The implementation is designed to:
    * Mimic the behavior of StratifiedKFold as much as possible for trivial
      groups (e.g. when each group contains only one sample).
    * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to
      ``y = [1, 0]`` should not change the indices generated.
    * Stratify based on samples as much as possible while keeping
      non-overlapping groups constraint. That means that in some cases when
      there is a small number of groups containing a large number of samples
      the stratification will not be possible and the behavior will be close
      to GroupKFold.
    See also
    --------
    StratifiedKFold: Takes class information into account to build folds which
        retain class distributions (for binary or multiclass classification
        tasks).
    GroupKFold: K-fold iterator variant with non-overlapping groups.
    """

    def __init__(self, n_splits=5, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def _iter_test_indices(self, X, y, groups):
        """Yield, for each fold, the list of sample indices forming its test set.

        Groups are assigned greedily (most class-imbalanced group first) to the
        fold that keeps the per-class distribution across folds most even.

        Raises
        ------
        ValueError
            If ``y`` is not binary/multiclass, or if ``n_splits`` exceeds the
            number of members of every class.
        """
        # Implementation is based on this kaggle kernel:
        # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation
        # and is a subject to Apache 2.0 License. You may obtain a copy of the
        # License at http://www.apache.org/licenses/LICENSE-2.0
        # Changelist:
        # - Refactored function to a class following scikit-learn KFold
        #   interface.
        # - Added heuristic for assigning group to the least populated fold in
        #   cases when all other criteria are equal
        # - Switch from using python ``Counter`` to ``np.unique`` to get class
        #   distribution
        # - Added scikit-learn checks for input: checking that target is binary
        #   or multiclass, checking passed random state, checking that number
        #   of splits is less than number of members in each class, checking
        #   that least populated class has more members than there are splits.
        rng = check_random_state(self.random_state)
        y = np.asarray(y)
        type_of_target_y = type_of_target(y)
        allowed_target_types = ("binary", "multiclass")
        if type_of_target_y not in allowed_target_types:
            raise ValueError(
                "Supported target types are: {}. Got {!r} instead.".format(
                    allowed_target_types, type_of_target_y
                )
            )

        y = column_or_1d(y)
        _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True)
        if np.all(self.n_splits > y_cnt):
            raise ValueError(
                "n_splits=%d cannot be greater than the"
                " number of members in each class." % (self.n_splits)
            )
        n_smallest_class = np.min(y_cnt)
        if self.n_splits > n_smallest_class:
            # `warnings` is not imported at file level; import it here so this
            # path emits the intended warning instead of raising NameError.
            import warnings

            warnings.warn(
                "The least populated class in y has only %d"
                " members, which is less than n_splits=%d."
                % (n_smallest_class, self.n_splits),
                UserWarning,
            )
        n_classes = len(y_cnt)

        _, groups_inv, groups_cnt = np.unique(
            groups, return_inverse=True, return_counts=True
        )
        # Row g holds how many samples of each class belong to group g.
        y_counts_per_group = np.zeros((len(groups_cnt), n_classes))
        for class_idx, group_idx in zip(y_inv, groups_inv):
            y_counts_per_group[group_idx, class_idx] += 1

        y_counts_per_fold = np.zeros((self.n_splits, n_classes))
        groups_per_fold = defaultdict(set)

        if self.shuffle:
            # NOTE(review): this permutes the rows in place, so after the
            # shuffle row g no longer holds the counts of group g while
            # `groups_inv` still uses the unshuffled numbering. Folds remain
            # group-disjoint, but the balancing below then works on counts of
            # a different group. Left unchanged on purpose: altering it would
            # change the fold assignment previously used for training.
            rng.shuffle(y_counts_per_group)

        # Stable sort to keep shuffled order for groups with the same
        # class distribution variance
        sorted_groups_idx = np.argsort(
            -np.std(y_counts_per_group, axis=1), kind="mergesort"
        )

        for group_idx in sorted_groups_idx:
            group_y_counts = y_counts_per_group[group_idx]
            best_fold = self._find_best_fold(
                y_counts_per_fold=y_counts_per_fold,
                y_cnt=y_cnt,
                group_y_counts=group_y_counts,
            )
            y_counts_per_fold[best_fold] += group_y_counts
            groups_per_fold[best_fold].add(group_idx)

        for i in range(self.n_splits):
            test_indices = [
                idx
                for idx, group_idx in enumerate(groups_inv)
                if group_idx in groups_per_fold[i]
            ]
            yield test_indices

    def _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts):
        """Return the index of the fold that should receive the given group.

        Each fold is tried in turn: the group's class counts are temporarily
        added, the mean (over classes) of the across-fold standard deviation of
        class fractions is measured, and the counts are removed again. The fold
        with the lowest score wins; near-ties go to the fold that currently
        holds fewer samples.
        """
        best_fold = None
        min_eval = np.inf
        min_samples_in_fold = np.inf
        for i in range(self.n_splits):
            y_counts_per_fold[i] += group_y_counts
            # Summarise the distribution over classes in each proposed fold
            std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0)
            y_counts_per_fold[i] -= group_y_counts
            fold_eval = np.mean(std_per_class)
            samples_in_fold = np.sum(y_counts_per_fold[i])
            # Parentheses only make the original `a or b and c` grouping explicit.
            is_current_fold_better = fold_eval < min_eval or (
                np.isclose(fold_eval, min_eval)
                and samples_in_fold < min_samples_in_fold
            )
            if is_current_fold_better:
                min_eval = fold_eval
                min_samples_in_fold = samples_in_fold
                best_fold = i
        return best_fold

In [None]:
# NEW on TPU in TensorFlow 24: shorter cross-compatible TPU/GPU/multi-GPU/cluster-GPU detection code

# Choose the distribution strategy. `tpu` is left as the resolver object when a
# TPU is found and as None otherwise; later cells branch on it (e.g. the
# default batch size of `to_dataset`).
try: # detect TPUs
    tpu  = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    tf.config.experimental_connect_to_cluster(tpu )
    tf.tpu.experimental.initialize_tpu_system(tpu )
    strategy = tf.distribute.TPUStrategy(tpu )
    print('Using TPU')
except ValueError: # detect GPUs
    # Only ValueError is treated as "no TPU"; any other failure propagates.
    tpu = None
    strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    #strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

print("Number of accelerators: ", strategy.num_replicas_in_sync)


# Shared handles used by the input pipeline cells.
AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

In [None]:
# Seed Python, NumPy and TensorFlow RNGs with the same value; `seed` is also
# reused as the random_state of the CV splitter.
seed=1234
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
# Request automatic mixed precision both through the environment variable and
# through the grappler optimizer option.
# NOTE(review): the env var is set after TensorFlow was imported — confirm it
# is still honoured at this point.
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
print('Mixed precision enabled')

In [None]:
# Mode switch: the training cells only run when this is True; when False the
# notebook loads tokenizer, config and fold weights from DATA_PATH and predicts.
TRAIN = False 

 # Load dataframes

In [None]:
# Raw competition tables. `train`/`test` are later merged with `patient_notes`
# (on case_num, pn_num) and `features` (on case_num, feature_num).
features = pd.read_csv("../input/nbme-score-clinical-patient-notes/features.csv")
patient_notes = pd.read_csv("../input/nbme-score-clinical-patient-notes/patient_notes.csv")
test = pd.read_csv("../input/nbme-score-clinical-patient-notes/test.csv")
train= pd.read_csv("../input/nbme-score-clinical-patient-notes/train.csv")
sample_submission= pd.read_csv("../input/nbme-score-clinical-patient-notes/sample_submission.csv")

In [None]:
# https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train
# Manual corrections applied by row label to the frames exactly as read from
# the CSV files, so this cell has to run before `train` is replaced by the
# merged frame. Each `location` value is a string of character spans
# "start end"; `decode_location` later strips the brackets and treats both
# ';' and ',' as span separators.
features.loc[27, 'feature_text'] = 'Last-Pap-smear-1-year-ago'

# incorrect annotation
train.loc[338, 'location'] = ('[[764 783]]')

train.loc[621, 'location'] = ('[[77 100]]')

train.loc[655, 'location'] = ('[[285 292;301 312], [285 287;296 312]]')

train.loc[1262, 'location'] = ('[[551 557;565 580]]')

train.loc[1265, 'location'] = ('[[131 135;181 212]]')

train.loc[1396, 'location'] = ('[[259 280]]')

train.loc[1591, 'location'] = ('[[176 184;201 212]]')

train.loc[1615, 'location'] = ('[[249 257;271 288]]')

train.loc[1664, 'location'] = ('[[822 824;907 924]]')

train.loc[1714, 'location'] = ('[[101 129]]')

train.loc[1929, 'location'] = ('[[531 539;549 561]]')

train.loc[2134, 'location'] = ('[[540 560;581 593]]')

train.loc[2191, 'location'] = ('[[32 57]]')

train.loc[2553, 'location'] = ('[[308 317;376 384]]')

train.loc[3124, 'location'] = ('[[549 557]]')

train.loc[3858, 'location'] = ('[[102 123], [102 112;125 141], [102 112;143 157], [102 112;159 171]]')

train.loc[4373, 'location'] = ('[[33 45]]')

train.loc[4763, 'location'] = ('[[5 16]]')

train.loc[4782, 'location'] = ('[[175 194]]')

train.loc[4908, 'location'] = ('[[700 723]]')

train.loc[6016, 'location'] = ('[[225 250]]')

train.loc[6192, 'location'] = ('[[197 218;236 260]]')

train.loc[6380, 'location'] = ('[[480 482;507 519], [480 482;499 503;512 519], [480 482;521 531], [480 482;533 545], [480 482;564 582]]')

train.loc[6562, 'location'] = ('[[290 320;327 337], [290 320;342 358]]')

train.loc[6862, 'location'] = ('[[288 296;324 363]]')

train.loc[7022, 'location'] = ('[[108 182]]')

train.loc[7422, 'location'] = ('[[102 121]]')

train.loc[8876, 'location'] = ('[[481 483;533 552]]')

train.loc[9027, 'location'] = ('[[92 102], [123 164]]')

train.loc[9938, 'location'] = ('[[89 117], [122 138], [368 402]]')

train.loc[9973, 'location'] = ('[[344 361]]')

train.loc[10513, 'location'] = ('[[600 611], [607 623]]')

train.loc[11551, 'location'] = ('[[386 400;443 461]]')

train.loc[11677, 'location'] = ('[[160 201]]')

train.loc[12124, 'location'] = ('[[325 337;349 366]]')

train.loc[12279, 'location'] = ('[[405 459;488 524]]')

train.loc[12289, 'location'] = ('[[353 400;488 524]]')

train.loc[13238, 'location'] = ('[[293 307], [321 331]]')

train.loc[13297, 'location'] = ('[[182 221], [182 213;225 234]]')

train.loc[13299, 'location'] = ('[[79 88], [409 418]]')

train.loc[13845, 'location'] = ('[[86 94;230 236], [86 94;237 256]]')

train.loc[14083, 'location'] = ('[[56 64;156 179]]')

In [None]:
# Attach the note text (pn_history) and the feature description (feature_text)
# to every (note, feature) row; both frames are replaced by the merged result.
test = test.merge(patient_notes,on=['case_num','pn_num']).merge(features,on=['case_num','feature_num'])
train = train.merge(patient_notes,on=['case_num','pn_num']).merge(features,on=['case_num','feature_num'])

In [None]:
# Display-only: first rows of the merged training frame.
train.head(5)

In [None]:
# Display-only: non-null counts per (case_num, feature_num) pair.
train.groupby(['case_num','feature_num']).count()

# Tokenizer

In [None]:
#MODEL_NAME = "bert-large-uncased"
# Backbone checkpoint name, used when the tokenizer/config/weights are fetched
# by name (see the tokenizer cell and `build_model`).
MODEL_NAME = "roberta-large"
# Directory holding the saved tokenizer, cached data.dill and model{i}.h5
# weights; DATA_EXISTS records whether it is present in this session.
DATA_PATH = "../input/nbmebinary"
DATA_EXISTS = os.path.exists(DATA_PATH)

In [None]:
# When training with a cached dataset available, copy the saved tokenizer and
# the .dill files into the working directory (notebook shell commands).
if DATA_EXISTS and TRAIN:
    ! cp -r ../input/nbmebinary/my_tokenizer .
    ! cp ../input/nbmebinary/*.dill .

In [None]:
# First training run (no cached dataset): fetch tokenizer and config by model
# name and save both under ./my_tokenizer. Every other case loads them from
# DATA_PATH.
# NOTE(review): with TRAIN=False and DATA_PATH missing, the else-branch reads
# from a directory that does not exist — confirm that combination is never used.
if TRAIN and not DATA_EXISTS:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,do_lower_case=True)
    config = AutoConfig.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained('my_tokenizer')
    config.save_pretrained('my_tokenizer')
else:
    tokenizer = AutoTokenizer.from_pretrained(DATA_PATH+"/my_tokenizer",do_lower_case=True)
    config = AutoConfig.from_pretrained(DATA_PATH+"/my_tokenizer/config.json")
# Convenience lookup of the special token ids, stored as an extra attribute on
# the tokenizer object.
tokenizer.special_tokens = {
        "sep": tokenizer.sep_token_id,
        "cls": tokenizer.cls_token_id,
        "pad": tokenizer.pad_token_id,
}

In [None]:
# Sanity check: encode a toy text pair with the same options `build_data` uses
# (minus padding) and display the result, including the offsets.
tok = tokenizer("jude le s","jude le s",
        return_token_type_ids=True,
        return_offsets_mapping=True,
        return_attention_mask=False,
        add_special_tokens=True,
)
tok

In [None]:
def filter_offset(offsets):
    """Blank out the leading run of non-decreasing offsets.

    Walks ``offsets`` (pairs ``(start, end)``) from the left. While both
    coordinates are >= those of the previously accepted pair, the pair is
    replaced by ``(-1, -1)``. The first pair that moves backwards ends that run
    for good; it and every later pair are returned unchanged (as tuples).

    In this notebook the run corresponds to the first text of an encoded pair,
    so the surviving offsets are those of the second text.
    """
    masked = []
    prev_start, prev_end = 0, 0
    in_leading_run = True
    for start, end in offsets:
        if in_leading_run and prev_start <= start and prev_end <= end:
            prev_start, prev_end = start, end
            masked.append((-1, -1))
            continue
        # Once the offsets have gone backwards, never mask again.
        in_leading_run = False
        masked.append((start, end))
    return masked
# Display-only: show the filtered offsets of the toy encoding above.
filter_offset(tok["offset_mapping"])

# Build data

In [None]:
# Distinct feature numbers present in features.csv.
FEATURES = features.feature_num.unique().tolist()
# Fixed token length every encoded (feature, note) pair is padded to; also the
# input length of the model.
SEQUENCE_LENGTH = 512

In [None]:
def decode_location(locations):
    """Parse a location annotation string into ``(start, end)`` integer spans.

    Brackets and quote characters are discarded, and both ``;`` and ``,`` are
    treated as span separators, so ``"['1 2;5 9', '20 30']"`` and
    ``"[[1 2;5 9], [20 30]]"`` decode the same way. Empty or whitespace-only
    fragments (e.g. from ``"[]"`` or a trailing comma) are skipped.

    Parameters
    ----------
    locations : str
        Annotation text made of ``"start end"`` fragments.

    Returns
    -------
    list of (int, int)
        Spans ordered by start offset; spans sharing a start keep their
        original relative order.

    Raises
    ------
    ValueError
        If a non-empty fragment is not exactly two integers.
    """
    # Brackets and quotes are pure decoration: drop them in a single pass.
    cleaned = locations.translate(str.maketrans('', '', "[]'\""))
    spans = []
    for fragment in cleaned.replace(',', ';').split(';'):
        parts = fragment.split()
        if not parts:
            # Whitespace-only fragment: nothing to parse.
            continue
        start, end = parts
        spans.append((int(start), int(end)))
    return sorted(spans, key=lambda span: span[0])

def process_feature_text(text):
    """Turn a hyphen-joined feature label into plain words.

    Applied in order: the ``I-year`` typo becomes ``1-year``, ``-OR-`` becomes
    `` or ``, and every remaining hyphen becomes a space.
    """
    substitutions = (
        ('I-year', '1-year'),
        ('-OR-', " or "),
        ('-', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_spaces(txt):
    """Replace each newline, tab and carriage return with a single space.

    The replacement is one-for-one, so the string length (and therefore every
    character offset into it) is unchanged.
    """
    for control_char in ('\n', '\t', '\r'):
        txt = re.sub(control_char, ' ', txt)
    return txt

def prepare_df(df):
    """Normalise the text columns of ``df`` in place and return it.

    ``feature_text`` goes through `process_feature_text` and then
    `clean_spaces`; ``pn_history`` only through `clean_spaces`.
    """
    feature_text = df['feature_text'].apply(process_feature_text)
    df['feature_text'] = feature_text.apply(clean_spaces)
    df['pn_history'] = df['pn_history'].apply(clean_spaces)
    return df

In [None]:
# Clean the text columns of both frames (prepare_df mutates and returns them).
train = prepare_df(train)
test = prepare_df(test)

In [None]:
def build_data(df,train=True):
    """Tokenize every (feature, note) row of ``df`` into fixed-length arrays.

    Rows are visited grouped by ``pn_num`` (so the output order follows the
    groupby order, not the row order of ``df``). Each row is encoded as the
    pair ``(feature_text, pn_history)`` padded to ``SEQUENCE_LENGTH``; notes
    that would exceed that length are truncated on the note side so every
    example has the same length.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns ``pn_num``, ``pn_history``, ``feature_text``, ``id``,
        ``feature_num``, ``case_num`` and, when ``train`` is true, ``location``.
    train : bool, default=True
        When true, build the per-token answer mask from ``location`` and
        return the training tuple; otherwise return the inference tuple.

    Returns
    -------
    tuple
        ``(feature_ids, case_ids, input_ids, token_type_ids, answers)`` when
        ``train`` is true, else
        ``(row_ids, offsets, feature_ids, input_ids, token_type_ids)``.
    """
    input_ids_arr, token_type_ids_arr, answers = [], [], []
    row_ids, feature_ids, case_ids, offsets_arr = [], [], [], []

    for _, gdf in tqdm(df.groupby('pn_num')):
        # All rows of a group share the same note text.
        pn_history = gdf.iloc[0].pn_history

        for _, row in gdf.iterrows():
            tokens = tokenizer(
                row.feature_text, pn_history,
                return_token_type_ids=True,
                return_offsets_mapping=True,
                return_attention_mask=False,
                add_special_tokens=True,
                padding='max_length',
                # Without truncation an over-long note yields more than
                # SEQUENCE_LENGTH tokens, which breaks the fixed-size arrays.
                truncation='only_second',
                max_length=SEQUENCE_LENGTH,
            )

            input_ids = np.array(tokens['input_ids'], dtype=np.int32)
            token_type_ids = np.array(tokens['token_type_ids'], dtype=np.uint8)
            # Offsets of the leading (feature text) tokens become (-1, -1).
            offsets = filter_offset(tokens['offset_mapping'])
            answer_mask = np.zeros(SEQUENCE_LENGTH, dtype=np.uint8)

            if train:
                # Parse the annotation once per row instead of once per token.
                spans = decode_location(row.location)
                for i, (w_start, w_end) in enumerate(offsets):
                    # Skip masked feature tokens and zero-width tokens.
                    if w_end == -1 or w_start >= w_end:
                        continue
                    # A token is positive when it lies fully inside a span.
                    if any(start <= w_start and w_end <= end for start, end in spans):
                        answer_mask[i] = 1

            row_ids.append(row.id)
            input_ids_arr.append(input_ids)
            token_type_ids_arr.append(token_type_ids)
            answers.append(answer_mask)
            feature_ids.append(row.feature_num)
            case_ids.append(row.case_num)
            offsets_arr.append(offsets)

    input_ids_arr = np.array(input_ids_arr, dtype=np.int32)
    token_type_ids_arr = np.array(token_type_ids_arr, dtype=np.uint8)
    answers = np.array(answers, dtype=np.uint8)
    feature_ids = np.array(feature_ids, dtype=np.int32)
    case_ids = np.array(case_ids, dtype=np.int32)
    if train:
        return feature_ids, case_ids, input_ids_arr, token_type_ids_arr, answers
    return row_ids, offsets_arr, feature_ids, input_ids_arr, token_type_ids_arr

In [None]:
# Reuse the cached training arrays when the dataset directory is present;
# otherwise build them from `train` and cache them in the working directory.
if DATA_EXISTS:
    # NOTE(review): dill.load can execute arbitrary code from the file; only
    # load archives you produced yourself.
    with open(DATA_PATH+"/data.dill",'rb') as fh:
        data = dill.load(fh)
else:
    data = build_data(train)
    with open('data.dill','wb') as fh:
        dill.dump(data,fh)

In [None]:
# Display-only: largest number of positive tokens in any single example
# (data[-1] is the answer-mask array of the training tuple).
data[-1].sum(axis=-1).max()

# CV split

In [None]:
def to_dataset(data,batch_size=32 if tpu else 4,shuffle=True):
    """Wrap ``(input_ids, token_type_ids, answers)`` arrays in a tf.data pipeline.

    Each element becomes ``((input_ids, token_type_ids), answers)``. The
    dataset repeats indefinitely, is optionally shuffled with a buffer the size
    of the whole dataset, then batched and prefetched.

    Returns ``(dataset, steps)`` where ``steps`` is the number of full batches
    in one pass over the data.
    """
    examples = tf.data.Dataset.from_tensor_slices(data)
    examples = examples.map(lambda a,b,c:((a,b),c))
    n_examples = len(examples)
    stream = examples.repeat()
    if shuffle:
        stream = stream.shuffle(n_examples)
    stream = stream.batch(batch_size)
    stream = stream.prefetch(buffer_size=AUTO)
    return stream, n_examples // batch_size

In [None]:
# Cross-validation setup: unpack the training tuple returned by build_data and
# prepare the list that will hold one (train, validation) dataset pair per fold.
n_splits = 5
cv = StratifiedGroupKFold(n_splits=n_splits,shuffle=True,random_state=seed)
feature_ids,case_ids,input_ids_arr,token_type_ids_arr,answers = data
data_splits = []

In [None]:
# NOTE(review): `groups` is never used below, and its row order is that of
# `train`, not the groupby order used by build_data.
groups = train['pn_num'].values
# Folds are stratified on feature_ids and grouped by case_ids.
# NOTE(review): confirm grouping by case (rather than by note) is intended.
for n, (train_index, val_index) in enumerate(cv.split(feature_ids , feature_ids,case_ids)):
    train_data = input_ids_arr[train_index],token_type_ids_arr[train_index],answers[train_index]
    test_data = input_ids_arr[val_index],token_type_ids_arr[val_index],answers[val_index]
    # Each entry is ((train_ds, train_steps), (val_ds, val_steps)).
    data_splits.append((to_dataset(train_data),to_dataset(test_data,shuffle=False)))
    del train_data,test_data

# Define Model

In [None]:
def build_model():
    """Build the token-level binary classifier on top of the transformer backbone.

    Inputs are two int32 tensors of length SEQUENCE_LENGTH (token ids and token
    type ids). The backbone's first output goes through dropout and a 1-unit
    sigmoid Dense layer, giving one probability per token.

    When TRAIN is false the backbone is created from the saved config only
    (no pretrained weights are loaded here); otherwise the pretrained
    MODEL_NAME weights are loaded.
    """
    
    tokens = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name = 'tokens', dtype=tf.int32)
    token_type_id = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name = 'token_type_id', dtype=tf.int32)
    
    if not TRAIN:
        config = AutoConfig.from_pretrained(DATA_PATH+"/my_tokenizer/config.json")
        backbone = TFAutoModel.from_config(config)
    else:
        print(f"Loading {MODEL_NAME}...")
        config = AutoConfig.from_pretrained(MODEL_NAME)
        backbone = TFAutoModel.from_pretrained(MODEL_NAME,config=config)
    # Freeze some layers
    #backbone.roberta.embeddings.trainable = False
    #for w in backbone.roberta.weights:
    #    for i in range(0,12):
    #        name = f'/layer_._{i}/'
    #        if w.name.find(name) != -1 :
    #            #print(i,w.name)
     #           w._trainable = False
                
    # Attention mask: 1.0 for every token that is not the pad token, else 0.0.
    attention = tf.keras.layers.Lambda(lambda x : tf.cast(x != tokenizer.pad_token_id,tf.float32))(tokens)
    out = backbone(tokens, attention_mask=attention,token_type_ids=token_type_id)[0]
    
    out = tf.keras.layers.Dropout(0.2)(out)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(out)
    
    model = tf.keras.Model([tokens,token_type_id],out)
    
    return model

In [None]:
#build_model().summary()

In [None]:
class MyModel(keras.Model):
    """Wrapper around `build_model()` that pseudo-labels empty examples while training.

    `th` is the probability threshold used to turn the model's own predictions
    into labels for examples whose label vector is entirely zero.
    """
    def __init__(self,th=0.8):
        super(MyModel,self).__init__()
        self.model = build_model()
        self.th = th
        
    def call(self, inputs, training=True):
        # NOTE(review): the default is training=True, so a direct call without
        # an explicit `training` argument may run with dropout enabled.
        return self.model(inputs, training=training)
    
    @tf.function
    def pseudo_label(self,data):
        """Return `data` with all-zero label rows replaced by thresholded predictions."""
        (tokens,token_type_id),y = data
        # True for each example whose labels are all zero.
        mask = y == 0
        mask = tf.reduce_all(mask,axis=-1)
        if tf.reduce_any(mask):
            # Predict in inference mode and binarise at the threshold.
            y_ps = self((tokens,token_type_id),training=False)
            y_ps = tf.reshape(y_ps,tf.shape(y))
            y_ps = tf.cast(y_ps >= self.th,y.dtype)

            # Broadcast the per-example flag over the sequence axis so only the
            # all-zero examples receive pseudo labels.
            mask = tf.repeat(mask[:,None],SEQUENCE_LENGTH,axis=-1)
            mask = tf.cast(mask,y_ps.dtype)
            y = y + (y_ps*mask)
            return (tokens,token_type_id),y
        else:
            return data
    
    def train_step(self, data):
        # Pseudo-label first, then run the standard Keras training step.
        data = self.pseudo_label(data)
        return super().train_step(data)
        
    def get_config(self):
        # No constructor arguments are serialised.
        return {}

# Model training

In [None]:
# Run a garbage-collection pass before the training cells.
gc.collect()

In [None]:
from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    """Batch recall: rounded true positives over rounded actual positives.

    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Batch precision: rounded true positives over rounded predicted positives.

    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1_m(y_true, y_pred):
    """Batch F1 score: harmonic mean of `precision_m` and `recall_m`.

    K.epsilon() in the denominator avoids division by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Maximum number of epochs per fold and the Adam learning rate used at compile.
epochs = 30
init_lr = 1e-7

In [None]:
# Fine-tune one model per CV fold, starting from the fold's previously saved
# weights, and keep each fold's best validation F1.
if TRAIN:
    val_key = "val_f1_m"
    scores = []
    with strategy.scope():
        i = 0
        for (train_ds,steps_per_epoch),(test_ds,steps) in data_splits:
            print(f">>>>SPLIT : {i+1}")
            model = MyModel()#build_model()
            
            # Run a dummy batch so the variables exist before load_weights.
            x = np.zeros((1,SEQUENCE_LENGTH)),np.zeros((1,SEQUENCE_LENGTH))
            model(x);
            model.load_weights(f"../input/nbmebinary/model{i}.h5")
            
            callback = tf.keras.callbacks.EarlyStopping(monitor=val_key,mode='max', patience=10)
            # Keep only the weights of the epoch with the best val_f1_m.
            ckp_callback = tf.keras.callbacks.ModelCheckpoint(
                                                    filepath=f'model{i}.h5',
                                                    save_weights_only=True,
                                                    monitor=val_key,
                                                    mode='max',
                                                    options=tf.train.CheckpointOptions(experimental_io_device='/job:localhost'),
                                                    save_best_only=True)
            # NOTE(review): init_lr (1e-7) is already below min_lr=1e-6, so this
            # callback presumably never lowers the learning rate — confirm.
            reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor=val_key,mode='max',factor=0.2,patience=5, min_lr=1e-6)
            callbacks=[callback,ckp_callback,reduce_lr]
            # Compile the model
            model.compile(optimizer=tf.keras.optimizers.Adam(init_lr),
                          loss=tf.keras.losses.BinaryCrossentropy(),
                          metrics=['acc',f1_m])

            history = model.fit(train_ds,
                                steps_per_epoch=steps_per_epoch,
                                validation_data=test_ds,
                                validation_steps=steps,
                                epochs=epochs,
                                callbacks=callbacks)
            scores.append(max(history.history[val_key]))
            i += 1

In [None]:
# Report the best validation F1 of each fold and their mean.
if TRAIN:
    print(scores)
    print(f"CV Score : {np.mean(scores)}")

# Submit

In [None]:
#test = train

In [None]:
# Encode the test rows; train=False skips the answer mask and returns row ids
# and token offsets. Note this rebinds `feature_ids` from the CV cell.
row_ids,offsets,feature_ids,input_ids,token_type_ids = build_data(test,train=False)

In [None]:
# Display-only: shapes of the encoded test arrays.
input_ids.shape,token_type_ids.shape

In [None]:
# Single model instance reused for all folds; fold weights come from DATA_PATH
# in inference mode, or from the working directory right after training.
model = MyModel()
path =  DATA_PATH if not TRAIN else "."

In [None]:
# Call the model once on two examples so its variables exist before
# load_weights is used in the next cell.
model((input_ids[:2],token_type_ids[:2]));

In [None]:
# Predict with each fold's weights and average the per-token probabilities.
preds = []
for i in range(n_splits):
    print(f"SPLIT {i}")
    model.load_weights(path+f"/model{i}.h5")
    pred = model.predict((input_ids,token_type_ids),batch_size=16)
    preds.append(pred)
preds = np.mean(preds,axis=0)

In [None]:
# Ids of the tokenizer's special tokens, as a set for fast membership tests in
# `translate`.
all_special_ids = set(tokenizer.all_special_ids)

In [None]:
def decode_position(pos):
    """Serialise spans as ``"start end;start end;..."``.

    Each span's values are converted to strings through NumPy and joined with
    a space; spans are joined with ``;``. An empty input gives ``""``.
    """
    rendered = []
    for span in pos:
        fields = np.array(span).astype(str)
        rendered.append(" ".join(fields))
    return ";".join(rendered)

def prediction_pad(preds,spans=5):
    """Merge predicted spans that overlap or lie within ``spans`` characters.

    Parameters
    ----------
    preds : list of (start, end)
        Predicted character spans, in any order.
    spans : int, default=5
        Largest gap (next start minus previous end) that is still bridged.

    Returns
    -------
    list
        With fewer than two spans, ``preds`` itself is returned untouched.
        Otherwise a new list sorted by start, in which neighbouring spans whose
        gap is at most ``spans`` are fused into ``(first start, largest end)``.
    """
    if len(preds) < 2:
        return preds
    ordered = sorted(preds)
    # A merge never changes the start of the merged span, so a single
    # left-to-right pass reaches the same result as repeatedly re-scanning.
    merged = [ordered[0]]
    for candidate in ordered[1:]:
        last = merged[-1]
        if candidate[0] - last[1] <= spans:
            merged[-1] = (last[0], max(last[1], candidate[1]))
        else:
            merged.append(candidate)
    return merged


def translate(preds,row_ids,input_ids,offsets,token_type_ids,feature_ids):
    """Convert per-token probabilities into a submission DataFrame.

    For every example, tokens with probability > 0.5 are grouped into runs of
    consecutive positive, non-special tokens; each run becomes a character span
    taken from the token offsets. Nearby spans are merged by `prediction_pad`
    and serialised by `decode_position`.

    `preds` is indexed as preds[:, :, 0]; `token_type_ids` and `feature_ids`
    are read per example but do not influence the result.

    Returns a DataFrame with columns "id" and "location".
    """
    all_ids = []
    all_pos = []
    # Drop the trailing singleton axis of the model output.
    preds = preds[:,:,0]

    for k in range(len(preds)):
        offset = offsets[k]
        pred = preds[k]
        row_id = row_ids[k]
        input_id = input_ids[k]
        token_type_id = token_type_ids[k]
        feature_id = feature_ids[k]
        prediction = []
        # Binarise at a fixed 0.5 threshold.
        pred = (pred>0.5).astype(np.uint8)
        
        i = 0
        while i<SEQUENCE_LENGTH:
            # Skip special tokens, negatives and masked tokens (offsets set to
            # (-1, -1) by filter_offset).
            if int(input_id[i]) in all_special_ids:
                i += 1
                continue
            if pred[i] == 0:
                i += 1
                continue
            if offset[i][0] == -1:
                i += 1
                continue
            if pred[i] == 1:
                # Start of a positive run: extend `end` token by token until a
                # negative or special token is reached.
                start = min(offset[i])
                end = max(offset[i])
                while i<SEQUENCE_LENGTH:
                    if pred[i] != 1:
                        break
                    elif int(input_id[i]) in all_special_ids:
                        break
                    else:
                        end = max(offset[i])
                    i += 1
                prediction.append((start,end))
                # The token that ended the run cannot start a new one.
                i += 1
            else:
                i+=1
        all_ids.append(row_id)
        all_pos.append(decode_position(prediction_pad(prediction)))
            
    df = pd.DataFrame({
        "id":all_ids,
        "location": all_pos
    })
    return df

In [None]:
# Decode the averaged predictions into spans and write the submission file.
sub = translate(preds,row_ids,input_ids,offsets,token_type_ids,feature_ids)
sub.to_csv('submission.csv',index=False)
sub.head(5)

In [None]:
# Display-only: ground-truth locations of the first training ids, for a visual
# comparison with the submission format.
train.sort_values(by="id").reset_index()[["id","location"]].head(5)