In [1]:
""" 
    A simple stratified 5 fold implementation of Distill bert 
    with augment data
    
"""
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

In [2]:
transformers.__version__

'2.7.0'

## Helper Functions

In [3]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=128):
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [4]:
def build_model(transformer, max_len=512):
    
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(144, activation='softmax')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

## TPU Configs

In [5]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [6]:
AUTO = tf.data.experimental.AUTOTUNE

# Data access
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
EPOCHS = 20
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 64

## Create fast tokenizer

In [7]:
# First load the real tokenizer
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




Tokenizer(vocabulary_size=30522, model=BertWordPiece, add_special_tokens=True, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

## Load text data into memory

In [8]:
train = pd.read_csv('../input/data-set-augment-intent/data_file.csv')
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
labels = le.fit_transform(train['Intent'].values)

In [9]:
len(set(labels))

144

In [10]:
from sklearn.utils import shuffle
x_train = fast_encode(train.Utterance.astype(str), fast_tokenizer, maxlen=MAX_LEN)
y_train = tf.keras.utils.to_categorical(labels, num_classes=len(set(labels)), dtype='float32')
x_train,y_train = shuffle(x_train,y_train, random_state=42)

HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))




## Build datasets objects

In [11]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

## Load model into the TPU

In [12]:
%%time
transformer_layer = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 64)]              0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 64, 768),)        66362880  
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 144)               110736    
Total params: 66,473,616
Trainable params: 66,473,616
Non-trainable params: 0
_________________________________________________________________
CPU times: user 14.3 s, sys: 2.51 s, total: 16.8 s
Wall time: 17.9 s


## Train Model

First, we train on the subset of the training set, which is completely in English.

In [13]:
from sklearn.model_selection import StratifiedKFold

splits = list(StratifiedKFold(n_splits=5).split(x_train,y_train.argmax(1)))

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K
import numpy as np
BATCH_SIZE = 16
NUM_EPOCHS = 15

oof_preds = np.zeros((x_train.shape[0],len(set(labels))))
for fold in [0,1,2,3,4]:
    K.clear_session()
    tr_ind, val_ind = splits[fold]
    ckpt = ModelCheckpoint(f'distill_bert_{fold}.hdf5',save_weights_only=True, verbose=1)
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3,min_delta=0.008)
    model = build_model(transformer_layer, max_len=MAX_LEN)
    model.fit(x_train[tr_ind],
        y_train[tr_ind],
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHS,
        validation_data=(x_train[val_ind], y_train[val_ind]),
        callbacks = [es,ckpt])

    oof_preds[val_ind] += model.predict(x_train[val_ind])
    #test_preds += model.predict(X_test)[:,0]

Using TensorFlow backend.


Train on 7696 samples, validate on 1925 samples
Epoch 1/15
Epoch 00001: saving model to distill_bert_0.hdf5
Epoch 2/15
Epoch 00002: saving model to distill_bert_0.hdf5
Epoch 3/15
Epoch 00003: saving model to distill_bert_0.hdf5
Epoch 4/15
Epoch 00004: saving model to distill_bert_0.hdf5
Epoch 5/15
Epoch 00005: saving model to distill_bert_0.hdf5
Epoch 6/15
Epoch 00006: saving model to distill_bert_0.hdf5
Epoch 7/15
Epoch 00007: saving model to distill_bert_0.hdf5
Epoch 8/15
Epoch 00008: saving model to distill_bert_0.hdf5
Epoch 9/15
Epoch 00009: saving model to distill_bert_0.hdf5
Epoch 10/15
Epoch 00010: saving model to distill_bert_0.hdf5
Epoch 00010: early stopping
Train on 7697 samples, validate on 1924 samples
Epoch 1/15
Epoch 00003: saving model to distill_bert_1.hdf5
Epoch 4/15
Epoch 00004: saving model to distill_bert_1.hdf5
Epoch 5/15
Epoch 00005: saving model to distill_bert_1.hdf5
Epoch 6/15
Epoch 00006: saving model to distill_bert_1.hdf5
Epoch 7/15
Epoch 00007: saving mode

## Submission