# **Import libraries**

The core BERT implementation comes from the `transformers` library.


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import TFAutoModel, AutoTokenizer
import matplotlib.pyplot as plt
import math, re, os
import string

# Fix the NumPy and TensorFlow random seeds for experiment reproducibility.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# Define paths for the data and for loading the transformers model offline

In [None]:
# Locations of the competition CSV files, relative to the working directory.
train_loc = '../input/nlp-getting-started/train.csv'
test_loc = '../input/nlp-getting-started/test.csv'
# Directory and name of the locally stored pretrained checkpoint.
# NOTE(review): neither name is referenced in the visible code; the cell that
# creates the tokenizer and model spells the path out again. Confirm whether
# these two constants are still needed.
file_path = '/kaggle/input/huggingface-bert/'
MODEL_NAME = "bert-large-uncased"
# Mini-batch size and epoch count used by model.fit and the LR schedule.
batch_size = 16
epochs = 6

# Data Read

In [None]:
# Load both competition splits and report how many rows each one holds.
train_df = pd.read_csv(train_loc)
test_df = pd.read_csv(test_loc)
for caption, frame in (('training items ', train_df), ('test items ', test_df)):
    print(caption + str(frame.shape[0]))

In [None]:
# Preview the first ten training rows (shown as the cell output in a notebook).
train_df.head(10)

In [None]:
# Longest text, measured in characters, for each split.
train_lengths = train_df.text.map(len)
test_lengths = test_df.text.map(len)
maxlenght = train_lengths.max()
maxlenght_test = test_lengths.max()

print(f'maxmum str lenght in training  is : {maxlenght}\n   max str lunght in test is : {maxlenght_test}\n')

# A very simple cleaning function

In [None]:
def clean(title):
    """Normalise a piece of text before tokenization.

    Steps, in order:
      1. ``&`` is spelled out as ``and`` (without adding surrounding spaces).
      2. Every non-word character and every underscore becomes a space.
      3. The text is lower-cased.
      4. Runs of whitespace collapse to a single space.

    The previous version tried to strip ``string.punctuation`` with the
    pattern ``r"f{p}"``, which is a raw string rather than an f-string and
    therefore never matched any punctuation. The only punctuation character
    not already covered by ``\\W`` is the underscore, so it is now handled
    explicitly in step 2.

    Args:
        title: The text to clean.

    Returns:
        The cleaned text. A single leading and/or trailing space may remain
        when the input started or ended with a replaced character.
    """
    title = title.replace("&", "and")
    # ``\W`` covers all punctuation except "_", which counts as a word char.
    title = re.sub(r"[\W_]", " ", title)
    title = title.lower()
    return re.sub(r"\s+", " ", title)

In [None]:
# Run the same cleaning function over the text column of both splits.
for frame in (train_df, test_df):
    frame["text"] = frame["text"].map(clean)


**Function that tokenizes the data and pads or truncates each text to a fixed length**

In [None]:
def tokeniz_dataset(tokenizer, max_len):
    """Tokenize the module-level train and test frames.

    Every text is padded or truncated to ``max_len`` and the tokenizer is
    asked for TensorFlow tensors; the ``.data`` attribute of its output is
    what gets stored.

    Args:
        tokenizer: Callable tokenizer applied to a list of strings.
        max_len: Target sequence length used for padding and truncation.

    Returns:
        A dict with a ``"train"`` entry (``"data"`` and ``"labels"``) and a
        ``"test"`` entry (``"data"`` only).
    """
    def encode(texts):
        # Shared tokenizer settings for both splits.
        batch = tokenizer(
            list(texts),
            padding = "max_length",
            max_length = max_len,
            truncation = True,
            return_tensors = "tf",
        )
        return batch.data

    train_part = {
        "data": encode(train_df["text"].values),
        "labels": train_df["target"].values,
    }
    test_part = {"data": encode(test_df["text"].values)}
    return {"train": train_part, "test": test_part}

# Define the model 

**BERT encoder followed by our own dense layers**

In [None]:
class ClassifModel(tf.keras.Model):
    """Binary classifier: a pretrained transformer encoder plus a dense head.

    The encoder's ``last_hidden_state`` is flattened and passed through
    dropout, an L1/L2-regularised dense layer, batch normalisation and ReLU
    before a single sigmoid output unit.
    """

    def __init__(self, checkpoint):
        """Create the encoder and the layers of the classification head.

        Args:
            checkpoint: Name or path handed to
                ``TFAutoModel.from_pretrained``.
        """
        super().__init__()

        self.base_model = TFAutoModel.from_pretrained(checkpoint)
        self.flatten = layers.Flatten()

        self.dropout1 = layers.Dropout(rate = 0.2)
        self.linear1 = layers.Dense(units = 1024, kernel_regularizer = "l1_l2")
        self.batchNorm1 = layers.BatchNormalization()
        self.activation1 = layers.Activation("relu")

        self.out = layers.Dense(units = 1, activation = "sigmoid")

    def call(self, inputs, training = False):
        """Run the forward pass.

        Args:
            inputs: Tokenizer output accepted by the underlying encoder.
            training: Whether the call happens in training mode. It is
                forwarded explicitly to every layer that behaves differently
                between training and inference.

        Returns:
            The sigmoid output of the final dense layer.
        """
        x = self.base_model(inputs, training = training).last_hidden_state
        x = self.flatten(x)

        # Dropout is an identity op when not training, so no Python-level
        # branch is needed; this also works when `training` is a tensor.
        x = self.dropout1(x, training = training)
        x = self.linear1(x)
        x = self.batchNorm1(x, training = training)
        x = self.activation1(x)

        return self.out(x)

**F1 score metric for evaluation purposes**

In [None]:
class F1_score(tf.keras.metrics.Metric):
    """Streaming F1 score built from Keras ``Precision`` and ``Recall``."""

    def __init__(self, name = "f1_score", **kwargs):
        """Create the metric and its two underlying sub-metrics.

        Args:
            name: Metric name; it determines the key used in training logs.
            **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
        """
        super().__init__(name = name, **kwargs)

        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight = None):
        """Accumulate one batch into both sub-metrics."""
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def reset_state(self):
        """Clear the accumulated precision and recall statistics."""
        for metric in (self.precision, self.recall):
            # Older TensorFlow releases only provide `reset_states`.
            reset = getattr(metric, "reset_state", None) or metric.reset_states
            reset()

    def reset_states(self):
        """Alias of ``reset_state`` kept for callers using the old name."""
        self.reset_state()

    def result(self):
        """Return the harmonic mean of precision and recall.

        The value is 0 when precision and recall are both 0, instead of
        relying on division by zero producing infinity.
        """
        precision = self.precision.result()
        recall = self.recall.result()
        return tf.math.divide_no_nan(2.0 * precision * recall, precision + recall)

# Learning rate scheduler

In [None]:
def lrfn(epoch, bs=batch_size, epochs=epochs):
    """Return the learning rate for a zero-based epoch index.

    The schedule has three phases:
      * a polynomial (power 2.5) warm-up from ``LR_START`` towards ``LR_MAX``
        over the first ``LR_RAMPUP_EPOCHS`` epochs,
      * an optional plateau at ``LR_MAX`` for ``LR_SUSTAIN_EPOCHS`` epochs,
      * a cosine decay from ``LR_MAX`` down to ``LR_FINAL``, which is reached
        at the last epoch and held afterwards.

    Args:
        epoch: Zero-based epoch index.
        bs: Batch size. Unused; kept so existing callers keep working.
        epochs: Total number of training epochs.

    Returns:
        The learning rate as a Python float.
    """
    # Config
    LR_START = 1e-5
    LR_MAX = 2e-3
    LR_FINAL = 1e-5
    LR_RAMPUP_EPOCHS = 4
    LR_SUSTAIN_EPOCHS = 0
    # Keep the decay span at one epoch or more so that short runs
    # (epochs <= LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS + 1) cannot divide by zero.
    DECAY_EPOCHS = max(1, epochs - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS - 1)

    if epoch < LR_RAMPUP_EPOCHS:  # polynomial warm-up
        # The span is LR_MAX - LR_START so the curve heads for LR_MAX exactly.
        lr = LR_START + (LR_MAX - LR_START) * (epoch / LR_RAMPUP_EPOCHS) ** 2.5
    elif epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:  # sustain lr
        lr = LR_MAX
    else:  # cosine decay
        epoch_diff = epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS
        # Clamp so the cosine cannot swing back up past the end of the decay.
        progress = min(1.0, epoch_diff / DECAY_EPOCHS)
        decay_factor = (math.cos(progress * math.pi) + 1) / 2
        lr = LR_FINAL + (LR_MAX - LR_FINAL) * decay_factor

    return lr


In [None]:
def show_lr_schedule(bs=batch_size, epochs=epochs):
    """Print a one-line summary of the LR schedule and plot it per epoch.

    Args:
        bs: Batch size, forwarded to ``lrfn``.
        epochs: Number of epochs to evaluate and plot.
    """
    epoch_ids = list(range(epochs))
    rates = [lrfn(epoch_id, bs=bs, epochs=epochs) for epoch_id in epoch_ids]
    tick_positions = np.arange(epochs)
    tick_labels = [str(number) for number in np.arange(1, epochs+1)]
    print('init lr {:.1e} to {:.1e} final {:.1e}'.format(rates[0], max(rates), rates[-1]))

    plt.figure(figsize=(30, 10))
    # One tick per epoch, with labels counting from 1.
    plt.xticks(tick_positions, tick_labels, fontsize=16)
    plt.yticks(fontsize=16)
    plt.plot(epoch_ids, rates)
    plt.grid()
    plt.show()


show_lr_schedule()

# Initialize the tokenizer and the model

In [None]:

# The tokenizer and the encoder weights are read from the same local directory.
bert_dir = '../input/huggingface-bert/bert-large-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_dir)
model = ClassifModel(bert_dir)
loss = BinaryCrossentropy()
model.compile(
    loss = loss,
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ["accuracy", F1_score()],
)
# Pad or truncate every text to a sequence length of 65.
tokenized_dataset = tokeniz_dataset(tokenizer,65)

# Start training 

In [None]:
# Keep only the weights from the epoch with the best validation F1.
checkpoint_filepath = './modebest-stlr.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_f1_score',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)
# Stop early when validation F1 has stopped improving (patience of 3).
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_f1_score',
    mode='max',
    patience=3,
)
# Take the learning rate for each epoch from lrfn.
lr_callback = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: lrfn(epoch, epochs=epochs),
    verbose=1,
)
train_split = tokenized_dataset["train"]
history = model.fit(
    x = train_split["data"],
    y = train_split["labels"],
    batch_size = batch_size,
    epochs = epochs,
    validation_split = 0.2,
    callbacks = [model_checkpoint_callback, earlystop, lr_callback],
)

In [None]:

plt.figure(figsize = (14, 14))

# Training and validation F1 per epoch; each curve is labelled with its
# key in the training history.
for curve in ("f1_score", "val_f1_score"):
    plt.plot(history.history[curve], label = curve)

plt.title("F1 Score")
plt.xlabel("Epoch")
plt.ylabel("F1 Score")
plt.legend(loc = "best")


# Load the best model and make predictions

In [None]:
# Restore the checkpointed weights before scoring the test split.
model.load_weights(checkpoint_filepath)

test_inputs = tokenized_dataset["test"]["data"]
probabilities = model.predict(test_inputs, verbose = True)
# Threshold the model outputs at 0.5 to obtain hard 0/1 labels.
predictions = (probabilities >= 0.5).astype(int)
predictions

In [None]:
# Drop the feature columns, attach the predicted labels and write the CSV.
submissions = test_df.drop(columns = ["keyword", "location", "text"])
submissions["target"] = predictions
submissions.to_csv("submissions.csv", index = False)