## Importing Libraries

In [None]:
# Notebook shell magic: install pyspellchecker (imported below as `spellchecker`); -q keeps pip quiet.
!pip -q install pyspellchecker

In [None]:
import os
import numpy as np
import pandas as pd
import random
import re
import string
from spellchecker import SpellChecker
from tqdm.notebook import tqdm
import warnings

from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.models import Model

from tokenizers import BertWordPieceTokenizer
from transformers import TFBertModel, AdamWeightDecay, BertTokenizerFast

In [None]:
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

## Constants

In [None]:
SEED = 42  # seed for the Python/NumPy/TensorFlow RNGs and for the K-fold shuffle
MAX_LEN = 192  # token length passed to fast_encode (truncate/pad) and to build_model (input width)
EPOCHS = 6  # maximum number of epochs passed to model.fit for each fold
NUM_SPLITS = 5  # number of stratified folds; test predictions are divided by this to average them
# Learning rate handed to the optimizer in build_model.
# NOTE(review): a LearningRateScheduler callback is also passed to fit — confirm which rate is meant to apply.
LR = 3e-5

In [None]:
# Reproducibility: pin the hash seed, then seed each RNG the notebook relies on.
os.environ['PYTHONHASHSEED'] = str(SEED)
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(SEED)

In [None]:
# Load the competition's train and test CSVs from the Kaggle input directory.
train = pd.read_csv("../input/nlp-getting-started/train.csv")
test = pd.read_csv("../input/nlp-getting-started/test.csv")

In [None]:
# Placeholder label: lets test rows be told apart from train rows after the two frames are concatenated.
test["target"] = -1

## Text Preprocessing

In [None]:
# Stack train and test so each cleaning step below runs once over all text.
# The original row indices are kept (no ignore_index), so index labels repeat across the two parts.
df = pd.concat([train, test])

In [None]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["text"] = df["text"].apply(lambda text: remove_punctuation(text))

In [None]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df["text"] = df["text"].apply(lambda text: remove_emoji(text))

In [None]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    try:
        return url_pattern.sub(r'', text)
    except:
        print(text)
    
df["text"] = df["text"].apply(lambda text: remove_urls(text))

In [None]:
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

df["text"] = df["text"].apply(lambda text: remove_html(text))

In [None]:
# Read the slang dictionary: one "ABBREVIATION=expansion" entry per line.
with open("../input/slangtext/slang.txt", "r") as slang_file:
    chat_words_str = slang_file.read()

# abbreviation -> expansion; lines without "=" are skipped.
chat_words_map_dict = {}
for entry in chat_words_str.split("\n"):
    if "=" not in entry:
        continue
    fields = entry.split("=")
    chat_words_map_dict[fields[0]] = fields[1]

# Set of known abbreviations for fast membership tests.
chat_words_list = set(chat_words_map_dict)

In [None]:
def chat_words_conversion(text):
    """Expand chat abbreviations in *text*.

    The text is split on whitespace; any token whose upper-cased form is in
    ``chat_words_list`` is replaced by its entry in ``chat_words_map_dict``.
    Tokens are re-joined with single spaces.
    """
    return " ".join(
        chat_words_map_dict[token.upper()] if token.upper() in chat_words_list else token
        for token in text.split()
    )

df["text"] = df["text"].apply(chat_words_conversion)

In [None]:
spell = SpellChecker()
# word -> chosen correction, so a repeated misspelling is only resolved once.
_spell_cache = {}


def correct_spellings(text):
    """Return *text* with words the spell checker flags as unknown replaced.

    The text is split on whitespace. Each token found in ``spell.unknown``
    is replaced by ``spell.correction(token)``; all other tokens are kept.
    If the checker has no suggestion (``correction`` returns None), the
    original token is kept so the final join cannot fail.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word not in misspelled_words:
            corrected_text.append(word)
            continue
        if word not in _spell_cache:
            _spell_cache[word] = spell.correction(word) or word
        corrected_text.append(_spell_cache[word])
    return " ".join(corrected_text)


# This cell previously re-applied chat_words_conversion (copy-paste slip), so
# the spell checker defined above never ran.
# NOTE(review): spell correction over the whole corpus is slow; the per-word
# cache above limits repeated lookups.
df["text"] = df["text"].apply(correct_spellings)

In [None]:
# Undo the concat: rows carrying the -1 placeholder target are test, all others are train.
is_test_row = df["target"] == -1
train = df[~is_test_row]
test = df[is_test_row]

In [None]:
# Notebook display: shapes of the split frames next to the combined frame, as a quick sanity check.
train.shape, test.shape, df.shape

## Tokenizing Data

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode *texts* into a NumPy array of token ids, in chunks.

    Args:
        texts: sliceable sequence whose slices expose ``.tolist()``
            (the notebook passes a pandas Series of strings).
        tokenizer: object providing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``; truncation and padding are both set to
            *maxlen* on it (this mutates the tokenizer).
        chunk_size: number of texts sent to ``encode_batch`` per call.
        maxlen: length passed to the truncation and padding settings.

    Returns:
        ``np.array`` built from the ``ids`` of every encoding, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    # NOTE(review): newer `tokenizers` releases may name this keyword `length` — confirm against the installed version.
    tokenizer.enable_padding(max_length=maxlen)

    encoded_rows = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start:start + chunk_size].tolist()
        encoded_rows += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

    return np.array(encoded_rows)

In [None]:
# First load the real tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Save the loaded tokenizer locally.
# NOTE: the directory name says "distilbert" but it holds the bert-base-uncased
# files loaded above; the name is kept so existing paths stay valid.
save_path = 'distilbert_base_uncased/'
os.makedirs(save_path, exist_ok=True)  # no separate exists() check needed
tokenizer.save_pretrained(save_path)

# Reload it with the huggingface tokenizers library, reading the vocab file
# from the directory it was just saved to.
fast_tokenizer = BertWordPieceTokenizer(os.path.join(save_path, 'vocab.txt'), lowercase=True)

In [None]:
# Encode the test texts once up front (truncated/padded to MAX_LEN); the same array is reused for every fold's predictions.
X_test_tokenized = fast_encode(test["text"].astype(str), fast_tokenizer, maxlen=MAX_LEN)

## Loss & Metric

In [None]:
def focal_loss(gamma=2., alpha=.25):
    """Build a focal-loss function for probability outputs.

    Args:
        gamma: exponent of the modulating factor ``(1 - p)`` / ``p``.
        alpha: weight on the positive (``y_true == 1``) term; the negative
            (``y_true == 0``) term is weighted by ``1 - alpha``.

    Returns:
        A ``loss(y_true, y_pred)`` callable returning a scalar tensor.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep probabilities strictly inside (0, 1): a saturated output of
        # exactly 0 or 1 would otherwise hit log(0) and produce inf/NaN.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)
        # Where the label is not 1, substitute 1 so the positive term is zero.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # Where the label is not 0, substitute 0 so the negative term is zero.
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed

In [None]:
def auc(y_true, y_pred):
    """ROC AUC via scikit-learn, wrapped in ``tf.py_function``.

    Returns a ``tf.double`` tensor. If ``roc_auc_score`` rejects the inputs
    with a ValueError, 0.5 is returned instead.
    """
    def fallback_auc(y_true, y_pred):
        try:
            return roc_auc_score(y_true, y_pred)
        except ValueError:
            # Only the scorer's own input rejection is treated as "no
            # information"; any other exception is a real bug and propagates.
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

## Model

In [None]:
# Resolve the TPU cluster, connect to it and initialise the TPU system before creating the strategy.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# NOTE(review): this is the experimental alias; newer TensorFlow releases expose tf.distribute.TPUStrategy — confirm for the installed version.
strategy = tf.distribute.experimental.TPUStrategy(tpu)
# Global batch size: 16 examples per replica.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

In [None]:
def build_model(transformer, loss='binary_crossentropy', max_len=512):
    """Build and compile a 2-class classifier on top of *transformer*.

    Args:
        transformer: callable Keras model/layer whose first output is the
            last hidden state; it is called on the token-id input only.
        loss: loss passed to ``model.compile``.
        max_len: length of the integer token-id input.

    Returns:
        A compiled Keras ``Model`` mapping ``(batch, max_len)`` token ids to
        a 2-way softmax.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    # last hidden state : (batch_size, sequence_length, hidden_size)
    sequence_output = transformer(input_word_ids)[0]
    # Representation at position 0 (the first token) is used as the sentence vector.
    cls_token = sequence_output[:, 0, :]
    x = Dropout(0.35)(cls_token)
    out = Dense(2, activation='softmax')(x)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the documented keyword; `lr` is only a legacy alias.
    optimizer = tfa.optimizers.RectifiedAdam(learning_rate=LR)
    # Name the metric explicitly so its validation key is always "val_auc",
    # which is what the EarlyStopping callback monitors.
    model.compile(optimizer=optimizer, loss=loss, metrics=[tf.keras.metrics.AUC(name='auc')])

    return model

## Callbacks

In [None]:
def build_lrfn(lr_start=0.000001, lr_max=0.000004,
               lr_min=0.0000001, lr_rampup_epochs=7,
               lr_sustain_epochs=0, lr_exp_decay=.87):
    """Return an ``epoch -> learning rate`` schedule function.

    The schedule has three phases: a linear ramp from *lr_start* to the peak
    over *lr_rampup_epochs*, a plateau at the peak for *lr_sustain_epochs*,
    then exponential decay towards *lr_min* with factor *lr_exp_decay*.
    The peak is *lr_max* multiplied by ``strategy.num_replicas_in_sync``.
    """
    peak_lr = lr_max * strategy.num_replicas_in_sync
    sustain_end = lr_rampup_epochs + lr_sustain_epochs

    def lrfn(epoch):
        # Phase 1: linear warm-up.
        if epoch < lr_rampup_epochs:
            return (peak_lr - lr_start) / lr_rampup_epochs * epoch + lr_start
        # Phase 2: hold at the peak.
        if epoch < sustain_end:
            return peak_lr
        # Phase 3: exponential decay towards lr_min.
        return (peak_lr - lr_min) * lr_exp_decay**(epoch - lr_rampup_epochs - lr_sustain_epochs) + lr_min

    return lrfn

In [None]:
# Early stopping: end a fold once val_auc has gone 3 epochs without improving
# by at least 0.001, and roll back to the best weights seen.
eas = EarlyStopping(
    monitor='val_auc',
    mode='max',
    min_delta=0.001,
    patience=3,
    baseline=None,
    restore_best_weights=True,
    verbose=1,
)

# Per-epoch learning-rate schedule built with the default ramp/decay settings.
lrfn = build_lrfn()
lrs = LearningRateScheduler(lrfn, verbose=1)

## Training

In [None]:
# Test inputs only (no labels), batched without shuffling so predictions stay in the row order of X_test_tokenized.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(X_test_tokenized)
    .batch(BATCH_SIZE)
)

In [None]:
num_steps = len(train) // BATCH_SIZE  # number of full batches in the whole training set
final_preds = np.zeros((len(test)))  # running sum of each fold's test probabilities
total_preds = np.zeros((len(train)))  # out-of-fold probabilities, written at each fold's validation positions

In [None]:
# Shuffled, stratified K-fold splitter, seeded so the folds are the same on every run.
kfold = StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True, random_state=SEED)

In [None]:
# Cross-validated training: for each fold, train a fresh model, store its
# out-of-fold probabilities and add its test probabilities to the running sum.
for fold, (train_idx, valid_idx) in enumerate(kfold.split(X=train['text'], y=train['target'])):
    print("*"*60)
    print("*"+" "*26+f"FOLD {fold+1}"+" "*26+"*")
    print("*"*60, end="\n\n")

    # kfold yields positional indices, hence iloc.
    X_train = train.iloc[train_idx].reset_index(drop=True)
    X_valid = train.iloc[valid_idx].reset_index(drop=True)

    y_train = X_train["target"]
    y_valid = X_valid["target"]

    X_train_tokenized = fast_encode(X_train.text.astype(str), fast_tokenizer, maxlen=MAX_LEN)
    X_valid_tokenized = fast_encode(X_valid.text.astype(str), fast_tokenizer, maxlen=MAX_LEN)

    # Labels are one-hot encoded to match the model's 2-way softmax;
    # num_classes is pinned so the width never depends on a fold's contents.
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((X_train_tokenized, to_categorical(y_train, num_classes=2)))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    # Validation data is neither shuffled nor repeated, so predictions come
    # back in the same order as valid_idx.
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((X_valid_tokenized, to_categorical(y_valid, num_classes=2)))
        .batch(BATCH_SIZE)
    )

    # The training dataset repeats forever, so the epoch length is set here.
    # Size it from this fold's training rows (not the whole training set) so
    # one epoch is one pass over the data actually being trained on.
    fold_steps = max(1, len(X_train) // BATCH_SIZE)

    # release memory when building models in a loop
    K.clear_session()
    with strategy.scope():
        transformer_layer = TFBertModel.from_pretrained('bert-base-uncased')
        model = build_model(transformer_layer, loss=focal_loss(gamma=1.5), max_len=MAX_LEN)

    history = model.fit(train_dataset,
                        steps_per_epoch=fold_steps,
                        validation_data=valid_dataset,
                        callbacks=[eas, lrs],
                        epochs=EPOCHS)

    # Column 1 of the softmax output is the probability of class 1.
    valid_preds = model.predict(valid_dataset)[:, 1]
    total_preds[valid_idx] = valid_preds

    test_preds = model.predict(test_dataset)[:, 1]
    final_preds += test_preds

In [None]:
actuals = train["target"].values
total_preds[total_preds >= 0.5] = 1
total_preds[total_preds < 0.5] = 0

In [None]:
print(f"AUC: {auc(actuals, total_preds)}")
print(f"F1 Score: {f1_score(actuals, total_preds)}")
print(f"MCC: {matthews_corrcoef(actuals, total_preds)}")

## Submission

In [None]:
# Start from the competition's sample submission file; its "target" column is overwritten below.
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")

In [None]:
# Average the summed per-fold test probabilities, then cut at 0.5 to get hard 0/1 labels.
mean_test_prob = final_preds / NUM_SPLITS
submission["target"] = np.where(mean_test_prob >= 0.5, 1, 0)

In [None]:
# Write the predictions to submission.csv in the working directory, without the DataFrame index column.
submission.to_csv("submission.csv", index=False)