# Introduction
**This notebook introduces how to solve an imbalanced text classification problem with LSTM networks and word embeddings.**

Import some required libraries.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

import gc
import sys

from tqdm.notebook import tqdm
# Register the `progress_*` helpers (e.g. `Series.progress_map`) on pandas objects.
# `pandas` is called on the tqdm class itself so that no throw-away progress-bar
# instance is created just to reach the registration hook.
tqdm.pandas()
pd.set_option('display.max_colwidth', None)

# Set seed for experiment reproducibility
seed = 1024
tf.random.set_seed(seed)
np.random.seed(seed)

def print_size(var):
    """Print the size of ``var`` in MB with two decimals.

    The size is whatever ``sys.getsizeof`` reports for the object itself.
    """
    size_in_mb = sys.getsizeof(var) / 1024 / 1024
    print('%.2fMB' % size_in_mb)

Unzip the pre-trained GloVe word vectors. It may take several minutes.

In [None]:
# Relative path of the GloVe vector file: used as the archive member to extract
# from embeddings.zip and as the file later read by load_vec().
GLOVE_FILE = 'glove.840B.300d/glove.840B.300d.txt'

In [None]:
# Extract only GLOVE_FILE from the competition's embeddings archive into the
# current directory; `-n` skips files that already exist, so re-running is cheap.
!unzip -n /kaggle/input/quora-insincere-questions-classification/embeddings.zip {GLOVE_FILE} -d .

In [None]:
def get_coefs(word, *arr):
    """Pair ``word`` with its coefficients converted to a float32 NumPy array.

    Args:
        word: The token (first field of an embedding line).
        *arr: The remaining fields, one per vector component.

    Returns:
        A ``(word, vector)`` tuple.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
def get_lines_count(file_name):
    """Return the number of lines in the text file ``file_name``.

    The file is read as UTF-8 with undecodable bytes ignored, and is closed
    deterministically once counting finishes.
    """
    with open(file_name, encoding="utf8", errors='ignore') as handle:
        return sum(1 for _ in handle)
def load_vec(file_name):
    """Load an embedding text file into a ``{word: float32 vector}`` dict.

    Each kept line is split on single spaces; the first field is the word and
    the remaining fields are the vector components. Lines of 100 characters or
    fewer are skipped. The file is opened inside a ``with`` block so the handle
    is released even if parsing fails part-way through.
    """
    # Count lines up front so the progress bar can show a total.
    line_total = get_lines_count(file_name)
    with open(file_name, encoding="utf8", errors='ignore') as handle:
        return dict(
            get_coefs(*line.split(" "))
            for line in tqdm(handle, total=line_total)
            if len(line) > 100
        )

Load the train and test dataset.

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')
test_data = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')

It's not necessary to use the entire dataset for training if you just want a quick run.

In [None]:
# train_data = train_data[0:100000]
# test_data = test_data[0:10000]

Let's see what's in the dataset and print the first 5 rows in train data.

In [None]:
# Show the first rows of the training frame as the cell output.
train_data.head()

Let's see how imbalanced the dataset is.

In [None]:
# Count the rows of each target value; the unpacking expects exactly two
# classes (0 = negative, 1 = positive).
class_counts = np.bincount(train_data['target'])
negative, positive = class_counts
total = negative + positive
positive_pct = 100 * positive / total
print('total: {}    positive: {} ({:.2f}% of total)'.format(total, positive, positive_pct))

Word vectorizing: converting words into numbers so that they can be fed into a neural network.

In [None]:
import re

# Patterns are raw strings so that regex escapes such as `\[` and `\w` are not
# treated as (invalid) Python string escapes, and are compiled once rather
# than on every call.
_MATH_TAG_RE = re.compile(r'\[math\].*?math\]')
_URL_RE = re.compile(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+')


def clean_tag(text):
    """Replace inline math markup and URL-like tokens with placeholders.

    Args:
        text: The raw question text.

    Returns:
        ``text`` with every ``[math]...math]`` span replaced by ``[formula]``
        and, when the text mentions ``http`` or ``www``, every URL-like token
        replaced by ``[url]``.
    """
    # Cheap substring checks avoid running the regexes on most inputs.
    if '[math]' in text:
        text = _MATH_TAG_RE.sub('[formula]', text)  # replacing with [formula]

    if 'http' in text or 'www' in text:
        text = _URL_RE.sub('[url]', text)  # replacing with [url]
    return text

contraction_mapping = {"We'd": "We had", "That'd": "That had", "AREN'T": "Are not", "HADN'T": "Had not", "Could've": "Could have", "LeT's": "Let us", "How'll": "How will", "They'll": "They will", "DOESN'T": "Does not", "HE'S": "He has", "O'Clock": "Of the clock", "Who'll": "Who will", "What'S": "What is", "Ain't": "Am not", "WEREN'T": "Were not", "Y'all": "You all", "Y'ALL": "You all", "Here's": "Here is", "It'd": "It had", "Should've": "Should have", "I'M": "I am", "ISN'T": "Is not", "Would've": "Would have", "He'll": "He will", "DON'T": "Do not", "She'd": "She had", "WOULDN'T": "Would not", "She'll": "She will", "IT's": "It is", "There'd": "There had", "It'll": "It will", "You'll": "You will", "He'd": "He had", "What'll": "What will", "Ma'am": "Madam", "CAN'T": "Can not", "THAT'S": "That is", "You've": "You have", "She's": "She is", "Weren't": "Were not", "They've": "They have", "Couldn't": "Could not", "When's": "When is", "Haven't": "Have not", "We'll": "We will", "That's": "That is", "We're": "We are", "They're": "They' are", "You'd": "You would", "How'd": "How did", "What're": "What are", "Hasn't": "Has not", "Wasn't": "Was not", "Won't": "Will not", "There's": "There is", "Didn't": "Did not", "Doesn't": "Does not", "You're": "You are", "He's": "He is", "SO's": "So is", "We've": "We have", "Who's": "Who is", "Wouldn't": "Would not", "Why's": "Why is", "WHO's": "Who is", "Let's": "Let us", "How's": "How is", "Can't": "Can not", "Where's": "Where is", "They'd": "They had", "Don't": "Do not", "Shouldn't":"Should not", "Aren't":"Are not", "ain't": "is not", "What's": "What is", "It's": "It is", "Isn't":"Is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how 
will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", 
"why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

def clean_contractions(text):
    """Normalise apostrophe variants and expand known contractions.

    Apostrophe look-alikes are first mapped to a plain ``'``. The text is then
    split on single spaces and every token that is a key of
    ``contraction_mapping`` is swapped for its expansion; other tokens are
    kept unchanged.
    """
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")

    expanded_tokens = [contraction_mapping.get(token, token) for token in text.split(" ")]
    return ' '.join(expanded_tokens)

puncts = [",",".",'"',":",")","(","-","!","?","|",";","'","$","&","/","[","]",">","%","=","#","*","+","\\","•","~","@","£","·","_","{","}","©","^","®","`","<","→","°","€","™","›","♥","←","×","§","″","′","█","…","“","★","”","–","●","►","−","¢","¬","░","¡","¶","↑","±","¿","▾","═","¦","║","―","¥","▓","—","‹","─","▒","：","⊕","▼","▪","†","■","’","▀","¨","▄","♫","☆","¯","♦","¤","▲","¸","⋅","‘","∞","∙","）","↓","、","│","（","»","，","♪","╩","╚","・","╦","╣","╔","╗","▬","❤","≤","‡","√","◄","━","⇒","▶","≥","╝","♡","◊","。","✈","≡","☺","✔","↵","≈","✓","♣","☎","℃","◦","└","‟","～","！","○","◆","№","♠","▌","✿","▸","⁄","□","❖","✦","．","÷","｜","┃","／","￥","╠","↩","✭","▐","☼","☻","┐","├","«","∼","┌","℉","☮","฿","≦","♬","✧","〉","－","⌂","✖","･","◕","※","‖","◀","‰","\x97","↺","∆","┘","┬","╬","،","⌘","⊂","＞","〈","⎙","？","☠","⇐","▫","∗","∈","≠","♀","♔","˚","℗","┗","＊","┼","❀","＆","∩","♂","‿","∑","‣","➜","┛","⇓","☯","⊖","☀","┳","；","∇","⇑","✰","◇","♯","☞","´","↔","┏","｡","◘","∂","✌","♭","┣","┴","┓","✨","\xa0","˜","❥","┫","℠","✒","［","∫","\x93","≧","］","\x94","∀","♛","\x96","∨","◎","↻","⇩","＜","≫","✩","✪","♕","؟","₤","☛","╮","␊","＋","┈","％","╋","▽","⇨","┻","⊗","￡","।","▂","✯","▇","＿","➤","✞","＝","▷","△","◙","▅","✝","∧","␉","☭","┊","╯","☾","➔","∴","\x92","▃","↳","＾","׳","➢","╭","➡","＠","⊙","☢","˝","∏","„","∥","❝","☐","▆","╱","⋙","๏","☁","⇔","▔","\x91","➚","◡","╰","\x85","♢","˙","۞","✘","✮","☑","⋆","ⓘ","❒","☣","✉","⌊","➠","∣","❑","◢","ⓒ","\x80","〒","∕","▮","⦿","✫","✚","⋯","♩","☂","❞","‗","܂","☜","‾","✜","╲","∘","⟩","＼","⟨","·","✗","♚","∅","ⓔ","◣","͡","‛","❦","◠","✄","❄","∃","␣","≪","｢","≅","◯","☽","∎","｣","❧","̅","ⓐ","↘","⚓","▣","˘","∪","⇢","✍","⊥","＃","⎯","↠","۩","☰","◥","⊆","✽","⚡","↪","❁","☹","◼","☃","◤","❏","ⓢ","⊱","➝","̣","✡","∠","｀","▴","┤","∝","♏","ⓐ","✎",";","␤","＇","❣","✂","✤","ⓞ","☪","✴","⌒","˛","♒","＄","✶","▻","ⓔ","◌","◈","❚","❂","￦","◉","╜","̃","✱","╖","❉","ⓡ","↗","ⓣ","♻","➽","׀","✲","✬","☉","▉","≒","☥","⌐","♨","✕","ⓝ","⊰","❘","＂","⇧","̵","➪","▁","▏","⊃","ⓛ","‚","♰","́","✏","⏑","̶","ⓢ
","⩾","￠","❍","≃","⋰","♋","､","̂","❋","✳","ⓤ","╤","▕","⌣","✸","℮","⁺","▨","╨","ⓥ","♈","❃","☝","✻","⊇","≻","♘","♞","◂","✟","⌠","✠","☚","✥","❊","ⓒ","⌈","❅","ⓡ","♧","ⓞ","▭","❱","ⓣ","∟","☕","♺","∵","⍝","ⓑ","✵","✣","٭","♆","ⓘ","∶","⚜","◞","்","✹","➥","↕","̳","∷","✋","➧","∋","̿","ͧ","┅","⥤","⬆","⋱","☄","↖","⋮","۔","♌","ⓛ","╕","♓","❯","♍","▋","✺","⭐","✾","♊","➣","▿","ⓑ","♉","⏠","◾","▹","⩽","↦","╥","⍵","⌋","։","➨","∮","⇥","ⓗ","ⓓ","⁻","⎝","⌥","⌉","◔","◑","✼","♎","♐","╪","⊚","☒","⇤","ⓜ","⎠","◐","⚠","╞","◗","⎕","ⓨ","☟","ⓟ","♟","❈","↬","ⓓ","◻","♮","❙","♤","∉","؛","⁂","ⓝ","־","♑","╫","╓","╳","⬅","☔","☸","┄","╧","׃","⎢","❆","⋄","⚫","̏","☏","➞","͂","␙","ⓤ","◟","̊","⚐","✙","↙","̾","℘","✷","⍺","❌","⊢","▵","✅","ⓖ","☨","▰","╡","ⓜ","☤","∽","╘","˹","↨","♙","⬇","♱","⌡","⠀","╛","❕","┉","ⓟ","̀","♖","ⓚ","┆","⎜","◜","⚾","⤴","✇","╟","⎛","☩","➲","➟","ⓥ","ⓗ","⏝","◃","╢","↯","✆","˃","⍴","❇","⚽","╒","̸","♜","☓","➳","⇄","☬","⚑","✐","⌃","◅","▢","❐","∊","☈","॥","⎮","▩","ு","⊹","‵","␔","☊","➸","̌","☿","⇉","⊳","╙","ⓦ","⇣","｛","̄","↝","⎟","▍","❗","״","΄","▞","◁","⛄","⇝","⎪","♁","⇠","☇","✊","ி","｝","⭕","➘","⁀","☙","❛","❓","⟲","⇀","≲","ⓕ","⎥","\u06dd","ͤ","₋","̱","̎","♝","≳","▙","➭","܀","ⓖ","⇛","▊","⇗","̷","⇱","℅","ⓧ","⚛","̐","̕","⇌","␀","≌","ⓦ","⊤","̓","☦","ⓕ","▜","➙","ⓨ","⌨","◮","☷","◍","ⓚ","≔","⏩","⍳","℞","┋","˻","▚","≺","ْ","▟","➻","̪","⏪","̉","⎞","┇","⍟","⇪","▎","⇦","␝","⤷","≖","⟶","♗","̴","♄","ͨ","̈","❜","̡","▛","✁","➩","ா","˂","↥","⏎","⎷","̲","➖","↲","⩵","̗","❢","≎","⚔","⇇","̑","⊿","̖","☍","➹","⥊","⁁","✢"];

def clean_punct(x):
    """Surround every occurrence of each entry of ``puncts`` with spaces.

    Entries are processed in list order; a symbol that does not occur in the
    text leaves it unchanged.
    """
    for symbol in puncts:
        # str.replace is a no-op when the symbol is absent.
        x = x.replace(symbol, f' {symbol} ')
    return x

def data_cleaning(x):
    """Run the full text-cleaning pipeline on one question string.

    Order matters: tags/URLs are replaced first, then contractions are
    expanded, and finally punctuation is space-separated.
    """
    return clean_punct(clean_contractions(clean_tag(x)))

# Add the cleaned text as a new column on both frames (train first, then test),
# showing a progress bar for each pass.
for frame in (train_data, test_data):
    frame['preprocessed_question_text'] = frame['question_text'].progress_map(data_cleaning)

In [None]:
from tensorflow import keras
import spacy

# Load the spaCy pipeline with the parser, NER and tagger components disabled.
nlp = spacy.load('en_core_web_lg', disable=['parser','ner','tagger'])

vocab_freq = {}   # token text -> number of occurrences over train + test
word2index = {}   # token text -> integer id, starting at 1 (0 is left for padding)
lemma_dict = {}   # token text -> lemma reported by spaCy at first occurrence

# Train rows come first, so the first len(train_data) sequences belong to train.
sentences = pd.concat([train_data["preprocessed_question_text"], test_data["preprocessed_question_text"]])
# `n_threads` is not passed: it is a deprecated no-op in spaCy 2.1+ and is not
# accepted by spaCy 3's `pipe`.
docs = nlp.pipe(sentences)
word_sequences = []

for doc in tqdm(docs, total=len(sentences)):
    word_seq = []
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        text = token.text
        vocab_freq[text] = vocab_freq.get(text, 0) + 1
        if text not in word2index:
            # vocab_freq already contains `text`, so its size is the next free 1-based id.
            word2index[text] = len(vocab_freq)
            lemma_dict[text] = token.lemma_
        word_seq.append(word2index[text])
    word_sequences.append(word_seq)

# +1 accounts for index 0, which no token uses.
vocab_size = len(word2index) + 1

print('Found %s unique tokens.' % vocab_size)

Define the max sentence length. The length should be longer than most sentences in the dataset; otherwise, a lot of useful features will be lost.

In [None]:
# Every sequence is padded or truncated to this many token ids.
MAX_SENTENCE_LENGTH = 100

max_text_len = max(map(len, word_sequences))
print("max text length in data: ", max_text_len)

# word_sequences covers train + test, so the share of over-long sequences is
# taken over its own length rather than the train-only `total`.
too_long_count = sum(1 for seq in word_sequences if len(seq) > MAX_SENTENCE_LENGTH)
percentage = 100 * too_long_count / len(word_sequences)
print("percentage of sentences that's length longer than max length > %d in data: %.4f%%" % (MAX_SENTENCE_LENGTH, percentage))

# The first len(train_data) sequences are the training rows, the rest are test rows.
X_data = word_sequences[:len(train_data)]
X_data = keras.preprocessing.sequence.pad_sequences(X_data, maxlen=MAX_SENTENCE_LENGTH)
print('Shape of data tensor:', X_data.shape)

X_test_data = word_sequences[len(train_data):]
X_test_data = keras.preprocessing.sequence.pad_sequences(X_test_data, maxlen=MAX_SENTENCE_LENGTH)

Y_data = train_data['target']

# Release the large intermediates; note this makes the cell non-re-runnable
# without re-executing the cells that created them.
del word_sequences, docs, sentences, train_data
gc.collect()

Fill the embedding layer's weights with the pre-trained GloVe vectors, which are loaded in the cells below.

In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
 
# Width of each embedding vector; sizes the weight matrix built in load_embedding().
EMBEDDING_DIM = 300

# Stemmers used by load_embedding() as fallbacks when a token has no vector.
ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer('english')
# NOTE(review): `lm` is not referenced anywhere else in the visible notebook.
lm = WordNetLemmatizer() 

def correction(word, vocab=None):
    """Return one spelling candidate for ``word``.

    Args:
        word: The token to correct.
        vocab: Container of known words (anything supporting ``in``). Defaults
            to the notebook's ``word2index``.

    Returns:
        ``word`` itself when it is known or when no one-edit neighbour is
        known; otherwise the lexicographically smallest known neighbour, so
        the choice does not depend on set iteration order.
    """
    return min(candidates(word, vocab))


def candidates(word, vocab=None):
    "Generate possible spelling corrections for word."
    return known([word], vocab) or known(edits1(word), vocab) or [word]


def known(words, vocab=None):
    "The subset of `words` that appear in `vocab` (default: `word2index`)."
    if vocab is None:
        vocab = word2index
    return set(w for w in words if w in vocab)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def _lookup_forms(key, word2vec):
    """Lazily yield the spellings of ``key`` to try, in priority order.

    Being a generator, the costlier fallbacks (stemming, spelling correction)
    only run when every earlier form has missed.
    """
    yield key
    yield key.lower()           # Lower
    yield key.upper()           # Upper
    yield key.capitalize()      # Capitalize
    yield ps.stem(key)          # PorterStemmer
    yield lc.stem(key)          # LancasterStemmer
    yield sb.stem(key)          # SnowballStemmer
    yield lemma_dict[key]       # Lemmatization
    if len(key) > 1:
        # Spelling correction must search the embedding vocabulary: `key` is
        # always present in word2index, so checking that would return `key`
        # unchanged and never find a vector.
        yield correction(key, word2vec)


def load_embedding(word2vec):
    """Build the embedding weight matrix for the notebook vocabulary.

    For every token in ``word2index`` the forms produced by ``_lookup_forms``
    are tried in order and the first one found in ``word2vec`` supplies the
    row. Tokens with no match get a vector of -1s. Row 0 is left as zeros.

    Args:
        word2vec: Mapping from word to its embedding vector.

    Returns:
        A ``(vocab_size, EMBEDDING_DIM)`` NumPy array of weights.

    Side effects:
        Prints the ten most frequent unmatched tokens, the number of unmatched
        tokens, and the share of token occurrences that are unmatched.
    """
    oov_count = 0      # occurrences of tokens without a vector
    vocab_count = 0    # occurrences of tokens with a vector
    embedding_weights = np.zeros((vocab_size, EMBEDDING_DIM))
    unknown_vector = np.zeros((EMBEDDING_DIM,), dtype=np.float32) - 1.
    unknown_words = {}  # unmatched token -> its corpus frequency

    for key, i in tqdm(word2index.items()):
        match = next((form for form in _lookup_forms(key, word2vec) if form in word2vec), None)
        if match is not None:
            embedding_weights[i] = word2vec[match]
            vocab_count += vocab_freq[key]
        else:
            embedding_weights[i] = unknown_vector
            unknown_words[key] = vocab_freq[key]
            oov_count += vocab_freq[key]

    most_frequent_unknown = sorted(unknown_words.items(), key=lambda item: item[1], reverse=True)[:10]
    print('Top 10 Null word embeddings: ')
    print(most_frequent_unknown)
    print('\n')
    print('Null word embeddings: %d' % len(unknown_words))
    # Share of all token occurrences that have no vector; guarded for an empty vocabulary.
    total_count = vocab_count + oov_count
    oov_percentage = 100 * oov_count / total_count if total_count else 0.0
    print('Null word embeddings percentage: %.2f%%' % oov_percentage)

    return embedding_weights

In [None]:
# Read the GloVe file into a dict, turn it into the embedding weight matrix,
# then drop the dict straight away to free memory.
print('loading glove_vec')
glove_vec = load_vec(GLOVE_FILE)
glove_weights = load_embedding(glove_vec)
del glove_vec
gc.collect()

Define the model. I usually use metrics such as F1-score and AUC for binary classification models. The model has three main stages. The first is an Embedding layer that turns the X_train data (the word indexes of the vocabulary) into EMBEDDING_DIM-dimensional vectors. The second is a bidirectional recurrent layer (a GRU in the code below, combined with a Transformer block), which is well suited to processing sequential data. The last is the output layer with a sigmoid activation function. I set the [bias_initializer](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#optional_set_the_correct_initial_bias) parameter of the output layer because the dataset is imbalanced.

In [None]:
import tensorflow as tf

# Pick a tf.distribute strategy: TPU if one can be connected, otherwise a
# mirrored strategy when at least one GPU is visible, otherwise the default
# strategy.
strategy = None

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Use TPU')
except ValueError:
    # Only ValueError is treated as "no TPU available"; any other exception propagates.
    if len(tf.config.list_physical_devices('GPU')) > 0:
        strategy = tf.distribute.MirroredStrategy()
        print('Use GPU')
    else:
        strategy = tf.distribute.get_strategy()
        print('Use CPU')

In [None]:
def f1_smart(y_true, y_pred):
    """Find the decision threshold that maximises F1 in one sorted pass.

    Args:
        y_true: 1-D NumPy array of 0/1 labels.
        y_pred: 1-D NumPy array of scores, same length as ``y_true``.

    Returns:
        ``(best_f1, threshold)`` where ``threshold`` is the midpoint between
        the two adjacent sorted scores at the best cut.
    """
    order = np.argsort(y_pred)
    n_positive = y_true.sum()
    # True positives left above each cut once the lowest-scored samples are
    # labelled negative.
    tp_above_cut = n_positive - np.cumsum(y_true[order[:-1]])
    # (predicted positives + actual positives) for each cut, i.e. the F1
    # denominator; the factor 2 of the numerator is applied on return.
    denominators = np.arange(y_true.shape[0] + n_positive - 1, n_positive, -1)
    half_f1 = tp_above_cut / denominators
    best_cut = np.argmax(half_f1)
    threshold = (y_pred[order[best_cut]] + y_pred[order[best_cut + 1]]) / 2
    return 2 * half_f1[best_cut], threshold

def plot_history(history):
    """Plot training and validation loss per epoch from a Keras ``History``.

    Reads the ``'loss'`` and ``'val_loss'`` series from ``history.history``
    and shows them on a single figure.
    """
    for series_name in ('loss', 'val_loss'):
        plt.plot(history.history[series_name])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'])
    plt.show()

# Inverse-frequency class weights: each class's weight times its row count
# equals total / 2, so both classes carry the same overall weight in the loss.
weight_for_0 = (1 / negative) * (total) / 2.0 
weight_for_1 = (1 / positive) * (total) / 2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

# NOTE(review): `early_stopping` and `checkpoint` are defined here, but the
# training cell only passes `reduce_lr` to model.fit.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=1,
    patience=1,
    mode='min',
    restore_best_weights=True)

checkpoint = keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, mode='min')

# Halve the learning rate after one epoch without val_loss improvement.
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1)

In [None]:
from tensorflow.keras import layers

class TransformerLayer(layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Each sub-block is wrapped with dropout, a residual connection and layer
    normalisation. The feature size is read from ``input_shape[2]`` at build
    time, so the layer expects rank-3 inputs and returns the same shape.

    Args:
        num_heads: Number of attention heads.
        hidden_size: Width of the hidden feed-forward layer.
        dropout_rate: Dropout rate applied after attention and after the FFN.
    """

    def __init__(self, num_heads, hidden_size, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        """Create the sub-layers once the input feature size is known."""
        feature_dim = input_shape[2]
        self.att = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=feature_dim)
        self.ffn = keras.Sequential([
            layers.Dense(self.hidden_size, activation="relu"),
            layers.Dense(feature_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(self.dropout_rate)
        self.dropout2 = layers.Dropout(self.dropout_rate)

    def call(self, inputs, training):
        """Apply attention and feed-forward sub-blocks to ``inputs``."""
        # Self-attention: the sequence is both query and value.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        attention_block = self.layernorm1(inputs + attended)
        transformed = self.dropout2(self.ffn(attention_block), training=training)
        return self.layernorm2(attention_block + transformed)

    def get_config(self):
        """Return the base config plus this layer's constructor arguments."""
        return {
            **super().get_config(),
            'num_heads': self.num_heads,
            'hidden_size': self.hidden_size,
            'dropout_rate': self.dropout_rate,
        }
    
class PositionEmbedding(layers.Layer):
    """Learned embedding of the positions ``0 .. max_length - 1``.

    The sequence length is taken from the last dimension of the input shape at
    build time. The values of ``inputs`` are not used in ``call``; the result
    is one ``embeding_dim``-wide vector per position.

    Args:
        embeding_dim: Size of each position vector (spelling kept because it
            is also the config key).
    """

    def __init__(self, embeding_dim, **kwargs):
        super().__init__(**kwargs)
        self.embeding_dim = embeding_dim

    def build(self, input_shape):
        """Record the sequence length and create the embedding table."""
        self.max_length = input_shape[-1]
        self.position_embedding = layers.Embedding(
            input_dim=self.max_length,
            output_dim=self.embeding_dim,
            name='position_embedding',
        )

    def call(self, inputs):
        """Return the embeddings of all positions, in order."""
        position_ids = tf.range(start=0, limit=self.max_length, delta=1)
        return self.position_embedding(position_ids)

    def get_config(self):
        """Return the base config plus ``embeding_dim``."""
        return {**super().get_config(), 'embeding_dim': self.embeding_dim}

In [None]:
import tensorflow_addons as tfa
from keras import backend as K
from keras.layers import *
from keras.models import *
from keras.initializers import Constant

def create_model(units):
    """Build and compile the classifier.

    Frozen GloVe token embeddings feed two parallel branches: a bidirectional
    GRU, and a Transformer block applied to the token embeddings plus position
    embeddings. The branch outputs are concatenated, average-pooled over the
    sequence and passed to a single sigmoid unit.

    Args:
        units: Number of GRU units per direction.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer).
    """
    # Start the output bias at log(positive / negative) to reflect the class imbalance.
    initial_bias = Constant(np.log([positive/negative]))

    token_ids = Input(shape=(MAX_SENTENCE_LENGTH,))
    position_vectors = PositionEmbedding(EMBEDDING_DIM)(token_ids)
    token_vectors = Embedding(
        vocab_size,
        EMBEDDING_DIM,
        input_length=MAX_SENTENCE_LENGTH,
        weights=[glove_weights],
        trainable=False,
    )(token_ids)
    token_vectors = SpatialDropout1D(0.2)(token_vectors)

    recurrent_branch = Bidirectional(GRU(units, return_sequences=True))(token_vectors)
    attention_branch = TransformerLayer(2, 128, 0.2)(position_vectors + token_vectors)
    merged = Concatenate()([recurrent_branch, attention_branch])

    pooled = GlobalAveragePooling1D()(merged)
    probability = Dense(1, activation='sigmoid', bias_initializer=initial_bias)(pooled)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from IPython.display import Image
from keras.utils import plot_model

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_f1score = 0

with strategy.scope():
    model = create_model(64)
    model.summary()

    plot_model(model, 'model.png', show_shapes=True, show_layer_names=True)

    # NOTE(review): the same `model` instance keeps training across folds, so
    # from the second fold on its validation rows were already seen as
    # training rows in an earlier fold; the reported F1 may be optimistic.
    for index, (train_index, valid_index) in enumerate(kfold.split(X_data, Y_data)):
        # Only the first two of the five folds are trained.
        if index > 1:
            break
        X_train, X_val = X_data[train_index], X_data[valid_index]
        # kfold yields positional indices, so select label rows by position
        # (.iloc) instead of by index label.
        Y_train, Y_val = Y_data.iloc[train_index], Y_data.iloc[valid_index]
        history = model.fit(
            X_train, Y_train,
            epochs=10,
            batch_size=128,
            validation_data=(X_val, Y_val),
            callbacks=[reduce_lr],
            class_weight=class_weight
        )
        plot_history(history)
        Y_pred = model.predict(X_val)
        # `threshold` from the last trained fold is reused later for the test predictions.
        f1, threshold = f1_smart(Y_val.to_numpy(), np.squeeze(Y_pred))
        best_f1score = max(best_f1score, f1)
        print('Optimal F1: {:.4f} at threshold: {:.4f}\n'.format(f1, threshold))

print(f'{"#" * 30} best f1score: {best_f1score} {"#" * 30}')

In [None]:
# Display the architecture diagram written by plot_model in the training cell.
Image("model.png")

Predict on the test dataset and write to the file named as submission.csv.

In [None]:
# best_model = keras.models.load_model('best_model.h5', custom_objects={"TransformerLayer": TransformerLayer, "PositionEmbedding": PositionEmbedding})

# Binarise the test scores with `threshold`, which holds the value computed
# for the last fold trained in the training cell.
Y_test = (model.predict(X_test_data) > threshold).astype("int32")

print('Write results to submission.csv')
# One row per test question: its qid and the 0/1 prediction.
submit_data = pd.DataFrame({'qid': test_data.qid, 'prediction': Y_test.reshape(-1)})
submit_data.to_csv('submission.csv', index=False)

# Show the first lines of the written file as a sanity check.
!head submission.csv

References:

https://www.kaggle.com/ronaksvijay/qiqc-nn