# Introduction
**This notebook introduces how to solve an imbalanced text classification problem with LSTM networks and word embeddings.**

Import some required libraries.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

import gc
import sys

from tqdm.notebook import tqdm
# Register `progress_map` / `progress_apply` on pandas objects. `pandas` is invoked on
# the tqdm class itself; calling `tqdm()` first would construct an extra bar instance
# that is never iterated.
tqdm.pandas()
pd.set_option('display.max_colwidth', None)

# Set seed for experiment reproducibility
seed = 1024
tf.random.set_seed(seed)
np.random.seed(seed)

def print_size(var):
    """Print the size of ``var`` in megabytes with two decimals.

    The size is whatever ``sys.getsizeof`` reports for the object itself.
    """
    size_in_mb = sys.getsizeof(var) / 1024 / 1024
    print('%.2fMB' % size_in_mb)

Load the train and test dataset.

In [None]:
# Competition CSVs as exposed under the Kaggle input directory.
train_csv_path = '/kaggle/input/quora-insincere-questions-classification/train.csv'
test_csv_path = '/kaggle/input/quora-insincere-questions-classification/test.csv'

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

You don't need the entire dataset if you just want a quick run; uncomment the lines below to work on a subset.

In [None]:
# train_data = train_data[0:100000]
# test_data = test_data[0:10000]

Let's see what's in the dataset and print the first 5 rows in train data.

In [None]:
# Last expression of the cell, so the notebook renders the first five rows as a table.
train_data.head()

Let's see how imbalanced the dataset is.

In [None]:
# bincount index 0 counts label 0 (negative) and index 1 counts label 1 (positive);
# the unpacking requires that exactly these two labels occur.
class_counts = np.bincount(train_data['target'])
negative, positive = class_counts
total = negative + positive

positive_share = 100 * positive / total
print('total: {}    positive: {} ({:.2f}% of total)'.format(total, positive, positive_share))

Text cleaning before word vectorization: normalize tags, contractions and punctuation so that the words can later be converted into numbers and fed into a neural network.

In [None]:
import re

def clean_tag(text):
    """Replace inline math markup and URLs in ``text`` with placeholder tags.

    Args:
        text: The raw question string.

    Returns:
        The string with every ``[math]...math]`` span replaced by ``[formula]``
        and, when the text contains ``http`` or ``www``, every URL-like token
        replaced by ``[url]``.
    """
    # The patterns are raw strings so that regex escapes such as ``\[`` and ``\w``
    # reach the regex engine untouched instead of being parsed (and warned about)
    # as invalid string escapes.
    if '[math]' in text:
        text = re.sub(r'\[math\].*?math\]', '[formula]', text)

    # The URL pattern is loose: it matches any dotted token (e.g. "3.5"), so it is
    # only applied when the text hints that a URL is present.
    if 'http' in text or 'www' in text:
        text = re.sub(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+', '[url]', text)
    return text

# Lookup table from a contraction (exact, case-sensitive token) to its expanded form.
# Later duplicates of a key, if any, win, as with any dict literal.
contraction_mapping = {
    "We'd": "We had", "That'd": "That had", "AREN'T": "Are not", "HADN'T": "Had not",
    "Could've": "Could have", "LeT's": "Let us", "How'll": "How will", "They'll": "They will",
    "DOESN'T": "Does not", "HE'S": "He has", "O'Clock": "Of the clock", "Who'll": "Who will",
    "What'S": "What is", "Ain't": "Am not", "WEREN'T": "Were not", "Y'all": "You all",
    "Y'ALL": "You all", "Here's": "Here is", "It'd": "It had", "Should've": "Should have",
    "I'M": "I am", "ISN'T": "Is not", "Would've": "Would have", "He'll": "He will",
    "DON'T": "Do not", "She'd": "She had", "WOULDN'T": "Would not", "She'll": "She will",
    "IT's": "It is", "There'd": "There had", "It'll": "It will", "You'll": "You will",
    "He'd": "He had", "What'll": "What will", "Ma'am": "Madam", "CAN'T": "Can not",
    "THAT'S": "That is", "You've": "You have", "She's": "She is", "Weren't": "Were not",
    "They've": "They have", "Couldn't": "Could not", "When's": "When is", "Haven't": "Have not",
    "We'll": "We will", "That's": "That is", "We're": "We are",
    # Fixed: the expansion previously carried a stray apostrophe ("They' are").
    "They're": "They are",
    "You'd": "You would", "How'd": "How did", "What're": "What are", "Hasn't": "Has not",
    "Wasn't": "Was not", "Won't": "Will not", "There's": "There is", "Didn't": "Did not",
    "Doesn't": "Does not", "You're": "You are", "He's": "He is", "SO's": "So is",
    "We've": "We have", "Who's": "Who is", "Wouldn't": "Would not", "Why's": "Why is",
    "WHO's": "Who is", "Let's": "Let us", "How's": "How is", "Can't": "Can not",
    "Where's": "Where is", "They'd": "They had", "Don't": "Do not", "Shouldn't": "Should not",
    "Aren't": "Are not", "ain't": "is not", "What's": "What is", "It's": "It is",
    "Isn't": "Is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
    "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would",
    "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
    "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
    "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not",
    "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
    "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so as", "this's": "this is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "here's": "here is", "they'd": "they would", "they'd've": "they would have",
    "they'll": "they will", "they'll've": "they will have", "they're": "they are",
    "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are", "what's": "what is",
    "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
    "where's": "where is", "where've": "where have", "who'll": "who will",
    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have",
}

def clean_contractions(text):
    """Expand contractions in ``text`` using ``contraction_mapping``.

    Apostrophe look-alikes are first normalized to a plain ``'``. The text is
    then split on single spaces and each token that exactly matches a key of
    ``contraction_mapping`` is swapped for its expansion; all other tokens
    (including contractions with punctuation attached) pass through unchanged.
    """
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")

    expanded_tokens = (contraction_mapping.get(token, token) for token in text.split(" "))
    return ' '.join(expanded_tokens)

puncts = [",",".",'"',":",")","(","-","!","?","|",";","'","$","&","/","[","]",">","%","=","#","*","+","\\","•","~","@","£","·","_","{","}","©","^","®","`","<","→","°","€","™","›","♥","←","×","§","″","′","█","…","“","★","”","–","●","►","−","¢","¬","░","¡","¶","↑","±","¿","▾","═","¦","║","―","¥","▓","—","‹","─","▒","：","⊕","▼","▪","†","■","’","▀","¨","▄","♫","☆","¯","♦","¤","▲","¸","⋅","‘","∞","∙","）","↓","、","│","（","»","，","♪","╩","╚","・","╦","╣","╔","╗","▬","❤","≤","‡","√","◄","━","⇒","▶","≥","╝","♡","◊","。","✈","≡","☺","✔","↵","≈","✓","♣","☎","℃","◦","└","‟","～","！","○","◆","№","♠","▌","✿","▸","⁄","□","❖","✦","．","÷","｜","┃","／","￥","╠","↩","✭","▐","☼","☻","┐","├","«","∼","┌","℉","☮","฿","≦","♬","✧","〉","－","⌂","✖","･","◕","※","‖","◀","‰","\x97","↺","∆","┘","┬","╬","،","⌘","⊂","＞","〈","⎙","？","☠","⇐","▫","∗","∈","≠","♀","♔","˚","℗","┗","＊","┼","❀","＆","∩","♂","‿","∑","‣","➜","┛","⇓","☯","⊖","☀","┳","；","∇","⇑","✰","◇","♯","☞","´","↔","┏","｡","◘","∂","✌","♭","┣","┴","┓","✨","\xa0","˜","❥","┫","℠","✒","［","∫","\x93","≧","］","\x94","∀","♛","\x96","∨","◎","↻","⇩","＜","≫","✩","✪","♕","؟","₤","☛","╮","␊","＋","┈","％","╋","▽","⇨","┻","⊗","￡","।","▂","✯","▇","＿","➤","✞","＝","▷","△","◙","▅","✝","∧","␉","☭","┊","╯","☾","➔","∴","\x92","▃","↳","＾","׳","➢","╭","➡","＠","⊙","☢","˝","∏","„","∥","❝","☐","▆","╱","⋙","๏","☁","⇔","▔","\x91","➚","◡","╰","\x85","♢","˙","۞","✘","✮","☑","⋆","ⓘ","❒","☣","✉","⌊","➠","∣","❑","◢","ⓒ","\x80","〒","∕","▮","⦿","✫","✚","⋯","♩","☂","❞","‗","܂","☜","‾","✜","╲","∘","⟩","＼","⟨","·","✗","♚","∅","ⓔ","◣","͡","‛","❦","◠","✄","❄","∃","␣","≪","｢","≅","◯","☽","∎","｣","❧","̅","ⓐ","↘","⚓","▣","˘","∪","⇢","✍","⊥","＃","⎯","↠","۩","☰","◥","⊆","✽","⚡","↪","❁","☹","◼","☃","◤","❏","ⓢ","⊱","➝","̣","✡","∠","｀","▴","┤","∝","♏","ⓐ","✎",";","␤","＇","❣","✂","✤","ⓞ","☪","✴","⌒","˛","♒","＄","✶","▻","ⓔ","◌","◈","❚","❂","￦","◉","╜","̃","✱","╖","❉","ⓡ","↗","ⓣ","♻","➽","׀","✲","✬","☉","▉","≒","☥","⌐","♨","✕","ⓝ","⊰","❘","＂","⇧","̵","➪","▁","▏","⊃","ⓛ","‚","♰","́","✏","⏑","̶","ⓢ
","⩾","￠","❍","≃","⋰","♋","､","̂","❋","✳","ⓤ","╤","▕","⌣","✸","℮","⁺","▨","╨","ⓥ","♈","❃","☝","✻","⊇","≻","♘","♞","◂","✟","⌠","✠","☚","✥","❊","ⓒ","⌈","❅","ⓡ","♧","ⓞ","▭","❱","ⓣ","∟","☕","♺","∵","⍝","ⓑ","✵","✣","٭","♆","ⓘ","∶","⚜","◞","்","✹","➥","↕","̳","∷","✋","➧","∋","̿","ͧ","┅","⥤","⬆","⋱","☄","↖","⋮","۔","♌","ⓛ","╕","♓","❯","♍","▋","✺","⭐","✾","♊","➣","▿","ⓑ","♉","⏠","◾","▹","⩽","↦","╥","⍵","⌋","։","➨","∮","⇥","ⓗ","ⓓ","⁻","⎝","⌥","⌉","◔","◑","✼","♎","♐","╪","⊚","☒","⇤","ⓜ","⎠","◐","⚠","╞","◗","⎕","ⓨ","☟","ⓟ","♟","❈","↬","ⓓ","◻","♮","❙","♤","∉","؛","⁂","ⓝ","־","♑","╫","╓","╳","⬅","☔","☸","┄","╧","׃","⎢","❆","⋄","⚫","̏","☏","➞","͂","␙","ⓤ","◟","̊","⚐","✙","↙","̾","℘","✷","⍺","❌","⊢","▵","✅","ⓖ","☨","▰","╡","ⓜ","☤","∽","╘","˹","↨","♙","⬇","♱","⌡","⠀","╛","❕","┉","ⓟ","̀","♖","ⓚ","┆","⎜","◜","⚾","⤴","✇","╟","⎛","☩","➲","➟","ⓥ","ⓗ","⏝","◃","╢","↯","✆","˃","⍴","❇","⚽","╒","̸","♜","☓","➳","⇄","☬","⚑","✐","⌃","◅","▢","❐","∊","☈","॥","⎮","▩","ு","⊹","‵","␔","☊","➸","̌","☿","⇉","⊳","╙","ⓦ","⇣","｛","̄","↝","⎟","▍","❗","״","΄","▞","◁","⛄","⇝","⎪","♁","⇠","☇","✊","ி","｝","⭕","➘","⁀","☙","❛","❓","⟲","⇀","≲","ⓕ","⎥","\u06dd","ͤ","₋","̱","̎","♝","≳","▙","➭","܀","ⓖ","⇛","▊","⇗","̷","⇱","℅","ⓧ","⚛","̐","̕","⇌","␀","≌","ⓦ","⊤","̓","☦","ⓕ","▜","➙","ⓨ","⌨","◮","☷","◍","ⓚ","≔","⏩","⍳","℞","┋","˻","▚","≺","ْ","▟","➻","̪","⏪","̉","⎞","┇","⍟","⇪","▎","⇦","␝","⤷","≖","⟶","♗","̴","♄","ͨ","̈","❜","̡","▛","✁","➩","ா","˂","↥","⏎","⎷","̲","➖","↲","⩵","̗","❢","≎","⚔","⇇","̑","⊿","̖","☍","➹","⥊","⁁","✢"];

def clean_punct(x):
    """Surround every occurrence of each entry of ``puncts`` in ``x`` with spaces.

    Entries are processed in list order, so an entry that appears more than
    once in ``puncts`` pads its occurrences again on each pass.
    """
    for mark in puncts:
        if mark not in x:
            continue
        # Splitting on the mark and re-joining with the padded mark is the same
        # as replacing each occurrence in place.
        x = f' {mark} '.join(x.split(mark))
    return x

def data_cleaning(x):
    """Run the full cleaning pipeline on one question string.

    Order matters: tags are replaced first, then contractions are expanded,
    and punctuation is padded last.
    """
    return clean_punct(clean_contractions(clean_tag(x)))

# Clean both splits the same way; progress_map shows a progress bar while mapping.
for frame in (train_data, test_data):
    frame['preprocessed_question_text'] = frame['question_text'].progress_map(data_cleaning)

Load the BERT tokenizer, then define the max sentence length. The length should be longer than most sentences in the dataset; otherwise truncation will discard a lot of useful features.

In [None]:
from transformers import BertConfig, BertTokenizer, TFBertModel

# Checkpoint name shared by the tokenizer here and by the model built later.
pretrained_model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name,
    do_lower_case=True,
)

In [None]:
batch_size = 64
max_length = 100

# The three tokenizer outputs that are kept and fed to the model.
BERT_INPUT_KEYS = ("input_ids", "token_type_ids", "attention_mask")


def encode_questions(texts):
    """Tokenize a list of question strings into a plain dict of NumPy arrays.

    Every sequence is padded or truncated to ``max_length`` tokens.

    Args:
        texts: List of cleaned question strings.

    Returns:
        Dict with the keys in ``BERT_INPUT_KEYS``, each mapped to the
        corresponding array produced by the tokenizer.
    """
    encoded = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='np'
    )
    # Keep a plain dict so that the per-fold splits can index each array by row.
    return {key: encoded[key] for key in BERT_INPUT_KEYS}


X_data = encode_questions(train_data['preprocessed_question_text'].tolist())
X_test = encode_questions(test_data['preprocessed_question_text'].tolist())

# Column vector of labels, one row per training question.
y_data = train_data['target'].to_numpy().reshape(-1,1)

In [None]:
def create_model():
    """Build the frozen-BERT + BiLSTM/BiGRU binary classifier (uncompiled).

    Reads the notebook globals ``positive``, ``negative``,
    ``pretrained_model_name`` and ``max_length``.

    Returns:
        A ``tf.keras.Model`` taking ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` and producing one sigmoid probability per example.
    """
    # Bias the output unit with log(positive/negative), i.e. the log-odds of the
    # positive class, so the initial sigmoid output equals the positive-class rate.
    prior_bias = tf.keras.initializers.Constant(np.log([positive/negative]))

    bert_config = BertConfig.from_pretrained(pretrained_model_name)
    bert_config.output_hidden_states = True

    bert = TFBertModel.from_pretrained(pretrained_model_name, config=bert_config)
    bert.bert.trainable = False  # BERT weights stay fixed; only the layers on top train

    def int32_input(layer_name):
        # All three BERT inputs share the same (max_length,) int32 signature.
        return tf.keras.layers.Input(shape=(max_length,), name=layer_name, dtype='int32')

    input_ids = int32_input('input_ids')
    input_token = int32_input('token_type_ids')
    input_attention = int32_input('attention_mask')

    bert_output = bert(
        input_ids=input_ids,
        token_type_ids=input_token,
        attention_mask=input_attention,
    )
    # Concatenate the first, second-to-last and last entries of hidden_states.
    hidden = bert_output.hidden_states
    features = tf.keras.layers.concatenate((hidden[0], hidden[-2], hidden[-1]))

    # Two parallel recurrent encoders over the same features, merged and
    # averaged over the sequence axis.
    lstm_seq = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(features)
    gru_seq = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True))(features)
    merged = tf.keras.layers.Concatenate()([lstm_seq, gru_seq])
    pooled = tf.keras.layers.GlobalAveragePooling1D()(merged)

    probability = tf.keras.layers.Dense(1, activation='sigmoid', bias_initializer=prior_bias)(pooled)

    return tf.keras.Model(inputs=[input_ids, input_attention, input_token], outputs=probability)

In [None]:
def f1_smart(y_true, y_pred):
    """Find the decision threshold that maximizes F1.

    Candidate thresholds are the midpoints between consecutive predictions
    (sorted ascending); an example is predicted positive when its score lies
    above the threshold.

    Args:
        y_true: Binary labels (0/1). Any array-like; flattened to 1-D.
        y_pred: Predicted scores, same number of elements as ``y_true``.
            Any array-like; flattened to 1-D, so column vectors are accepted.

    Returns:
        Tuple ``(best_f1, threshold)``.

    Raises:
        ValueError: If the inputs differ in length or hold fewer than two
            elements (no midpoint threshold exists).
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    n_samples = y_true.shape[0]
    if y_pred.shape[0] != n_samples:
        raise ValueError(
            'y_true and y_pred must have the same number of elements, got {} and {}'.format(
                n_samples, y_pred.shape[0]))
    if n_samples < 2:
        raise ValueError('f1_smart needs at least two samples to place a threshold')

    order = np.argsort(y_pred)
    n_positive = y_true.sum()

    # Cutting after sorted position i leaves the n-1-i highest scores predicted
    # positive. True positives among them = all positives minus those at or below i.
    true_positives = n_positive - np.cumsum(y_true[order[:-1]])
    # F1 = 2*TP / (predicted positives + actual positives); the factor 2 is applied
    # on return. The denominator counts down from (n-1)+P to 1+P.
    pred_plus_actual = np.arange(n_samples + n_positive - 1, n_positive, -1)
    half_f1 = true_positives / pred_plus_actual

    best = np.argmax(half_f1)
    threshold = (y_pred[order[best]] + y_pred[order[best + 1]]) / 2
    return 2 * half_f1[best], threshold

In [None]:
import tensorflow as tf

# Pick the distribution strategy: TPU if one can be resolved, else GPU(s), else CPU.
strategy = None

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Use TPU')
except ValueError:
    # A ValueError from the TPU setup is treated as "no TPU available".
    gpu_devices = tf.config.list_physical_devices('GPU')
    if gpu_devices:
        strategy = tf.distribute.MirroredStrategy()
        print('Use GPU')
    else:
        strategy = tf.distribute.get_strategy()
        print('Use CPU')

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from IPython.display import Image
from keras.utils import plot_model

# Inverse-frequency class weights. With the total/2 factor the summed weight over
# all rows equals `total` (negative*w0 + positive*w1 == total).
weight_for_0 = (1 / negative) * (total) / 2.0 
weight_for_1 = (1 / positive) * (total) / 2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

# Only the first FOLDS_TO_RUN of the five stratified splits are trained.
FOLDS_TO_RUN = 2

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

with strategy.scope():
    for index, (train_index, valid_index) in enumerate(kfold.split(np.zeros(len(y_data)), y_data.ravel())):
        if index >= FOLDS_TO_RUN:
            break

        # A fresh model per fold: reusing one model across folds would let it train
        # on rows that a later fold uses for validation, inflating that fold's score.
        model = create_model()
        model.compile(loss='binary_crossentropy', optimizer='adam')
        if index == 0:
            model.summary()

        # Fresh callbacks per fold so the checkpoint's best val_loss is never compared
        # across different validation sets. Each fold overwrites 'best_model.h5', so
        # after the loop the file holds the best weights of the last fold that ran,
        # matching the X_val / y_val left behind by the loop.
        checkpoint = tf.keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_loss', save_weights_only=True, save_best_only=True, mode='min')
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=1)

        y_train, y_val = y_data[train_index], y_data[valid_index]
        X_train = {
            "input_ids": X_data["input_ids"][train_index],
            "token_type_ids": X_data["token_type_ids"][train_index],
            "attention_mask": X_data["attention_mask"][train_index],
        }
        X_val = {
            "input_ids": X_data["input_ids"][valid_index],
            "token_type_ids": X_data["token_type_ids"][valid_index],
            "attention_mask": X_data["attention_mask"][valid_index],
        }

        history = model.fit(
            X_train,
            y_train,
            epochs=5,
            batch_size=batch_size,
            validation_data=(X_val, y_val),
            class_weight=class_weight,
            callbacks=[reduce_lr, checkpoint]
        )

        # Report the fold's best achievable F1 on its held-out split.
        y_pred = model.predict(X_val)
        f1, threshold = f1_smart(y_val.ravel(), np.squeeze(y_pred))
        print('Optimal F1: {:.4f} at threshold: {:.4f}\n'.format(f1, threshold))

Predict on the test dataset and write the results to a file named submission.csv.

In [None]:
with strategy.scope():
    # Rebuild the architecture and restore the checkpointed weights.
    best_model = create_model()
    best_model.load_weights('best_model.h5')

    # Tune the threshold with the same weights that score the test set. Previously the
    # threshold came from `model` (last-epoch weights) but was applied to `best_model`.
    # X_val / y_val are the validation split left over from the last training fold.
    y_pred = best_model.predict(X_val)
    f1, threshold = f1_smart(y_val, np.squeeze(y_pred))
    print('Optimal F1: {:.4f} at threshold: {:.4f}\n'.format(f1, threshold))
    
    Y_test = (best_model.predict(X_test) > threshold).astype("int32")

    print('Write results to submission.csv')
    submit_data = pd.DataFrame({'qid': test_data.qid, 'prediction': Y_test.reshape(-1)})
    submit_data.to_csv('submission.csv', index=False)

!head submission.csv