# Disaster Tweets Classification: Transformer

## Table of Contents
- [1. Overview](#1.)
- [2. Import Packages and Datasets](#2.)
- [3. Data Wrangling](#3.)
- [4. Exploratory Data Analysis & Data Preprocessing](#4.)
- [5. Model Development](#5.)
- [6. Submission](#6.)

<a id="1."></a>
## 1. Overview
In this notebook I will build a disaster tweets classification model using a Transformer.

<a id="2."></a>
## 2. Import Packages and Datasets 

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold

In [None]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
train.head()

In [None]:
train.shape

In [None]:
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test.head()

In [None]:
train.location.value_counts()

<a id="3."></a>
## 3. Data Wrangling
Let's see null values for each column.

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
# Replace missing keyword/location values with empty strings so they can be
# concatenated with the tweet text later on.
# The result is assigned back to the column: an in-place method called on a
# column selection does not update the parent frame under pandas Copy-on-Write,
# and fillna avoids the np.NAN alias that NumPy 2.0 removed.
for frame in (train, test):
    for column in ("keyword", "location"):
        frame[column] = frame[column].fillna("")

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
len(test["keyword"])

<a id="4."></a>
## 4. Exploratory Data Analysis & Data Preprocessing
- Tokenize Texts
- Show statistical info of texts

In [None]:
# Build one lower-cased string per row: "<keyword> <text> <location>".
# Train rows are added first, then test rows, so the combined list can later be
# split back by len(train).
contents = []
for data in [train, test]:
    # Iterate the three columns in lockstep instead of materializing a row
    # Series with iloc for every index.
    contents.extend(
        (keyword + " " + text + " " + location).lower()
        for keyword, text, location in zip(data["keyword"], data["text"], data["location"])
    )

### Tokenize texts

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()

tokenizer.fit_on_texts(contents)

In [None]:
tokens = tokenizer.texts_to_sequences(contents)

In [None]:
# Count how often each word occurs across all tokenized texts.
word_counter = {}
for sequence in tokens:
    for word_index in sequence:
        key = tokenizer.index_word[word_index]
        word_counter[key] = word_counter.get(key, 0) + 1

In [None]:
# Tabulate the word frequencies, most frequent words first.
word_freq = pd.DataFrame(
    {"word": word_counter.keys(), "count": word_counter.values()}
).sort_values(by="count", ascending=False)
word_freq.head(10)

As we can see, about 25000 words appear just once and about 3000 words appear twice. It would be hard to find patterns in them without prior knowledge: a model cannot learn from words that appear only once or twice.

In [None]:
word_freq["count"].plot(kind="hist")

In [None]:
word_freq[word_freq["count"] < 100].plot(kind="hist")

In [None]:
word_freq[word_freq["count"] <= 10].plot(kind="hist")

In [None]:
word_freq[word_freq["count"] <= 3].plot(kind="hist")

### Remove words that seldom appear

In [None]:
# Words that occur at most `lower_thresold` times are collected for removal.
lower_thresold = 1
is_rare = word_freq["count"] <= lower_thresold
word_appear_less = word_freq.loc[is_rare, "word"].tolist()

In [None]:
len(word_appear_less)

### Remove words that appear too often

Let's look at the 100 most frequent words and choose stop words from them, leaving out words that are related to disasters so that they stay in the text.

In [None]:
list(word_freq["word"][:100])

In [None]:
# Hand-picked stop words to drop from the texts. They are matched against the
# lower-cased tokens produced by the tokenizer, so URL fragments such as
# 'http', 'https', 'co' and 't' appear here as separate entries.
stop_words = ['co',
 't',
 'http',
 'the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'on',
 'for',
 'you',
 'my',
 'it',
 'with',
 'that',
 'by',
 'at',
 'this',
 'new',
 'from',
 'https',
 'are',
 'be',
 'was',
 'have',
 'like',
 'as',
 'up',
 'just',
 'your',
 'not',
 'but',
 'me',
 'so',
 'no',
 'all',
 'will',
 'after',
 'an',
 'we',
 "i'm",
 'if',
 'when',
 'has',
 'via',
 'get',
 'or',
 '2',
 'more',
 'about',
 'now',
 'he',
 'how',
 'they',
 'one',
 'people',
 'what',
 "it's",
 'who',
 'news',
 'over',
 'been',
 'do',
 'ca',
 'into',
 'can',
 'there',
 'video',
 'u',
 '3',
 'would',
 'world',
 'her',
 'us',
 's',
 'his',
 'than',
 "'",
 '1',
 'still',
 'some'
]

In [None]:
# Every word to drop: the rare words plus the hand-picked stop words.
exclude_set = set(word_appear_less).union(stop_words)

In [None]:
# Rebuild every text from its token ids, skipping any word in exclude_set.
new_sentences = [
    " ".join(
        word
        for word in (tokenizer.index_word[item] for item in token)
        if word not in exclude_set
    )
    for token in tokens
]

In [None]:
new_sentences[:10]

Create a new tokenizer to preprocess these texts again.

In [None]:
new_tokenizer = tf.keras.preprocessing.text.Tokenizer()
new_tokenizer.fit_on_texts(new_sentences)

In [None]:
new_tokens = new_tokenizer.texts_to_sequences(new_sentences)

### Lengths

In [None]:
new_tokens_lengths = [len(token) for token in  new_tokens]

In [None]:
lengths = pd.DataFrame({"length":new_tokens_lengths})

In [None]:
lengths.describe()

In [None]:
# Bring every sequence to exactly 30 token ids: shorter ones are padded at the
# end, longer ones are truncated at the end.
# NOTE: 30 must stay in sync with max_content_length, which sets the model's
# input length.
padding_tokens = tf.keras.preprocessing.sequence.pad_sequences(new_tokens, maxlen=30, padding='post', truncating='post')

In [None]:
# `contents` was built from the train rows followed by the test rows, so the
# first len(train) padded sequences are the training features and the rest are
# the test features.
x_train = padding_tokens[:len(train)]
y_train = train["target"]
x_test = padding_tokens[len(train):]

In [None]:
train["target"].value_counts()

In [None]:
x_train.shape

In [None]:
x_train[2]

<a id="5."></a>
## 5. Model Development

### BinaryCrossEntropy with weights 
This version of BinaryCrossEntropy weights each class to counter the class imbalance problem.

In [None]:
class BinaryCrossEntropy(tf.keras.losses.Loss):
    """Binary cross-entropy with class weights derived from the positive rate.

    Positive samples are weighted by ``1 - postive_rate`` and negative samples
    by ``postive_rate``, so the rarer class contributes more to the loss.

    Args:
        postive_rate: Fraction of positive samples in the training data.
            The default of 0.5 weights both classes equally.
    """

    def __init__(self, postive_rate = 0.5):
        super().__init__()
        self.negative_weights = postive_rate
        self.positive_weights = 1 - postive_rate

    def call(self, y_true, y_pred):
        """Return the weighted cross-entropy of each prediction.

        Args:
            y_true: Ground-truth labels (0 or 1), with as many elements as y_pred.
            y_pred: Predicted probabilities of the positive class.

        Returns:
            A tensor shaped like ``y_pred`` holding the per-element loss.
        """
        y_true = tf.cast(y_true, y_pred.dtype)
        # Give the labels the same shape as the predictions; otherwise labels
        # of shape (batch,) would broadcast against (batch, 1) predictions
        # into a (batch, batch) matrix.
        y_true = tf.reshape(y_true, tf.shape(y_pred))
        # epsilon keeps log() finite when a prediction is exactly 0 or 1.
        eps = tf.keras.backend.epsilon()
        pos = self.positive_weights * y_true * tf.math.log(y_pred + eps)
        neg = self.negative_weights * (1.0 - y_true) * tf.math.log(1.0 - y_pred + eps)
        return -(pos + neg)

### Transformer Block

In [None]:
def tranformer_block(inputs, embed_dim, num_heads, ff_dim, dropout_rate=0.1, training=True):
    """Apply one Transformer encoder block to `inputs`.

    The block is multi-head self-attention followed by a two-layer
    feed-forward network; each part is followed by dropout, a residual
    connection and layer normalization.

    Args:
        inputs: Embedded sequence whose last dimension is `embed_dim`.
        embed_dim: Embedding size; also used as the key size of each head.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout_rate: Rate of both dropout layers.
        training: Forwarded to both dropout layers. True (the default) keeps
            dropout active at inference too; None lets Keras decide.

    Returns:
        The transformed sequence, with the same last dimension as `inputs`.
    """
    # Self-attention part: the sequence attends to itself.
    attended = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs)
    attended = layers.Dropout(dropout_rate)(attended, training=training)
    attention_normed = layers.LayerNormalization(epsilon=1e-6)(inputs + attended)

    # Feed-forward part: expand to ff_dim, then project back to embed_dim.
    hidden = layers.Dense(ff_dim, activation="relu")(attention_normed)
    projected = layers.Dense(embed_dim)(hidden)
    projected = layers.Dropout(dropout_rate)(projected, training=training)
    return layers.LayerNormalization(epsilon=1e-6)(attention_normed + projected)

### Embedding Layer
Two separate Embedding Layers, one for tokens, another for positions.

In [None]:
def embedding_block(inputs, maxlen, vocab_size, embed_dim):
    """Embed token ids and add a learned position embedding.

    Args:
        inputs: Token ids, `maxlen` per sequence.
        maxlen: Sequence length; also the number of position embeddings.
        vocab_size: Number of distinct token ids.
        embed_dim: Size of each embedding vector.

    Returns:
        The sum of the token embeddings and the position embeddings.
    """
    # One embedding vector per position 0 .. maxlen - 1.
    position_ids = tf.range(start=0, limit=maxlen, delta=1)
    position_vectors = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)(position_ids)
    token_vectors = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    return token_vectors + position_vectors

### Text Classification Model

In [None]:
def get_model(maxlen, vocab_size, embed_dim, num_heads, ff_dim):
    """Build the (uncompiled) Transformer-based binary text classifier.

    Args:
        maxlen: Number of token ids per input sequence.
        vocab_size: Number of distinct token ids.
        embed_dim: Size of the token/position embeddings.
        num_heads: Number of attention heads in the Transformer block.
        ff_dim: Width of the Transformer block's feed-forward layer.

    Returns:
        A keras.Model mapping token ids to one sigmoid probability.
    """
    inputs = layers.Input(shape=(maxlen, ))
    x = embedding_block(inputs, maxlen, vocab_size, embed_dim)
    # training=None lets Keras enable the block's dropout only while fitting.
    # The block's default (training=True) would keep dropout active during
    # predict() and validation as well, making those outputs random.
    x = tranformer_block(x, embed_dim, num_heads, ff_dim, dropout_rate=0.1, training=None)
    # Average over the sequence axis to get one vector per text.
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    return keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# +1 because the Keras Tokenizer assigns word indices starting at 1; index 0 is
# never assigned to a word and is the value pad_sequences pads with.
vocab_size = len(new_tokenizer.index_word) + 1
# Must equal the maxlen the sequences were padded to.
max_content_length = 30
print(vocab_size, max_content_length)

In [None]:
# Build a model only to visualize the architecture.
# NOTE(review): this one uses 2 heads and ff_dim 32, while the cross-validation
# loop trains models with 4 heads and ff_dim 64 - confirm which is intended.
model = get_model(max_content_length, vocab_size, 128, 2, 32)
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# 5-fold stratified cross-validation: train one model per fold, restore its
# best checkpoint, report validation metrics, and keep the model for ensembling.
models = []
tf.keras.backend.clear_session()
# Work on a plain array so the fold indices are applied positionally rather
# than as pandas index labels.
targets = np.asarray(y_train)
splitter = StratifiedKFold(5, shuffle=True, random_state=42)
for index, (train_indices, val_indices) in enumerate(splitter.split(x_train, targets), start=1):
    print(f"Fold {index}")
    train_features, train_targets = x_train[train_indices], targets[train_indices]
    validation_features, validation_targets = x_train[val_indices], targets[val_indices]
    model_checkpoint_path = f"model{index}.h5"
    model = get_model(max_content_length, vocab_size, 128, 4, 64)
    # Weight the classes by this fold's positive rate (as a plain Python float).
    loss = BinaryCrossEntropy(float(train_targets.mean()))
    adam = tf.keras.optimizers.Adam(1e-4)
    model.compile(loss=loss, optimizer=adam, metrics=["accuracy"])
    early_stop = tf.keras.callbacks.EarlyStopping(patience=5)
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(patience=2)
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        model_checkpoint_path,
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=True,
    )
    history = model.fit(
        train_features,
        train_targets,
        validation_data=(validation_features, validation_targets),
        epochs=100,
        # The learning-rate scheduler was created but never passed to fit().
        callbacks=[early_stop, reduce_lr, model_checkpoint],
    )
    pd.DataFrame(history.history).plot(kind="line")
    plt.title(f"Performance of Fold {index}")
    plt.show()
    # Evaluate with the best weights (highest val_accuracy), not the last epoch's.
    model.load_weights(model_checkpoint_path)
    y_val_pred = (model.predict(validation_features) > 0.5).astype(int).reshape(-1)
    cm = confusion_matrix(validation_targets, y_val_pred)
    sns.heatmap(cm)
    plt.show()
    print("Classification Report: \n")
    print(classification_report(validation_targets, y_val_pred))
    acc_score = accuracy_score(validation_targets, y_val_pred)
    print(f"Accuracy Score: {acc_score:.2f}")
    models.append(model)

<a id="6."></a>
## 6. Submission

In [None]:
# Ensemble the fold models: average their predicted probabilities, then
# threshold at 0.5 to get the final 0/1 labels.
fold_predictions = [model.predict(x_test).reshape(-1) for model in models]
y_test = np.mean(fold_predictions, axis=0)
y_test = (y_test > 0.5).astype(int)
submission = pd.DataFrame({"id": test["id"], "target": y_test})
submission.to_csv("submission.csv", index=False)