# Disaster Tweets Classification: Switch Transformer

## Table of Contents
- Overview
- Configuration
- Import Packages and Datasets
- Data Wrangling
- EDA & Preprocessing
- Model Development
- Model Evaluation
- Submission
- References

# Overview
In this notebook I build a disaster-tweet classification model with a Switch Transformer. The notebook is also compatible with Colab, and possibly other platforms. When you run it in Colab it is even more convenient, because your results can be submitted automatically.

## Setup

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold
import os

In [None]:
class Config:
    """Hyperparameters, dataset location and output settings for this notebook.

    All values are class attributes, so they can be read either from the class
    or from the module-level ``config`` instance created below.
    """

    # --- Model architecture ---
    embed_dim = 64  # Embedding size for each token.
    num_heads = 2  # Number of attention heads.
    ff_dim = 32  # Hidden layer size in the feedforward network.
    num_experts = 10  # Number of experts used in the Switch Transformer.
    dropout_rate = 0.25  # Dropout rate.

    # --- Optimisation ---
    batch_size = 50  # Batch size.
    learning_rate = 1e-4  # Learning rate.
    num_epochs = 3  # Number of epochs.

    # --- Text encoding ---
    # Initial vocabulary size; the notebook overwrites it once the final
    # tokenizer has been fitted.
    vocab_size = 5000
    num_tokens_per_example = 30  # Tokens per tweet.
    # Total number of tokens per batch.
    num_tokens_per_batch = batch_size * num_tokens_per_example

    # --- Platform and files ---
    # True when the Kaggle input directory exists.
    is_kaggle_platform = os.path.exists("/kaggle/input")
    dataset_name = "nlp-getting-started"
    if is_kaggle_platform:
        data_path = "/kaggle/input/%s/" % dataset_name
    else:
        # Elsewhere the CSV files are read from the working directory.
        data_path = ""
    submit_filename = "submission.csv"


config = Config()

In [None]:
# Outside Kaggle (e.g. on Colab): make sure the Kaggle CLI is installed and has
# credentials, then download the competition files. The `!` lines are IPython
# shell magics and only work inside a notebook.
if not config.is_kaggle_platform:
  try:
    import kaggle
  except:
    # NOTE(review): the bare `except` installs the package on *any* failure
    # of the import, not only when it is missing.
    !pip install kaggle
  if not os.path.exists("/root/.kaggle/kaggle.json"):
    # Replace the {Your user name} and {Your API Key} placeholders below with
    # your own Kaggle credentials before running this cell.
    # NOTE(review): `>>` appends to the file and the command does not create
    # /root/.kaggle itself - confirm the directory exists.
    !echo "{\"username\":\"{Your user name}\",\"key\":\"{Your API Key}\"}" >> /root/.kaggle/kaggle.json
    !chmod 600 /root/.kaggle/kaggle.json
  # NOTE(review): nothing in this cell extracts the downloaded files, while the
  # following cells read train.csv/test.csv from config.data_path - confirm.
  !kaggle competitions download -c $config.dataset_name

In [None]:
# Load the training split (its "target" column is used as the label later on)
# and preview the first rows.
train = pd.read_csv(config.data_path + "train.csv")
train.head()

In [None]:
# Load the test split, whose "id" column is reused for the submission file.
test = pd.read_csv(config.data_path + "test.csv")
test.head()

In [None]:
# (rows, columns) of the test split.
test.shape

## Data Wrangling
Let's see null values for each column.

In [None]:
# Number of missing values per column in the training split.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test split.
test.isnull().sum()

In [None]:
# Replace missing keyword/location values with empty strings so the columns
# can be concatenated with the tweet text later on.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on a selected column: that chained in-place
# form is deprecated in pandas and does not modify the DataFrame once
# copy-on-write is active. `fillna` also avoids the `np.NAN` alias, which was
# removed in NumPy 2.0.
train["keyword"] = train["keyword"].fillna("")
train["location"] = train["location"].fillna("")
test["keyword"] = test["keyword"].fillna("")
test["location"] = test["location"].fillna("")

## EDA & Preprocessing
- Tokenize Texts
- Remove words that seldom appear
- Remove stop words

In [None]:
# Build one lower-cased document per tweet, "<keyword> <text> <location>",
# for the training rows first and the test rows after them (the later
# train/test split of the padded sequences relies on this order).
#
# The columns are concatenated as whole Series instead of fetching every row
# with `iloc`, which is much slower. This assumes the missing
# keyword/location values have already been replaced by empty strings.
contents = []
for data in [train, test]:
    combined = data["keyword"] + " " + data["text"] + " " + data["location"]
    contents.extend(combined.str.lower())

## Tokenize texts

In [None]:
# Fit a Keras tokenizer with default settings on the combined train + test
# documents.
tokenizer = tf.keras.preprocessing.text.Tokenizer()

tokenizer.fit_on_texts(contents)

In [None]:
# Convert every document into its list of integer word indices.
tokens = tokenizer.texts_to_sequences(contents)

In [None]:
# Count how often each word occurs across all tokenized documents.
word_counter = {}
for sequence in tokens:
    for word_index in sequence:
        # Map the integer index back to the word it stands for.
        word = tokenizer.index_word[word_index]
        word_counter[word] = word_counter.get(word, 0) + 1

In [None]:
# Tabulate the word counts, most frequent first, and preview the top ten.
word_freq = pd.DataFrame(
    {"word": word_counter.keys(), "count": word_counter.values()}
)
word_freq = word_freq.sort_values(by="count", ascending=False)
word_freq.head(10)

### Number of words

In [None]:
# Number of distinct words counted above.
len(word_counter)

There are 25000 words that appear just once and 3000 words that appear twice. It would be hard to find patterns in them without prior knowledge: a model cannot learn from words that appear only once or twice.

In [None]:
# Histogram of the occurrence counts of all words.
word_freq["count"].plot(kind="hist")

In [None]:
# Same histogram restricted to words that occur fewer than 100 times.
word_freq[word_freq["count"] < 100].plot(kind="hist")

In [None]:
# ... restricted to words that occur at most 10 times.
word_freq[word_freq["count"] <= 10].plot(kind="hist")

In [None]:
# ... restricted to words that occur at most 3 times.
word_freq[word_freq["count"] <= 3].plot(kind="hist")

### Remove words that seldom appear

In [None]:
# Words that occur at most `lower_thresold` times are treated as too rare to
# learn from and are collected for removal.
lower_thresold = 3
is_rare = word_freq["count"] <= lower_thresold
word_appear_less = word_freq.loc[is_rare, "word"].tolist()

In [None]:
# Number of rare words that will be dropped.
len(word_appear_less)

### Remove stop words

Let's look at the 100 most frequent words and choose the stop words from them, leaving out of the stop-word list any words that are related to disasters.

In [None]:
# The 100 most frequent words (word_freq is sorted by descending count) are
# the candidates from which the stop words are picked by hand.
stopword_candidates = word_freq["word"].iloc[:100].tolist()
print(stopword_candidates)

In [None]:
# Hand-picked stop words to remove from the texts; the list includes URL
# fragments such as 'co', 't', 'http' and 'https'.
stop_words = ['co', 't', 'http', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is', 'on', 'for', 'you', 'my', 'it', 'with', 'that', 'by', 'at', 'this', 'new', 'from', 'https', 'are', 'be', 'was', 'have', 'like', 'as', 'up', 'just', 'your', 'not', 'but', 'me', 'so', 'no', 'all', 'will', 'after', 'an', 'we', "i'm", 'if', 'when', 'has', 'via', 'get', 'or', '2', 'more', 'about', 'now', 'he', 'how', 'they', 'one', 'people', 'what', "it's", 'who', 'news', 'over', 'been', 'do', 'ca', 'into', 'can', 'there', 'video', 'u', '3', 'would', 'world', 'her', 'us', 's', 'his', 'than', "'", '1', 'still', 'some']
print(stop_words)

In [None]:
# Every word to drop: the rare words plus the stop words, as a set for fast
# membership tests.
exclude_set = set(word_appear_less).union(stop_words)

In [None]:
# Rebuild every document from its token indices, keeping only the words that
# are not in the exclusion set.
new_sentences = [
    " ".join(
        word
        for word in (tokenizer.index_word[word_index] for word_index in sequence)
        if word not in exclude_set
    )
    for sequence in tokens
]

In [None]:
# Preview the first ten filtered documents.
new_sentences[:10]

Create a new tokenizer to preprocess these texts again.

In [None]:
# Fit a second tokenizer on the filtered documents so the word indices only
# cover the words that were kept.
new_tokenizer = tf.keras.preprocessing.text.Tokenizer()
new_tokenizer.fit_on_texts(new_sentences)

In [None]:
# Integer sequences of the filtered documents; these feed the model.
new_tokens = new_tokenizer.texts_to_sequences(new_sentences)

## Lengths

In [None]:
# Number of tokens in each filtered document.
new_tokens_lengths = list(map(len, new_tokens))

In [None]:
# Put the lengths into a DataFrame to summarise them.
lengths = pd.DataFrame({"length":new_tokens_lengths})

In [None]:
# Summary statistics of the sequence lengths.
lengths.describe()

In [None]:
# Pad or truncate every sequence at its end to a fixed length. The length is
# taken from the config (instead of a hard-coded 30) because the model's input
# layer is sized with the same `config.num_tokens_per_example` value, so the
# two must always agree.
padding_tokens = tf.keras.preprocessing.sequence.pad_sequences(
    new_tokens,
    maxlen=config.num_tokens_per_example,
    padding="post",
    truncating="post",
)

In [None]:
# The padded sequences hold the training rows first and the test rows after
# them, so split them back at the number of training rows.
num_train_rows = len(train)
x_train, x_test = padding_tokens[:num_train_rows], padding_tokens[num_train_rows:]
y_train = train["target"]

There is a slight class imbalance problem.

In [None]:
# Number of training rows per target class.
train["target"].value_counts()

## Model Development

### Implement token & position embedding layer
It consists of two separate embedding layers: one for the tokens and one for the token indices (positions).

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Size of each embedding vector.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the position embeddings."""
        # The sequence length is the size of the last axis of the input.
        sequence_length = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=sequence_length, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        # The position vectors broadcast over the leading axes of the input.
        return token_vectors + position_vectors

### Implement the feedforward network

In [None]:
def create_feedforward_network(ff_dim, name=None):
    """Build a two-layer feedforward network.

    Args:
        ff_dim: Number of units of both dense layers.
        name: Optional name of the returned ``keras.Sequential`` model.

    Returns:
        A ``keras.Sequential`` made of a ReLU dense layer followed by a
        linear dense layer.
    """
    hidden_layer = layers.Dense(ff_dim, activation="relu")
    output_layer = layers.Dense(ff_dim)
    return keras.Sequential([hidden_layer, output_layer], name=name)

### Implement the load-balanced loss
This is an auxiliary loss to encourage a balanced load across experts.

In [None]:
def load_balanced_loss(router_probs, expert_mask):
    """Auxiliary loss that encourages tokens to be spread evenly over experts.

    Args:
        router_probs: Router probability of every expert for every token,
            with the tokens on axis 0 and the experts on the last axis.
        expert_mask: One-hot encoding of the expert chosen for every token,
            with the tokens on axis 0 and the experts on the last axis.

    Returns:
        A scalar: the mean of the element-wise product of the two per-expert
        averages, scaled by ``num_experts ** 2``.
    """
    expert_count = tf.shape(expert_mask)[-1]
    # Fraction of the tokens that was routed to each expert.
    routed_fraction = tf.reduce_mean(expert_mask, axis=0)
    # Fraction of the router probability mass that was given to each expert.
    probability_fraction = tf.reduce_mean(router_probs, axis=0)
    # Both vectors should be uniform (1 / num_experts per expert); minimising
    # their product pushes them towards that uniform allocation.
    mean_product = tf.reduce_mean(probability_fraction * routed_fraction)
    scale = tf.cast((expert_count ** 2), tf.dtypes.float32)
    return mean_product * scale

### Implement the router as a layer

In [None]:
class Router(layers.Layer):
    """Top-1 token router of a Switch (mixture-of-experts) layer.

    Scores every token against every expert, selects the highest-probability
    expert per token, registers the load-balancing auxiliary loss and builds
    the tensors used to dispatch tokens to the experts and to combine the
    expert outputs again.

    Args:
        num_experts: Number of experts to route between.
        expert_capacity: Number of token slots available per expert.
    """

    def __init__(self, num_experts, expert_capacity):
        # Initialise the Keras base class before attaching attributes and
        # sub-layers to the instance.
        super().__init__()
        self.num_experts = num_experts
        self.route = layers.Dense(units=num_experts)
        self.expert_capacity = expert_capacity

    def call(self, inputs, training=False):
        """Route the tokens in ``inputs`` to experts.

        Args:
            inputs: Token representations with the tokens on axis 0
                (the Switch layer passes [tokens_per_batch, embed_dim]).
            training: When true, noise is added to the router logits.

        Returns:
            ``(dispatch_tensor, combined_tensor)``, both of shape
            [tokens_per_batch, num_experts, expert_capacity]. The dispatch
            tensor is 1 where a token occupies an expert slot and 0 elsewhere;
            the combine tensor holds the router probability at those entries.
        """
        # router_logits shape: [tokens_per_batch, num_experts]
        router_logits = self.route(inputs)

        if training:
            # Add noise for exploration across experts. The dynamic shape is
            # used because the static shape can have an unknown token
            # dimension, which `tf.random.uniform` cannot accept.
            router_logits += tf.random.uniform(
                shape=tf.shape(router_logits), minval=0.9, maxval=1.1
            )
        # Probabilities for each token of what expert it should be sent to.
        router_probs = keras.activations.softmax(router_logits, axis=-1)
        # Get the top-1 expert for each token. expert_gate is the top-1
        # probability and expert_index the chosen expert; because k=1 both
        # have shape [tokens_per_batch, 1].
        expert_gate, expert_index = tf.math.top_k(router_probs, k=1)
        # expert_mask shape: [tokens_per_batch, 1, num_experts]
        expert_mask = tf.one_hot(expert_index, depth=self.num_experts)
        # Compute the load balancing loss and register it on the layer.
        aux_loss = load_balanced_loss(router_probs, expert_mask)
        self.add_loss(aux_loss)
        # Experts have a fixed capacity, ensure we do not exceed it. The
        # running count along the token axis gives every token its position
        # within the expert it was routed to (0 for all other experts).
        # NOTE(review): the cumulative sum is inclusive, so positions start at
        # 1; together with the strict `less` below, slot 0 of every expert is
        # never filled and only expert_capacity - 1 tokens fit per expert.
        # Confirm whether `exclusive=True` was intended.
        position_in_expert = tf.cast(
            tf.math.cumsum(expert_mask, axis=0) * expert_mask, tf.dtypes.int32
        )
        # Keep only tokens that fit within expert capacity.
        expert_mask *= tf.cast(
            tf.math.less(position_in_expert, self.expert_capacity),
            tf.dtypes.float32,
        )
        # 1 for tokens that kept their slot, 0 for dropped tokens;
        # shape [tokens_per_batch, 1].
        expert_mask_flat = tf.reduce_sum(expert_mask, axis=-1)
        # Zero the gate of tokens whose expert has overflowed its capacity.
        expert_gate *= expert_mask_flat
        # Combine expert outputs and scaling with router probability.
        # combined_tensor shape: [tokens_per_batch, num_experts, expert_capacity]
        combined_tensor = tf.expand_dims(
            expert_gate
            * expert_mask_flat
            * tf.squeeze(tf.one_hot(expert_index, depth=self.num_experts), 1),
            -1,
        ) * tf.squeeze(tf.one_hot(position_in_expert, depth=self.expert_capacity), 1)
        # Create the binary dispatch_tensor
        # [tokens_per_batch, num_experts, expert_capacity] that is 1 if the
        # token gets routed to the corresponding expert slot. Going through
        # bool turns every non-zero gate value into exactly 1, so the expert
        # inputs are not scaled by the router probability (the scaling is
        # applied once, through combined_tensor).
        dispatch_tensor = tf.cast(
            tf.cast(combined_tensor, tf.dtypes.bool), tf.dtypes.float32
        )

        return dispatch_tensor, combined_tensor

###  Switch layer

In [None]:
class Switch(layers.Layer):
    """Mixture-of-experts feedforward layer with top-1 (switch) routing.

    Args:
        num_experts: Number of expert feedforward networks.
        embed_dim: Size of the token representations; also the width of the
            expert networks.
        num_tokens_per_batch: Number of tokens per batch used to size the
            capacity of each expert.
        capacity_factor: Multiplier applied to the even share
            ``num_tokens_per_batch // num_experts`` to obtain the number of
            token slots per expert. The default of 1 keeps the even share.
    """

    def __init__(self, num_experts, embed_dim, num_tokens_per_batch, capacity_factor=1):
        # Initialise the Keras base class before attaching attributes and
        # sub-layers to the instance.
        super().__init__()
        self.num_experts = num_experts
        self.embed_dim = embed_dim
        self.experts = [
            create_feedforward_network(embed_dim) for _ in range(num_experts)
        ]
        self.num_tokens_per_batch = num_tokens_per_batch
        self.capacity_factor = capacity_factor
        # Slots per expert; `int` keeps the capacity usable as a tensor
        # dimension when a fractional capacity factor is given.
        self.expert_capacity = int(
            (num_tokens_per_batch // self.num_experts) * capacity_factor
        )
        self.router = Router(self.num_experts, self.expert_capacity)

    def call(self, inputs):
        """Apply the routed experts to ``inputs``.

        Args:
            inputs: Tensor of shape
                [batch_size, num_tokens_per_example, embed_dim].

        Returns:
            Tensor with the same shape as ``inputs``. Tokens that did not get
            an expert slot contribute zeros.
        """
        batch_size = tf.shape(inputs)[0]
        num_tokens_per_example = tf.shape(inputs)[1]

        # Flatten to one row per token: [tokens_per_batch, embed_dim]. The
        # token count is taken from the data, so it may differ from
        # self.num_tokens_per_batch; the expert capacity stays fixed.
        inputs = tf.reshape(inputs, [-1, self.embed_dim])
        # dispatch_tensor shape: [tokens_per_batch, num_experts, expert_capacity]
        # combine_tensor shape: [tokens_per_batch, num_experts, expert_capacity]
        dispatch_tensor, combine_tensor = self.router(inputs)
        # expert_inputs shape: [num_experts, expert_capacity, embed_dim]
        expert_inputs = tf.einsum("ab,acd->cdb", inputs, dispatch_tensor)
        expert_inputs = tf.reshape(
            expert_inputs, [self.num_experts, self.expert_capacity, self.embed_dim]
        )
        # Dispatch to experts: every expert processes its own slice.
        expert_input_list = tf.unstack(expert_inputs, axis=0)
        expert_output_list = [
            self.experts[idx](expert_input)
            for idx, expert_input in enumerate(expert_input_list)
        ]
        # expert_outputs shape: [expert_capacity, num_experts, embed_dim]
        expert_outputs = tf.stack(expert_output_list, axis=1)
        # expert_outputs_combined shape: [tokens_per_batch, embed_dim]
        expert_outputs_combined = tf.einsum(
            "abc,xba->xc", expert_outputs, combine_tensor
        )
        # output shape: [batch_size, num_tokens_per_example, embed_dim]
        outputs = tf.reshape(
            expert_outputs_combined,
            [batch_size, num_tokens_per_example, self.embed_dim],
        )
        return outputs

### Transformer Block

In [None]:
class TransformerBlock(layers.Layer):
    """Self-attention block followed by a feedforward (or switch) layer.

    Both sub-layers are wrapped with dropout, a residual connection and layer
    normalisation.

    Args:
        embed_dim: Passed to ``MultiHeadAttention`` as ``key_dim``.
        num_heads: Number of attention heads.
        ffn: Layer applied after the attention step; either a standard
            feedforward network or a switch layer with a mixture of experts.
        dropout_rate: Dropout rate used after the attention and the ffn step.
    """

    def __init__(self, embed_dim, num_heads, ffn, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # The ffn can be either a standard feedforward network or a switch
        # layer with a Mixture of Experts.
        self.ffn = ffn
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Sequence of token representations; used as both query and
                value of the self-attention.
            training: Forwarded to the dropout layers. It defaults to ``None``
                so the block can also be called without an explicit training
                flag, instead of failing on a missing positional argument.

        Returns:
            The transformed sequence after the second layer normalisation.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # First residual connection + normalisation.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # Second residual connection + normalisation.
        return self.layernorm2(out1 + ffn_output)

### Text Classification Model

In [None]:
def get_model(config):
    """Build the (uncompiled) Switch Transformer tweet classifier.

    Args:
        config: Object providing ``num_experts``, ``embed_dim``,
            ``num_tokens_per_batch``, ``ff_dim``, ``num_heads``,
            ``num_tokens_per_example``, ``vocab_size`` and ``dropout_rate``.

    Returns:
        A ``keras.Model`` that maps a sequence of token ids to a single
        sigmoid output.
    """
    moe_layer = Switch(config.num_experts, config.embed_dim, config.num_tokens_per_batch)
    # Note: ``config.ff_dim`` is passed as the block's ``embed_dim`` argument,
    # which the block forwards to the attention layer as ``key_dim``.
    encoder_block = TransformerBlock(config.ff_dim, config.num_heads, moe_layer)

    token_ids = layers.Input(shape=(config.num_tokens_per_example,))
    embedder = TokenAndPositionEmbedding(
        config.num_tokens_per_example, config.vocab_size, config.embed_dim
    )
    hidden = embedder(token_ids)
    hidden = encoder_block(hidden)
    # Average over the token axis to get one vector per example.
    hidden = layers.GlobalAveragePooling1D()(hidden)
    hidden = layers.Dropout(config.dropout_rate)(hidden)
    hidden = layers.Dense(config.ff_dim, activation="relu")(hidden)
    hidden = layers.Dropout(config.dropout_rate)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    return keras.Model(inputs=token_ids, outputs=probability)

### Calculate Vocabulary Size

In [None]:
# Size the token embedding from the fitted tokenizer: one entry per known
# word, plus one extra entry.
config.vocab_size = len(new_tokenizer.index_word) + 1
print(config.vocab_size)

### Visualize the architecture of the model

In [None]:
# Build one model instance and draw its layer graph with tensor shapes.
model = get_model(config)
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

### KFold Training

In [None]:
# Train one model per stratified fold, restore the weights with the best
# validation accuracy, report the validation metrics and keep every fold's
# model for the ensemble used at submission time.
models = []
tf.keras.backend.clear_session()
fold_splitter = StratifiedKFold(5, shuffle=True, random_state=42)
for index, (train_indices, val_indices) in enumerate(fold_splitter.split(x_train, y_train)):
    print("Fold %d" %(index))
    # Trim both splits to a whole number of batches (presumably so that every
    # batch holds exactly the number of tokens the Switch layer's expert
    # capacity was sized for - confirm).
    train_idx = len(train_indices) // config.batch_size * config.batch_size
    train_indices = train_indices[:train_idx]
    val_idx = len(val_indices) // config.batch_size * config.batch_size
    val_indices = val_indices[:val_idx]
    # The fold indices are positions, so the target Series is indexed with
    # `.iloc`; plain `[]` would look the values up by index label instead.
    train_features, train_targets = x_train[train_indices], y_train.iloc[train_indices]
    validation_features, validation_targets = x_train[val_indices], y_train.iloc[val_indices]
    model_checkpoint_path = "model%d.h5"%(index)
    model = get_model(config)
    loss = "binary_crossentropy"
    # Weight each class by the frequency of the other one, so the less
    # frequent class gets the larger weight.
    positive_rate = train_targets.mean()
    class_weight = {0: positive_rate, 1: 1 - positive_rate}
    adam = tf.keras.optimizers.Adam(config.learning_rate)
    model.compile(loss=loss, optimizer=adam, metrics=["accuracy"])
    early_stop = tf.keras.callbacks.EarlyStopping(patience=5)
    # This callback was created but never passed to `fit`; it is now
    # registered so the learning rate is actually reduced on plateaus.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(patience=2)
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(model_checkpoint_path, monitor="val_accuracy", save_best_only=True, save_weights_only=True)
    history = model.fit(train_features, train_targets,
                        validation_data=(validation_features, validation_targets),
                        batch_size=config.batch_size, epochs=100,
                        callbacks=[early_stop, reduce_lr, model_checkpoint],
                        class_weight=class_weight
                       )
    pd.DataFrame(history.history).plot(kind="line")
    plt.title("Performance of Fold %d"%(index))
    plt.show()
    # Restore the best checkpoint of this fold before evaluating it.
    model.load_weights(model_checkpoint_path)
    y_val_pred = np.array(model.predict(validation_features) > 0.5, dtype="int").reshape(-1)
    cm = confusion_matrix(validation_targets, y_val_pred)
    sns.heatmap(cm)
    plt.show()
    print("Classification Report: \n")
    print(classification_report(validation_targets, y_val_pred))
    acc_score = accuracy_score(validation_targets, y_val_pred)
    print("Accuracy Score: %.2f"%(acc_score))
    models.append(model)

## Submission

In [None]:
# Ensemble: average the predicted probabilities of all fold models, then
# threshold the average at 0.5 to get the class labels.
y_test = np.mean([model.predict(x_test).reshape(-1) for model in models], axis=0)
y_test = np.array(y_test > 0.5, dtype=int)
# Write the submission file from the test ids and the predicted labels.
submission = pd.DataFrame({"id": test["id"], "target": y_test})
submission.to_csv(config.submit_filename, index=False)
if not config.is_kaggle_platform:
  # Outside Kaggle, submit the file through the Kaggle CLI (IPython shell
  # magic; only works inside a notebook).
  !kaggle competitions submit $config.dataset_name -m "Submission" -f $config.submit_filename

## References

* [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961)
* [Text classification with Switch Transformer](https://keras.io/examples/nlp/text_classification_with_switch_transformer/)