### Please upvote if you find this notebook useful.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# **Data Analysis**

*Train Data*

In [None]:
# Location of the labelled training split inside the competition input folder.
train_csv_path = "../input/nlp-getting-started/train.csv"

# Load the CSV into a DataFrame and show it as the cell output.
train_data = pd.read_csv(filepath_or_buffer = train_csv_path)
train_data

*Test Data*

In [None]:
# Location of the test split inside the competition input folder.
test_csv_path = "../input/nlp-getting-started/test.csv"

# Load the CSV into a DataFrame and show it as the cell output.
test_data = pd.read_csv(filepath_or_buffer = test_csv_path)
test_data

*Plotting % missing values*

In [None]:
# Columns inspected for missing values.
missing_cols = ["keyword", "location"]

# Percentage of NaN entries per column: the mean of the boolean NaN mask is the
# NaN fraction, so this equals count_of_nans / number_of_rows * 100.
train_missing_vals = [train_data[col].isna().mean() * 100 for col in missing_cols]
test_missing_vals = [test_data[col].isna().mean() * 100 for col in missing_cols]

# One panel per split. Both panels go through the same code path via the
# explicit Axes interface instead of two copy-pasted pyplot sections.
fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 4))
panels = zip(axes, ("training", "test"), (train_missing_vals, test_missing_vals))
for ax, split_name, missing_vals in panels:
    ax.bar(missing_cols, missing_vals)
    ax.set_title(f"% missing values in {split_name} set")
    ax.set_xlabel("Column")
    ax.set_ylabel("Missing values %")

# Render the figure without leaving a matplotlib object repr as cell output.
plt.show()

# **Transformer Model**

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import TFAutoModel, AutoTokenizer

*Helper Functions and Classes*

In [None]:
def tokenize_dataset(tokenizer, max_length = 84):
    """
    Returns a dict of the train and test datasets based on the given tokenizer.

    The texts are read from the notebook-level `train_data` and `test_data`
    DataFrames (their "text" column); the labels come from `train_data["target"]`.

    Args:
        tokenizer: Callable tokenizer; it is invoked with a list of strings and
            the keyword arguments `padding`, `max_length`, `truncation` and
            `return_tensors`, and its result must expose a `.data` attribute.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 84, the value previously hard-coded for both splits.

    Returns:
        {
            "train": {"data": <tokenizer output .data>, "labels": <target values>},
            "test":  {"data": <tokenizer output .data>},
        }
    """
    def encode(texts):
        # Single code path so both splits are padded and truncated identically.
        return tokenizer(
            list(texts.values),
            padding = "max_length",
            max_length = max_length,
            truncation = True,
            return_tensors = "tf",
        ).data

    return {
        "train": {
            "data": encode(train_data["text"]),
            "labels": train_data["target"].values,
        },
        "test": {
            "data": encode(test_data["text"]),
        },
    }

In [None]:
class disasterClassificationModel(tf.keras.Model):
    """
    Adds a classification head to the transformers base model.

    The head flattens the base model's `last_hidden_state` and applies two
    Dropout -> Dense -> BatchNormalization -> ReLU stages (768 and 32 units),
    followed by Dropout and a single sigmoid unit.

    Args:
        checkpoint: Identifier passed to `TFAutoModel.from_pretrained`.
        dropout_rate: Rate shared by the three Dropout layers. Defaults to 0.7,
            the value that was previously hard-coded.
    """
    def __init__(self, checkpoint, dropout_rate = 0.7):
        super(disasterClassificationModel, self).__init__()
        self.dropout_rate = dropout_rate

        self.base_model = TFAutoModel.from_pretrained(checkpoint)
        # NOTE: flattening the per-token hidden states ties the width of
        # `dense1`'s input to the padded sequence length of the inputs.
        self.flatten = layers.Flatten()

        self.dropout1 = layers.Dropout(rate = self.dropout_rate)
        self.dense1 = layers.Dense(units = 768, kernel_regularizer = "l1_l2")
        self.batchNorm1 = layers.BatchNormalization()
        self.activation1 = layers.Activation("relu")

        self.dropout2 = layers.Dropout(rate = self.dropout_rate)
        self.dense2 = layers.Dense(units = 32, kernel_regularizer = "l1_l2")
        self.batchNorm2 = layers.BatchNormalization()
        self.activation2 = layers.Activation("relu")

        self.dropout3 = layers.Dropout(rate = self.dropout_rate)
        self.dense3 = layers.Dense(units = 1, activation = "sigmoid")

    def call(self, inputs, training = False):
        """
        Runs the base model and the classification head.

        Args:
            inputs: Tokenized inputs forwarded unchanged to the base model.
            training: Training-mode flag. It is handed to every layer whose
                behaviour depends on it rather than being branched on in
                Python, so it may also be `None` or a tensor.

        Returns:
            Output of the final single-unit sigmoid layer.
        """
        x = self.base_model(inputs, training = training).last_hidden_state
        x = self.flatten(x)

        # Dropout is a no-op outside training, so no Python-level `if` is needed.
        x = self.dropout1(x, training = training)
        x = self.dense1(x)
        x = self.batchNorm1(x, training = training)
        x = self.activation1(x)

        x = self.dropout2(x, training = training)
        x = self.dense2(x)
        x = self.batchNorm2(x, training = training)
        x = self.activation2(x)

        x = self.dropout3(x, training = training)
        x = self.dense3(x)
        return x

*Custom F1 Score Metric*

In [None]:
class F1_score(tf.keras.metrics.Metric):
    """
    F1 score metric based on Keras Precision and Recall metrics.

    The score is the harmonic mean of the two wrapped metrics,
    2 * precision * recall / (precision + recall), and is reported as 0 when
    precision + recall is 0.
    """
    def __init__(self, name = "f1_score", **kwargs):
        super(F1_score, self).__init__(name = name, **kwargs)

        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight = None):
        """Forwards the batch to both wrapped metrics."""
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    @staticmethod
    def _reset_metric(metric):
        """Resets `metric` via `reset_state`, falling back to the older `reset_states`."""
        reset = getattr(metric, "reset_state", None)
        if reset is None:
            reset = metric.reset_states
        reset()

    def reset_state(self):
        """Clears the accumulated state of both wrapped metrics."""
        self._reset_metric(self.precision)
        self._reset_metric(self.recall)

    def reset_states(self):
        """Alias kept for Keras versions that still call the older method name."""
        self.reset_state()

    def result(self):
        """Returns the F1 score computed from the current precision and recall."""
        precision = self.precision.result()
        recall = self.recall.result()
        # divide_no_nan yields 0 for a zero denominator, so a zero precision
        # and/or recall gives 0 without going through inf arithmetic.
        return tf.math.divide_no_nan(2 * precision * recall, precision + recall)

In [None]:
def compile_model(model, batch_size, epochs, tokenized_dataset, validation_split = 0.0):
    """
    Compiles the given model by adding a learning rate scheduler to the Adam optimizer.

    The learning rate decays linearly from 5e-5 to 0 over the total number of
    optimizer steps, computed from the training set size, `batch_size` and
    `epochs`. The model is compiled with binary cross-entropy loss and the
    "accuracy" and `F1_score` metrics.

    Args:
        model: Keras model to compile (compiled in place; nothing is returned).
        batch_size: Batch size that will be used for training.
        epochs: Number of epochs that will be used for training.
        tokenized_dataset: Dict whose ["train"]["data"]["input_ids"] entry has
            one row per training example.
        validation_split: Fraction of the training data that the later
            `fit` call holds out for validation. Pass the same value given to
            `fit` so the schedule reaches its end value on the last step.
            Defaults to 0.0 (every example is assumed to be trained on).

    Raises:
        ValueError: If `validation_split` is not in the range [0, 1).
    """
    import math  # Local import: only this function needs it.

    if not 0.0 <= validation_split < 1.0:
        raise ValueError("validation_split must be in the range [0, 1)")

    num_examples = len(tokenized_dataset["train"]["data"]["input_ids"])
    # Examples that remain for training after the validation hold-out.
    num_train_examples = int(math.floor(num_examples * (1.0 - validation_split)))
    # Ceil rather than floor: a final partial batch is still one optimizer step.
    steps_per_epoch = math.ceil(num_train_examples / batch_size)
    # Guard against a zero-length schedule (e.g. an empty dataset).
    train_steps = max(steps_per_epoch * epochs, 1)

    # Learning rate scheduler to linearly reduce the learning rate from an initial value to an end value
    lr_scheduler = PolynomialDecay(
        initial_learning_rate = 5e-5,
        end_learning_rate = 0,
        decay_steps = train_steps,
    )

    optimizer = Adam(learning_rate = lr_scheduler)
    loss = BinaryCrossentropy()

    model.compile(loss = loss, optimizer = optimizer, metrics = ["accuracy", F1_score()])

**RoBERTa base model**

*This particular checkpoint is a RoBERTa-base model that was pre-trained on ~58M tweets and then fine-tuned for sentiment analysis.*

In [None]:
# The same checkpoint name is used to load both the tokenizer and the base model.
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"

# Tokenizer first, then the tokenized train/test splits built with it.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_dataset = tokenize_dataset(tokenizer)

# Build the classifier on top of the checkpoint and compile it.
model = disasterClassificationModel(checkpoint)
compile_model(
    model,
    batch_size = 10,
    epochs = 10,
    tokenized_dataset = tokenized_dataset,
)

In [None]:
# Train on the tokenized training split, holding out 10% of it for validation.
train_split = tokenized_dataset["train"]
history = model.fit(
    train_split["data"],
    train_split["labels"],
    batch_size = 10,
    epochs = 10,
    validation_split = 0.1,
)

*Plotting model history*

In [None]:
# (key in history.history, panel title / y-axis label) for each tracked metric.
metric_panels = [
    ("loss", "Loss"),
    ("accuracy", "Accuracy"),
    ("f1_score", "F1 Score"),
]

# One panel per metric, each showing the training curve and its "val_" counterpart.
# A single loop replaces three copy-pasted pyplot sections.
fig, axes = plt.subplots(nrows = 1, ncols = len(metric_panels), figsize = (14, 4))
for ax, (metric_key, metric_title) in zip(axes, metric_panels):
    for curve_key in (metric_key, f"val_{metric_key}"):
        ax.plot(history.history[curve_key], label = curve_key)
    ax.set_title(metric_title)
    ax.set_ylabel(metric_title)
    ax.set_xlabel("Epoch")
    ax.legend(loc = "best")

# Render the figure without leaving a matplotlib object repr as cell output.
plt.show()

# **Submission**

In [None]:
# Raw model outputs for the tokenized test split.
pred_probs = model.predict(tokenized_dataset["test"]["data"], verbose = True)

# Outputs at or above 0.5 become class 1, everything else class 0.
predictions = (pred_probs >= 0.5).astype(int)

In [None]:
# Drop the feature columns; whatever remains is written next to the predictions.
submissions = test_data.drop(columns = ["keyword", "location", "text"])

# `predictions` comes from a single-unit output layer, so it can be an (n, 1)
# column vector. Flatten it so the column is assigned from a plain 1-D array.
submissions["target"] = np.ravel(predictions)

submissions.to_csv("submissions.csv", index = False)