# Disaster Tweets Classification: TF-IDF

## Table of Contents
- [1. Overview](#1.)
- [2. Import Packages and Datasets](#2.)
- [3. Data Wrangling](#3.)
- [4. Exploratory Data Analysis & Data Preprocessing](#4.)
- [5. Model Development](#5.)
- [6. Submission](#6.)

<a id="1."></a>
## 1. Overview
In this notebook I will build a Disaster Tweets Classification Model using TF-IDF vectorization in Keras.

<a id="2."></a>
## 2. Import Packages and Datasets 

In [None]:
!pip install -q tensorflow==2.7.0

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the training CSV from the Kaggle input directory and preview its first rows.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
train.head()

In [None]:
# Dimensions of the training frame.
train.shape

In [None]:
# Load the test CSV from the Kaggle input directory and preview its first rows.
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test.head()

In [None]:
# Frequency of each distinct `location` value in the training set.
train.location.value_counts()

<a id="3."></a>
## 3. Data Wrangling
Let's check how many null values each column contains.

In [None]:
# Number of missing values per column in the training set.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test set.
test.isnull().sum()

In [None]:
# Replace missing `keyword` / `location` values with empty strings so the
# columns can later be concatenated with the tweet text.
#
# The result is assigned back to the column instead of calling an `inplace`
# method on a column selection: that chained form operates on an intermediate
# object and does not update the parent frame under pandas Copy-on-Write.
# `fillna("")` also avoids the `np.NAN` alias, which NumPy 2.0 removed.
for frame in (train, test):
    for column in ("keyword", "location"):
        frame[column] = frame[column].fillna("")

In [None]:
# Re-count missing values in the training set after the replacement above.
train.isnull().sum()

In [None]:
# Re-count missing values in the test set after the replacement above.
test.isnull().sum()

In [None]:
# Number of entries in the test set's `keyword` column.
len(test["keyword"])

<a id="4."></a>
## 4. Exploratory Data Analysis & Data Preprocessing
- Tokenize Texts
- Show statistical information about the texts

In [None]:
# Build one lower-cased "keyword text location" string per tweet.
# Training rows come first, followed by test rows, so the list can be split
# back by position afterwards.
contents = []
for data in (train, test):
    # Walk the three columns in lockstep instead of doing a positional
    # `iloc` lookup for every row; the resulting strings are the same.
    columns = zip(data["keyword"], data["text"], data["location"])
    contents.extend(
        (keyword + " " + text + " " + location).lower()
        for keyword, text, location in columns
    )

In [None]:
# `contents` holds the training rows first, then the test rows; cut it at the
# training-set length to recover the two parts.
n_train = len(train)
train_contents, test_contents = contents[:n_train], contents[n_train:]

In [None]:
# Store the combined, lower-cased text of each row in a new "X" column.
train["X"] = train_contents
test["X"] = test_contents

### TF-IDF Vectorization

In [None]:
vocab_size = 10000
# Vectorize text into TF-IDF weighted unigram + bigram features, capped at
# `vocab_size` tokens. "tf_idf" is the documented spelling of this output
# mode; the hyphenated "tf-idf" is a legacy name.
text_vectorizer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="tf_idf",
    ngrams=2,
)

# Index the n-grams and learn the TF-IDF weights via `adapt()`.
# NOTE: `contents` contains both the training and the test texts.
with tf.device("CPU"):
    # A bug that prevents this from running on GPU for now.
    text_vectorizer.adapt(contents)

<a id="5."></a>
## 5. Model Development

### BinaryCrossEntropy with weights 
Use this class-weighted version of binary cross-entropy to address the class imbalance problem.

In [None]:
class BinaryCrossEntropy(tf.keras.losses.Loss):
    """Binary cross-entropy with a separate weight for each class.

    Given ``postive_rate`` = p, the log-likelihood term of samples labelled 1
    is scaled by ``1 - p`` and the term of samples labelled 0 is scaled by
    ``p``. With the default ``p = 0.5`` both classes are weighted equally.

    Args:
        postive_rate: Value ``p`` used to derive the two class weights.
            (The misspelled parameter name is kept for compatibility with
            existing callers.)
    """

    def __init__(self, postive_rate=0.5):
        super().__init__()
        self.negative_weights = postive_rate
        self.positive_weights = 1 - postive_rate

    def call(self, y_true, y_pred):
        """Return the element-wise weighted cross-entropy.

        Args:
            y_true: Ground-truth labels; cast to the dtype of ``y_pred``.
            y_pred: Predicted probabilities.

        Returns:
            A tensor of per-element losses (not reduced here).
        """
        y_true = tf.cast(y_true, y_pred.dtype)
        # Epsilon keeps the logarithm finite when a prediction is exactly 0 or 1.
        eps = tf.keras.backend.epsilon()
        pos = self.positive_weights * y_true * tf.math.log(y_pred + eps)
        neg = self.negative_weights * (1.0 - y_true) * tf.math.log(1.0 - y_pred + eps)
        return -(pos + neg)

### Text Classification Model

In [None]:
def get_model():
    """Build the classifier: raw strings -> TF-IDF vector -> small dense network.

    The shared, already adapted ``text_vectorizer`` layer is the first stage,
    followed by a 32-unit ReLU layer, 50% dropout, and a single sigmoid output.

    Returns:
        An uncompiled ``keras.Model`` mapping string inputs to one probability.
    """
    text_input = layers.Input(shape=(None, ), dtype="string")
    features = text_vectorizer(text_input)
    hidden = layers.Dense(32, activation="relu")(features)
    hidden = layers.Dropout(0.5)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)
    return keras.Model(inputs=text_input, outputs=probability)

In [None]:
# Build one model instance and render its layer graph with tensor shapes.
model = get_model()
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Stratified 5-fold cross-validation: fit one model per fold, restore its best
# checkpoint, report validation metrics, and keep the model for ensembling.
models = []
tf.keras.backend.clear_session()
folds = StratifiedKFold(5, shuffle=True, random_state=42).split(train, train["target"])
for index, (train_indices, val_indices) in enumerate(folds, start=1):
    print(f"Fold {index}")
    train_fold, val_fold = train.iloc[train_indices], train.iloc[val_indices]
    train_features, train_targets = train_fold["X"], train_fold["target"]
    validation_features, validation_targets = val_fold["X"], val_fold["target"]
    model_checkpoint_path = f"model{index}.tf"

    model = get_model()
    # The loss derives its class weights from this fold's mean training target.
    loss = BinaryCrossEntropy(train_targets.mean())
    adam = tf.keras.optimizers.Adam(3e-4)
    model.compile(loss=loss, optimizer=adam, metrics=["accuracy"])

    early_stop = tf.keras.callbacks.EarlyStopping(patience=10)
    # This callback used to be created but never passed to `fit`, so it had
    # no effect; it is now registered with the other callbacks.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(patience=2)
    # Only the weights of the epoch with the best validation accuracy are kept.
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        model_checkpoint_path,
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=True,
    )
    history = model.fit(
        train_features,
        train_targets,
        validation_data=(validation_features, validation_targets),
        epochs=100,
        callbacks=[early_stop, reduce_lr, model_checkpoint],
    )

    # ReduceLROnPlateau adds the learning rate to the logs; leave it out of the
    # chart so only the loss/accuracy curves are drawn.
    curves = pd.DataFrame(history.history).drop(
        columns=["lr", "learning_rate"], errors="ignore"
    )
    curves.plot(kind="line")
    plt.title(f"Performance of Fold {index}")
    plt.show()

    # Evaluate with the best checkpoint rather than the last-epoch weights.
    model.load_weights(model_checkpoint_path)
    y_val_pred = np.array(model.predict(validation_features) > 0.5, dtype="int").reshape(-1)
    cm = confusion_matrix(validation_targets, y_val_pred)
    # fmt="d" prints the cell counts as plain integers.
    sns.heatmap(cm, annot=True, fmt="d")
    plt.show()
    print("Classification Report: \n")
    print(classification_report(validation_targets, y_val_pred))
    acc_score = accuracy_score(validation_targets, y_val_pred)
    print(f"Accuracy Score: {acc_score:.2f}")
    models.append(model)

<a id="6."></a>
## 6. Submission

In [None]:
# Ensemble the fold models: average their predicted probabilities per test row,
# threshold at 0.5, and write the labels in the submission format.
fold_probabilities = np.asarray(
    [fold_model.predict(test["X"]).reshape(-1) for fold_model in models]
)
y_test = (fold_probabilities.mean(axis=0) > 0.5).astype(int)
submission = pd.DataFrame({"id": test["id"], "target": y_test})
submission.to_csv("submission.csv", index=False)