In [None]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'



In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Record the library versions this run used (output format kept as-is).
print(f"TensorFlow version: {tf.__version__} \nKerasNLP version: {keras_nlp.__version__}")

Using TensorFlow backend
TensorFlow version: 2.16.1 
KerasNLP version: 0.12.0


Load the Disaster Tweets dataset. Columns: **id, keyword, location, text, target** (the test set has no `target` column).

In [None]:
# Load the train and test CSVs (Colab-style /content paths).
df_train = pd.read_csv('/content/train.csv')
df_test = pd.read_csv('/content/test.csv')

# Report size and memory footprint of each split. The test-set shape line
# must read from df_test; it previously printed df_train's shape again.
print('Training Set Shape = {}'.format(df_train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(df_train.memory_usage().sum() / 1024**2))
print('Test set shape {}'.format(df_test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(df_test.memory_usage().sum() / 1024**2 ))

Training Set Shape = (7613, 5)
Training Set Memory Usage = 0.29 MB
Test set shape (7613, 5)
Test Set Memory Usage = 0.10 MB


In [None]:
# Preview the first rows of the training frame (text plus the target label).
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Preview the first rows of the test frame (same columns, without target).
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


Explore the DataFrames

In [None]:
# Character count of every tweet, stored as a new "length" column on each frame.
df_train["length"] = df_train["text"].map(len)
df_test["length"] = df_test["text"].map(len)

# Summary statistics of tweet length for both splits, separated by a rule.
print(f"Training Length Stat {df_train['length'].describe()}")
print("-" * 55)
print(f"Test Length Stat {df_test['length'].describe()} \n")

Training Length Stat count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: length, dtype: float64
-------------------------------------------------------
Test Length Stat count    3263.000000
mean      102.108183
std        33.972158
min         5.000000
25%        78.000000
50%       109.000000
75%       134.000000
max       151.000000
Name: length, dtype: float64 



Preprocess the data

In [None]:
# Training configuration: everything a reader might want to tune.
BATCH_SIZE = 32
NUM_TRAINING_EXAMPLES = df_train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
# Number of full batches in the training portion. Truncate to int *before*
# the floor division so the result is an integer step count, not a float.
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES * TRAIN_SPLIT) // BATCH_SIZE

EPOCHS = 2
# tf.data.AUTOTUNE is the stable name for the former experimental alias.
AUTO = tf.data.AUTOTUNE

In [None]:
from sklearn.model_selection import train_test_split

# Model input is the raw tweet text; the label is the "target" column.
X, y = df_train["text"], df_train["target"]

# Hold out VAL_SPLIT of the labelled data for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_SPLIT,
    random_state=42,
)

# Unlabelled tweets to predict for the submission.
X_test = df_test["text"]

## Loading the DistilBERT model from KerasNLP
Text inputs need to be transformed into numeric token ids and arranged in several tensors before being fed to the BERT model.

In [None]:
# Name of the pretrained DistilBERT preset shared by preprocessor and classifier.
preset = "distil_bert_base_en_uncased"

# Preprocessor with a short sequence length of 160 tokens.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length=160,
    name="preprocessor_4_tweets",
)

# Pretrained classifier with a two-class head, wired to the preprocessor above
# so it can be fed raw strings.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    preprocessor=preprocessor,
    num_classes=2,
)

classifier.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/metadata.json...
100%|██████████| 140/140 [00:00<00:00, 282kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/preprocessor.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/tokenizer.json...
100%|██████████| 580/580 [00:00<00:00, 406kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/assets/tokenizer/vocabulary.txt...
100%|██████████| 226k/226k [00:00<00:00, 362kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/task.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/config.json...
100%|██████████| 515/515 [00:00<0

Train our own model by fine-tuning the pretrained DistilBERT model

In [None]:
from tensorflow.keras.optimizers import Adam

# Compile: the head emits one logit per class and the labels are integers,
# hence sparse categorical cross-entropy with from_logits=True.
classifier.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Fit on the training split and evaluate on the held-out validation split
# after every epoch.
history = classifier.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)


Epoch 1/2
[1m 92/191[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m1:03:51[0m 39s/step - accuracy: 0.6725 - loss: 0.6172

In [None]:
def ConfusionMatrix(y_true, y_pred, dataset):
  """Plot a confusion matrix and put the F1 score in its title.

  Args:
    y_true: Ground-truth class labels (0 = not disaster, 1 = disaster).
    y_pred: 2-D array of per-class scores; the predicted class is the
      argmax along axis 1.
    dataset: Name of the split being evaluated (e.g. "Training"), used
      only in the plot title.
  """
  # Collapse per-class scores to hard labels once and reuse them.
  y_pred_labels = np.argmax(y_pred, axis=1)

  # Pin the label set so both the plot and the raw matrix are always 2x2,
  # even if one class is missing from y_true / y_pred_labels.
  disp = ConfusionMatrixDisplay.from_predictions(
      y_true,
      y_pred_labels,
      labels=[0, 1],
      display_labels=["Not Disaster", "Disaster"],
      cmap=plt.cm.Blues,
  )

  tn, fp, fn, tp = confusion_matrix(y_true, y_pred_labels, labels=[0, 1]).ravel()

  # F1 = tp / (tp + (fn + fp) / 2); report 0.0 when there are no positives
  # at all instead of dividing by zero.
  denominator = tp + (fn + fp) / 2
  f1_score = float(tp / denominator) if denominator else 0.0

  disp.ax_.set_title(
      "Confusion Matrix on " + dataset + " Dataset -- F1 Score: " + str(round(f1_score, 2))
  )

In [None]:
# Score the training split and plot its confusion matrix.
y_pred_train = classifier.predict(X_train)
ConfusionMatrix(y_true=y_train, y_pred=y_pred_train, dataset="Training")

In [None]:
# Score the held-out validation split and plot its confusion matrix.
y_pred_val = classifier.predict(X_val)
ConfusionMatrix(y_true=y_val, y_pred=y_pred_val, dataset="Validation")

Generate the submission file

In [None]:
# Load the sample submission file; its "target" column is overwritten with
# model predictions in a later cell.
submission_sample = pd.read_csv('/content/sample_submission.csv')
submission_sample.head()

In [None]:
# Predicted class = index of the highest score per test tweet.
# NOTE(review): assumes sample_submission rows are in the same order as df_test - confirm.
submission_sample["target"] = classifier.predict(X_test).argmax(axis=1)

In [None]:
# Summary statistics of the submission frame as a quick sanity check.
submission_sample.describe()

In [None]:
# Write the predictions to CSV without the pandas index column.
submission_sample.to_csv("submission_preds.csv", index=False)