<a href="https://colab.research.google.com/github/punsnprotons/deep-learning-projects/blob/main/Using_NLP_to_predict_disaster_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using NLP to predict disaster tweets

Here we will be using the DistilBERT pretrained model from KerasNLP.
The size of the BERT model was reduced by 40% via knowledge distillation during the pre-training phase while it retained 97% of its language understanding abilities also making it 60% faster

#Importing libraries

In [1]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

Collecting keras-core
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting namex (from keras-core)
  Downloading namex-0.0.7-py3-none-any.whl (5.8 kB)
Installing collected packages: namex, keras-core
Successfully installed keras-core-0.1.7 namex-0.0.7
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.5/584.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.0/442.0 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

Using TensorFlow backend


# Loading datasets

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print('Training set shape = {}'.format(df_train.shape))
print('Test set shape = {}'.format(df_test.shape))


Training set shape = (7613, 5)
Test set shape = (3263, 4)


In [4]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## EDA

In [6]:
df_train['length'] = df_train['text'].apply(lambda x: len(x))
df_test['length'] = df_test['text'].apply(lambda x: len(x))

print('Training set statistics')
print(df_train['length'].describe())

Training set statistics
count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: length, dtype: float64


## Preprocessing

In [9]:
BATCH_SIZE = 32
NUM_TRAINING_EXAMPLES = df_train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES*TRAIN_SPLIT//BATCH_SIZE)
EPOCHS = 2
AUTO = tf.data.experimental.AUTOTUNE
RANDOM_STATE = 42

In [10]:
from sklearn.model_selection import train_test_split

X = df_train['text']
y = df_train['target']

X_train,X_val,y_train,y_val = train_test_split(X,y,test_size= VAL_SPLIT, random_state=RANDOM_STATE)

## Loading the DistilBERT model

In [11]:
preset = 'distil_bert_base_en_uncased'
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length = 160,
    name = 'preprocessor_4_tweets'
)

Downloading data from https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en_uncased/v1/vocab.txt
[1m231508/231508[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step       


In [12]:
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    preprocessor = preprocessor,
    num_classes = 2
)
classifier.summary()

Downloading data from https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en_uncased/v1/model.h5
[1m265570304/265570304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


## Training and fine tuning model

In [17]:
classifier.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), #'binary_crossentropy',
    optimizer='adam',
    metrics= ["accuracy"]
)

history = classifier.fit(
    x=X_train,
    y=y_train,
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,
    validation_data = (X_val,y_val)
)

Epoch 1/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5715s[0m 30s/step - accuracy: 0.5708 - loss: 0.7211 - val_accuracy: 0.5739 - val_loss: 0.6829
Epoch 2/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5632s[0m 29s/step - accuracy: 0.5716 - loss: 0.6835 - val_accuracy: 0.5739 - val_loss: 0.6826


In [19]:
def displayConfusionMatrix(y_true, y_pred, dataset):
  disp = ConfusionMatrixDisplay.from_predictions(
      y_true,
      np.argmax(y_pred,axis=1),
      display_labels = ['Not Disaster', 'Disaster'],
      cmap = plt.cm.Blues
  )
  tn,fp,fn,tp = confusion_matrix(y_true,np.argmax(y_pred,axis=1)).ravel()
  f1_score = tp / (tp+((fn+fp)/2))

  y_pred_train = classifier.predict(X_train)
  displayConfusionMatrix(y_train,y_pred_train,"Training")
  y_pred_val = classifier.predict(X_val)
  displayConfusionMatrix(y_val,y_pred_val,"Validation")