In [1]:
# Libraries 

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import bigrams, trigrams
from collections import Counter
from collections import defaultdict
import random
from operator import mul
from functools import reduce

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa

print("Successfully imported all packages")

Successfully imported all packages


## Data

In [2]:
#importing the data and drop unnecessary columns
data = pd.read_csv('socialmedia-disaster-tweets-DFE 3.csv', error_bad_lines=False)
data = data.drop(columns=["_unit_id", "_golden", "_unit_state", "_trusted_judgments","choose_one:confidence", "choose_one_gold", "_last_judgment_at"])
data["target"] = data.apply(lambda row: 1 if row["choose_one"]== "Relevant" else 0 ,axis =1)
full_dataset = data 



  exec(code_obj, self.user_global_ns, self.user_ns)


## Preprocessing 

### Tokenization

In [3]:
# Tokenization 
data["tweet_tokens"] = data.apply(lambda row: row["text"].split(), axis = 1)
data["tweet_tokens_joined"] = data.apply(lambda row: " ".join(row["tweet_tokens"]), axis = 1)

### Stopwords

In [4]:
#removing the stopwords
stopset = set(stopwords.words('english'))

data['cleaned_tweets'] = data.apply(lambda row: [w for w in row["tweet_tokens"] if w not in stopset], axis = 1)
data["cleaned_tweets_joined"] = data.apply(lambda row: " ".join(row["cleaned_tweets"]), axis = 1)

data.head(10)

Unnamed: 0,choose_one,keyword,location,text,tweetid,userid,target,tweet_tokens,tweet_tokens_joined,cleaned_tweets,cleaned_tweets_joined
0,Relevant,,,Just happened a terrible car crash,1.0,,1,"[Just, happened, a, terrible, car, crash]",Just happened a terrible car crash,"[Just, happened, terrible, car, crash]",Just happened terrible car crash
1,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,,1,"[Our, Deeds, are, the, Reason, of, this, #eart...",Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, Reason, #earthquake, May, ALLAH, ...",Our Deeds Reason #earthquake May ALLAH Forgive us
2,Relevant,,,"Heard about #earthquake is different cities, s...",14.0,,1,"[Heard, about, #earthquake, is, different, cit...","Heard about #earthquake is different cities, s...","[Heard, #earthquake, different, cities,, stay,...","Heard #earthquake different cities, stay safe ..."
3,Relevant,,,"there is a forest fire at spot pond, geese are...",15.0,,1,"[there, is, a, forest, fire, at, spot, pond,, ...","there is a forest fire at spot pond, geese are...","[forest, fire, spot, pond,, geese, fleeing, ac...","forest fire spot pond, geese fleeing across st..."
4,Relevant,,,Forest fire near La Ronge Sask. Canada,16.0,,1,"[Forest, fire, near, La, Ronge, Sask., Canada]",Forest fire near La Ronge Sask. Canada,"[Forest, fire, near, La, Ronge, Sask., Canada]",Forest fire near La Ronge Sask. Canada
5,Relevant,,,All residents asked to 'shelter in place' are ...,17.0,,1,"[All, residents, asked, to, 'shelter, in, plac...",All residents asked to 'shelter in place' are ...,"[All, residents, asked, 'shelter, place', noti...",All residents asked 'shelter place' notified o...
6,Relevant,,,"13,000 people receive #wildfires evacuation or...",18.0,,1,"[13,000, people, receive, #wildfires, evacuati...","13,000 people receive #wildfires evacuation or...","[13,000, people, receive, #wildfires, evacuati...","13,000 people receive #wildfires evacuation or..."
7,Relevant,,,Just got sent this photo from Ruby #Alaska as ...,19.0,,1,"[Just, got, sent, this, photo, from, Ruby, #Al...",Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, photo, Ruby, #Alaska, smoke,...",Just got sent photo Ruby #Alaska smoke #wildfi...
8,Relevant,,,#RockyFire Update => California Hwy. 20 closed...,20.0,,1,"[#RockyFire, Update, =>, California, Hwy., 20,...",#RockyFire Update => California Hwy. 20 closed...,"[#RockyFire, Update, =>, California, Hwy., 20,...",#RockyFire Update => California Hwy. 20 closed...
9,Relevant,,,Apocalypse lighting. #Spokane #wildfires,21.0,,1,"[Apocalypse, lighting., #Spokane, #wildfires]",Apocalypse lighting. #Spokane #wildfires,"[Apocalypse, lighting., #Spokane, #wildfires]",Apocalypse lighting. #Spokane #wildfires


### Lemmetization

In [5]:
#using lemmatization
wnl = nltk.WordNetLemmatizer()

data['lemmatized_tweets'] = data.apply(lambda row: [wnl.lemmatize(w) for w in row["cleaned_tweets"]], axis = 1)
data["lemmatized_tweets_joined"] = data.apply(lambda row: " ".join(row["lemmatized_tweets"]), axis = 1)

data.head()

Unnamed: 0,choose_one,keyword,location,text,tweetid,userid,target,tweet_tokens,tweet_tokens_joined,cleaned_tweets,cleaned_tweets_joined,lemmatized_tweets,lemmatized_tweets_joined
0,Relevant,,,Just happened a terrible car crash,1.0,,1,"[Just, happened, a, terrible, car, crash]",Just happened a terrible car crash,"[Just, happened, terrible, car, crash]",Just happened terrible car crash,"[Just, happened, terrible, car, crash]",Just happened terrible car crash
1,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,,1,"[Our, Deeds, are, the, Reason, of, this, #eart...",Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, Reason, #earthquake, May, ALLAH, ...",Our Deeds Reason #earthquake May ALLAH Forgive us,"[Our, Deeds, Reason, #earthquake, May, ALLAH, ...",Our Deeds Reason #earthquake May ALLAH Forgive u
2,Relevant,,,"Heard about #earthquake is different cities, s...",14.0,,1,"[Heard, about, #earthquake, is, different, cit...","Heard about #earthquake is different cities, s...","[Heard, #earthquake, different, cities,, stay,...","Heard #earthquake different cities, stay safe ...","[Heard, #earthquake, different, cities,, stay,...","Heard #earthquake different cities, stay safe ..."
3,Relevant,,,"there is a forest fire at spot pond, geese are...",15.0,,1,"[there, is, a, forest, fire, at, spot, pond,, ...","there is a forest fire at spot pond, geese are...","[forest, fire, spot, pond,, geese, fleeing, ac...","forest fire spot pond, geese fleeing across st...","[forest, fire, spot, pond,, goose, fleeing, ac...","forest fire spot pond, goose fleeing across st..."
4,Relevant,,,Forest fire near La Ronge Sask. Canada,16.0,,1,"[Forest, fire, near, La, Ronge, Sask., Canada]",Forest fire near La Ronge Sask. Canada,"[Forest, fire, near, La, Ronge, Sask., Canada]",Forest fire near La Ronge Sask. Canada,"[Forest, fire, near, La, Ronge, Sask., Canada]",Forest fire near La Ronge Sask. Canada


## Algorithms

### Naive Bayes

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
#train validation test split. The final models have been trained with only train & test set. Validation set has been only used for hyperparameter optimization prior final training
X_train, X_test, y_train, y_test = train_test_split(data["lemmatized_tweets_joined"], data["target"], test_size = 0.3, shuffle= True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [269]:
#multinomial naive bayes pipeline
NB_pipe_clf = Pipeline(
    [('vect', CountVectorizer(decode_error='ignore', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('nb_clf', MultinomialNB())
    ])

In [270]:
# training the model 
nb_clf = NB_pipe_clf.fit(X_train, y_train)

#predicting 
preds = NB_pipe_clf.predict(X_test)


#evaluation
print("Accuracy: ", accuracy_score(y_test, preds))
print("Precision: ", precision_score(y_test, preds))
print("Recall: ", recall_score(y_test, preds))
print("F1-Measure: ", f1_score(y_test, preds))


Accuracy:  0.8061787421846267
Precision:  0.8425821064552661
Recall:  0.657243816254417
F1-Measure:  0.7384615384615385


### SVM

In [272]:
# SVM pipeline
SVM_pipe_clf = Pipeline(
    [('vect', CountVectorizer(decode_error='ignore', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('svm_clf', LinearSVC())
    ])

In [273]:
# Training
svm_clf = SVM_pipe_clf.fit(X_train, y_train)

# Predicting
svm_preds = SVM_pipe_clf.predict(X_test)

# Evaluation
print("Accuracy: ", accuracy_score(y_test, svm_preds))
print("Precision: ", precision_score(y_test, svm_preds))
print("Recall: ", recall_score(y_test, svm_preds))
print("F1-Measure: ", f1_score(y_test, svm_preds))

Accuracy:  0.7910996689959544
Precision:  0.7625698324022346
Recall:  0.7234982332155477
F1-Measure:  0.7425203989120581


### BERT

In [20]:
#importing bert models
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1")

# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])



2022-05-27 14:54:07.766766: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-27 14:54:09.318382: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [21]:
#compiling & training BERT model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train,epochs=15, batch_size = 32, validation_data= (X_val,y_val))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [22]:
#prediction of BERT
y_predicted_bert = model.predict(X_test)
y_predicted_bert = y_predicted_bert.flatten()
print(y_predicted_bert)

[0.50622815 0.12553826 0.7523049  ... 0.8410486  0.18414131 0.36373973]


In [23]:
rounded_pre_bert = [round(num) for num in y_predicted_bert]

In [24]:
print("Accuracy: ", accuracy_score(y_test, rounded_pre_bert))
print("Precision: ", precision_score(y_test, rounded_pre_bert))
print("Recall: ", recall_score(y_test, rounded_pre_bert))
print("F1-Measure: ", f1_score(y_test, rounded_pre_bert))

Accuracy:  0.7704566349984676
Precision:  0.754894283476899
Recall:  0.6885714285714286
F1-Measure:  0.7202091893911095


### LSTM 

In [29]:

#creating the encoder for LSTM
X_train = tf.convert_to_tensor(X_train)

print("Train: ", X_train)

VOCAB_SIZE = 30000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(tf.convert_to_tensor(data["lemmatized_tweets_joined"]))

Train:  tf.Tensor(
[b'The good thing #Royals face newbie playoffs. No real reason panic.'
 b'Skinny Jeans Hazardous Your Health! #socialnews http://t.co/LTMa9xQXpx'
 b'I added video @YouTube playlist http://t.co/v2yXurne2p Natural Disaster Survival - HUG BY A GUEST!! Roblox'
 ...
 b'Hundreds feared drowned another Mediterranean asylum seeker boat sinking http://t.co/zsYkzj2bzG'
 b'Trouble trouble I get way ????'
 b'Bayelsa poll: Tension Bayelsa Patience Jonathan plan hijack APC PDP: Plans former First Lady and... http://t.co/3eJL9lZlCH'], shape=(6090,), dtype=string)


In [30]:
#creating the simple LSTM model
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.2)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

In [31]:
#compiling LSTM
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
              )

In [306]:
#training the model
history = model.fit(X_train, y_train, epochs = 15, validation_data = (X_val, y_val))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
y_predicted_lstm = model.predict(X_test)

In [283]:
#final metrics from the LSTM model.
print("Accuracy: ", accuracy_score(y_test, y_predicted_lstm))
print("Precision: ", precision_score(y_test, y_predicted_lstm))
print("Recall: ", recall_score(y_test, y_predicted_lstm))
print("F1-Measure: ", f1_score(y_test, y_predicted_lstm))

Accuracy:  0.7848473703567488
Precision:  0.7937701396348013
Recall:  0.6528268551236749
F1-Measure:  0.7164323800290839


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6cc5b975-c378-4446-8b15-03c3047279ca' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>