In [4]:
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df["text"].to_numpy(),
                                                                            train_df["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

###Text Vectorization

In [8]:
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
text_vectorizer = TextVectorization(max_tokens=1000,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None,
                                    output_mode="int",
                                    output_sequence_length=15)

In [10]:
text_vectorizer.adapt(train_sentences)

In [11]:
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[282,   3, 206,   4,  13, 674,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

###Embedding

In [12]:
tf.random.set_seed(42)
embedding = tf.keras.layers.Embedding(input_dim=1000,
                             output_dim=128,
                             embeddings_initializer="uniform",
                             input_length=15,
                             name="embedding_1")

embedding

<keras.src.layers.core.embedding.Embedding at 0x786e29ebbee0>

###**Models**
* Model 0: Naive Bayes (baseline)
* Model 1: Feed-forward neural network (dense model)
* Model 2: LSTM model
* Model 3: GRU model
* Model 4: Bidirectional-LSTM model
* Model 5: 1D Convolutional Neural Network
* Model 6: TensorFlow Hub Pretrained Feature Extractor

###Model 0: Naive Bayes

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()),
                    ("clf", MultinomialNB())
])
model_0.fit(train_sentences, train_labels)

In [14]:
baseline_preds = model_0.predict(val_sentences)

In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.
  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": accuracy_score(y_true, y_pred) * 100,
                  "precision": precision,
                  "recall": recall,
                  "f1": f1}
  return model_results

In [16]:
baseline_results = calculate_results(val_labels,baseline_preds)
baseline_results

{'accuracy': 77.82152230971128,
 'precision': 0.792992256322435,
 'recall': 0.7782152230971129,
 'f1': 0.7703527809038113}

###Model 1: dense model

In [17]:
# build the model
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

# compile the model
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [18]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           128000    
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 128129 (500.50 KB)
Trainable params: 12

In [19]:
model_1_history = model_1.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
model_1.evaluate(val_sentences, val_labels)



[0.5103117227554321, 0.7769029140472412]

In [21]:
model_1_pred= model_1.predict(val_sentences)



In [22]:
model_1_preds = tf.squeeze(tf.round(model_1_pred))

In [23]:
model_1_results = calculate_results(val_labels,model_1_preds)
model_1_results

{'accuracy': 77.69028871391076,
 'precision': 0.7805919218080933,
 'recall': 0.7769028871391076,
 'f1': 0.7729547194327667}

###Model 2: LSTM model

In [30]:
tf.random.set_seed(42)
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
x = layers.LSTM(64)(x)
print(x.shape)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 15, 128)
(None, 64)


In [31]:
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [32]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           128000    
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 177473 (693.25 KB)
Trainable params: 177473 (693.25 KB)
Non-trainable params: 0 (0.00 Byte)
______________

In [33]:
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
model_2_pred = model_2.predict(val_sentences)
model_2_pred[:10]



array([[0.01026676],
       [0.19268706],
       [0.5655345 ],
       [0.01433277],
       [0.632068  ],
       [0.3690639 ],
       [0.10015847],
       [0.14000799],
       [0.11050624],
       [0.9489276 ]], dtype=float32)

In [36]:
model_2_pred = tf.squeeze(tf.round(model_2_pred))

In [37]:
model_2_results = calculate_results(val_labels,model_2_pred)
model_2_results

{'accuracy': 78.08398950131233,
 'precision': 0.7837855468483277,
 'recall': 0.7808398950131233,
 'f1': 0.7773898804495247}

###Model 3: GRU

In [38]:
tf.random.set_seed(42)
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

In [39]:
model_3.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [40]:
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           128000    
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 165313 (645.75 KB)
Trainable params: 165313 (645.75 KB)
Non-trainable params: 0 (0.00 Byte)
_______________

In [42]:
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
model_3_pred = model_3.predict(val_sentences)
model_3_pred[:10]



array([[0.00698828],
       [0.06414541],
       [0.6621655 ],
       [0.02689363],
       [0.84827703],
       [0.24106975],
       [0.11861856],
       [0.12050603],
       [0.07016749],
       [0.98409677]], dtype=float32)

In [44]:
model_3_pred= tf.squeeze(tf.round(model_3_pred))

In [47]:
model_3_results = calculate_results(val_labels,model_3_pred)
model_3_results

{'accuracy': 76.9028871391076,
 'precision': 0.7719436249869269,
 'recall': 0.7690288713910761,
 'f1': 0.7651248072593357}

###Model 4: Bidirectonal RNN model

In [48]:
tf.random.set_seed(42)
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Bidirectional")

In [50]:
model_4.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [51]:
model_4.summary()

Model: "model_4_Bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           128000    
                                                                 
 bidirectional (Bidirection  (None, 128)               98816     
 al)                                                             
                                                                 
 dense_5 (Dense)             (None, 1)                 129       
                                                                 
Total params: 226945 (886.50 KB)
Trainable pa

In [52]:
model_4_pred = model_4.predict(val_sentences)
model_4_pred[:10]



array([[0.5017058 ],
       [0.5042154 ],
       [0.49631095],
       [0.48826417],
       [0.49932677],
       [0.49874198],
       [0.49180374],
       [0.50049466],
       [0.5004407 ],
       [0.49320498]], dtype=float32)

In [53]:
model_4_pred = tf.squeeze(tf.round(model_4_pred))

In [54]:
model_4_results = calculate_results(val_labels, model_4_pred)
model_4_results

{'accuracy': 58.00524934383202,
 'precision': 0.573747927241784,
 'recall': 0.5800524934383202,
 'f1': 0.573280894067808}

###Model 5: Conv1D

In [55]:
tf.random.set_seed(42)
from tensorflow.keras import layers
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")

model_5.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_5.summary()

Model: "model_5_Conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           128000    
                                                                 
 conv1d (Conv1D)             (None, 11, 32)            20512     
                                                                 
 global_max_pooling1d (Glob  (None, 32)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense_6 (Dense)             (None, 1)              

In [56]:
model_5_history = model_5.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [57]:
model_5_pred= model_5.predict(val_sentences)



In [58]:
model_5_pred = tf.squeeze(tf.round(model_5_pred))

In [61]:
model_5_results = calculate_results(val_labels,model_5_pred)
model_5_results

{'accuracy': 74.67191601049869,
 'precision': 0.7458383161889279,
 'recall': 0.7467191601049868,
 'f1': 0.7451761796828422}

###Model 6: TensorFlow Hub Pretrained Feature Extractor

In [63]:
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [64]:
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name="USE")

In [65]:
model_6 = tf.keras.Sequential([
  sentence_encoder_layer,
  layers.Dense(64, activation="relu"),
  layers.Dense(1, activation="sigmoid")
], name="model_6_USE")

# Compile model
model_6.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_6.summary()

Model: "model_6_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_7 (Dense)             (None, 64)                32832     
                                                                 
 dense_8 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256830721 (979.73 MB)
Trainable params: 32897 (128.50 KB)
Non-trainable params: 256797824 (979.61 MB)
_________________________________________________________________


In [66]:
model_6_history = model_6.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [67]:
model_6_pred = model_6.predict(val_sentences)



In [68]:
model_6_pred = tf.squeeze(tf.round(model_6_pred))

In [69]:
model_6_results = calculate_results(val_labels, model_6_pred)
model_6_results

{'accuracy': 81.62729658792651,
 'precision': 0.8175440487292063,
 'recall': 0.8162729658792651,
 'f1': 0.8145782594567971}

###Making predictions

In [76]:
import random
test_sentences = test_df["text"].to_list()
test_samples = random.sample(test_sentences, 5)
for test_sample in test_samples:
  pred_prob = tf.squeeze(model_6.predict([test_sample]))
  pred = tf.round(pred_prob)
  print(f"Pred: {int(pred)}, Prob: {pred_prob}")
  print(f"Text:\n{test_sample}\n")
  print("----\n")

Pred: 0, Prob: 0.19301718473434448
Text:
@DodgersNation he was due to get blown up at least this is still a winnable game.

----

Pred: 1, Prob: 0.5531038641929626
Text:
Nigeria warned against massiveåÊflooding http://t.co/CofH4khFsD http://t.co/m0fLpPxIlg

----

Pred: 0, Prob: 0.07399478554725647
Text:
The Hobbit: The Desolation of Smaug (#dvd 2014 2-Disc Set Digital Copy) http://t.co/000siJjL3t http://t.co/JlUJsHCvoA

----

Pred: 0, Prob: 0.12413246929645538
Text:
IMM SCREAMING ARI IS HOLDING THE SIGN

----

Pred: 1, Prob: 0.9265311360359192
Text:
There was also a calm. Something was going to happen. Suddenly a crash of lightning came through the sky and with it the wails of sirens. --

----

