## Load & preprocess the data

In [2]:
import pandas as pd
import numpy as np
import re
import string

In [3]:
df = pd.read_csv('./data/train.csv')
df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
print((df['target']==1).sum()) # Disaster
print((df['target']==0).sum()) # No disaster


3271
4342


In [5]:
def remove_URL(txt):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', txt)

def remove_punct(txt):
    translator = str.maketrans("", "", string.punctuation)
    return txt.translate(translator)

In [6]:
df['text'] = df.text.map(remove_URL)
df['text'] = df.text.map(remove_punct)


In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))

def remove_stopwords(txt):
    filtered_words = [word.lower() for word in txt.split() if word.lower() not in stop]
    return ' '.join(filtered_words)


[nltk_data] Downloading package stopwords to /Users/siro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
df['text']= df.text.map(remove_stopwords)

In [47]:
df.text[0]

'deeds reason earthquake may allah forgive us'

In [10]:
from collections import Counter

def count_word(txt_col):
    count = Counter()
    for t in txt_col.values:
        for word in t.split():
            count[word] += 1
    return count

counter = count_word(df.text)

In [1]:
# counter

In [12]:
counter.most_common(5)

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

In [13]:
n_unique_words = len(counter)

In [14]:
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

train_sent = train_df.text.to_numpy()
train_labels = train_df.target.to_numpy()

val_sent = val_df.text.to_numpy()
val_labels = val_df.target.to_numpy()

In [15]:
train_sent.shape, val_sent.shape

((6090,), (1523,))

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=n_unique_words)
tokenizer.fit_on_texts(train_sent)

In [17]:
word_idx = tokenizer.word_index

In [2]:
# word_idx

In [19]:
train_seq = tokenizer.texts_to_sequences(train_sent)
val_seq = tokenizer.texts_to_sequences(val_sent)

In [20]:
print(train_sent[10:15])
print(train_seq[10:15])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking']
[[520, 8, 395, 156, 297, 411], [749, 470, 2248, 138, 2249, 2813, 521, 611, 188, 470, 2248, 189, 189, 5679, 117], [2814, 117, 1884, 5680, 2248, 1285, 1450, 522, 256, 644, 2815], [99, 3742, 612, 1451, 3742], [111, 91, 336, 3743, 3744, 52, 22, 312]]


In [21]:
word_idx['gonna']

189

In [46]:
len(train_seq)

6090

In [22]:
# pad the seq to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 20

train_padded = pad_sequences(train_seq, maxlen=max_length, padding='post', truncating='post')
val_padded = pad_sequences(val_seq, maxlen=max_length, padding='post', truncating='post')
train_padded.shape, val_padded.shape

((6090, 20), (1523, 20))

In [23]:
train_padded[10]

array([520,   8, 395, 156, 297, 411,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [24]:
reverse_word_idx = dict([(idx, word) for (word, idx) in word_idx.items()])

In [25]:
def decode(seq):
    return ' '.join([reverse_word_idx.get(idx, '?') for idx in seq])

In [26]:
decoded_txt = decode(train_seq[10])

print(train_seq[10])
print(decoded_txt)
print(train_sent[10])

[520, 8, 395, 156, 297, 411]
three people died heat wave far
three people died heat wave far


In [42]:
train_labels.shape

(6090,)

In [43]:
train_padded.shape

(6090, 20)

## Create & train model

In [38]:
n_unique_words

17971

In [39]:
max_length

20

In [37]:
train_padded.shape

(6090, 20)

In [40]:
train_padded

array([[ 3739,   696,   235, ...,     0,     0,     0],
       [   71,     3,   129, ...,     0,     0,     0],
       [ 1448,  1186,  1882, ...,     0,     0,     0],
       ...,
       [  151,     1,  1256, ...,     0,     0,     0],
       [ 1256,   448,    15, ...,     0,     0,     0],
       [15469,   151,   204, ...,     0,     0,     0]], dtype=int32)

In [28]:
import tensorflow as tf

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(n_unique_words, 32, input_length=max_length),
    tf.keras.layers.LSTM(64, dropout=0.1),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


model.summary()

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-03-28 20:29:55.746183: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-03-28 20:29:55.746290: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            575072    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 599,969
Trainable params: 599,969
Non-trainable params: 0
_________________________________________________________________


In [29]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
optim = tf.keras.optimizers.Adam(lr=0.001)
metrics = ['accuracy']

model.compile(loss=loss, optimizer=optim, metrics=metrics)




In [30]:
model.fit(train_padded, train_labels, epochs=20, verbose=2)

Epoch 1/20


2022-03-28 20:29:56.142922: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-03-28 20:29:56.143110: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-03-28 20:29:56.553028: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-03-28 20:29:56.799129: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-03-28 20:29:58.259959: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


191/191 - 6s - loss: 0.5508 - accuracy: 0.7130
Epoch 2/20
191/191 - 3s - loss: 0.2949 - accuracy: 0.8819
Epoch 3/20
191/191 - 3s - loss: 0.1552 - accuracy: 0.9516
Epoch 4/20
191/191 - 3s - loss: 0.1101 - accuracy: 0.9632
Epoch 5/20
191/191 - 3s - loss: 0.0882 - accuracy: 0.9716
Epoch 6/20
191/191 - 3s - loss: 0.0780 - accuracy: 0.9772
Epoch 7/20
191/191 - 3s - loss: 0.0628 - accuracy: 0.9787
Epoch 8/20
191/191 - 3s - loss: 0.0589 - accuracy: 0.9785
Epoch 9/20
191/191 - 3s - loss: 0.0475 - accuracy: 0.9800
Epoch 10/20
191/191 - 3s - loss: 0.0388 - accuracy: 0.9823
Epoch 11/20
191/191 - 2s - loss: 0.0405 - accuracy: 0.9826
Epoch 12/20
191/191 - 2s - loss: 0.0331 - accuracy: 0.9846
Epoch 13/20
191/191 - 2s - loss: 0.0362 - accuracy: 0.9824
Epoch 14/20
191/191 - 3s - loss: 0.0323 - accuracy: 0.9841
Epoch 15/20
191/191 - 3s - loss: 0.0291 - accuracy: 0.9846
Epoch 16/20
191/191 - 3s - loss: 0.0408 - accuracy: 0.9819
Epoch 17/20
191/191 - 2s - loss: 0.0387 - accuracy: 0.9828
Epoch 18/20
191/1

<tensorflow.python.keras.callbacks.History at 0x178871f70>

In [31]:
preds = model.predict(val_padded)
preds = [1 if p > 0.5 else 0 for p in preds]

2022-03-28 20:30:49.670528: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-03-28 20:30:49.692818: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [32]:
print(val_sent[10:20])
print(val_labels[10:20])
print(preds[10:20])

['ap slow report sinking boat mediterranean sea shame'
 'walk plank sinking ship'
 'sinking ship sinkingshipindy scarlet lane lenore replacing stone saison stonebrewingco'
 'horrible sinking feeling you\x89ûªve home phone realise 3g whole time'
 'movie titanic jack rose could stayed wooden beam without sinking'
 '\x89û¢\x89û¢if lost amp alone sinking like stone carry onå¡å¡'
 'theres chance get gander sinking ship tna cant help appease morbid curiosity destinationimpact'
 'happy exercised demon att price kept rising service kept sinking goodbye'
 'feel like sinking low selfimage take quiz'
 'investment news keurig green mountain inc thirdquarter earnings shares sinking afterhours stocks new\x89û']
[1 0 0 0 0 0 0 0 1 0]
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0]


In [33]:
from sklearn.metrics import accuracy_score

accuracy_score(val_labels, preds)

0.7255416940249507

## Test

In [34]:
test = pd.read_csv('./data/test.csv')

test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [35]:

test_X = test['text']

test_X

0                      Just happened a terrible car crash
1       Heard about #earthquake is different cities, s...
2       there is a forest fire at spot pond, geese are...
3                Apocalypse lighting. #Spokane #wildfires
4           Typhoon Soudelor kills 28 in China and Taiwan
                              ...                        
3258    EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259    Storm in RI worse than last hurricane. My city...
3260    Green Line derailment in Chicago http://t.co/U...
3261    MEG issues Hazardous Weather Outlook (HWO) htt...
3262    #CityofCalgary has activated its Municipal Eme...
Name: text, Length: 3263, dtype: object