In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [2]:
df = pd.read_csv('/content/train_data_cleaning.csv.xls')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this # earthquake...,1
1,4,,,Forest fire near La Ronge Sask . Canada,1
2,5,,,All residents asked to ' shelter in place ' ...,1
3,6,,,"13,000 people receive # wildfires evacuation ...",1
4,7,,,Just got sent this photo from Ruby # Alaska a...,1


In [3]:
print((df.target == 1).sum())
print((df.target == 0).sum())

3271
4342


In [4]:
#Preprocessing

import re
import string

In [5]:
def remove_URL(text):
  url = re.compile(r"https?://\S+|www\.\S+")
  return url.sub(r"", text)

def remove_punct(text):
  translator = str.maketrans("", "", string.punctuation)
  return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
pattern = re.compile(r"https?://\S+|www\.\S+")
for t in df.text:
  matches = pattern.findall(t)
  for match in matches:
    print(t)
    print(match)
    print(pattern.sub(r"", t))
  if len(matches) > 0:
    break

In [7]:
df['text'] = df.text.map(remove_URL)
df['text'] = df.text.map(remove_punct)

In [8]:
#Remove stopwords

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
stop = set(stopwords.words('english'))

def remove_stopwords(text):
  filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
  return " ".join(filtered_words)

In [10]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [11]:
df['text'] = df.text.map(remove_stopwords)

In [12]:
df.text

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    aria ahrary thetawniest control wild fires cal...
7610                   m1 94 01 04 utc 5km volcano hawaii
7611    police investigating e bike collided car littl...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [13]:
from collections import Counter

In [15]:
def counter_word(text_col):
  count= Counter()
  for text in text_col.values:
    for word in text.split():
      count[word] += 1
    return count

counter = counter_word(df.text)

In [19]:
len(counter)

7

In [17]:
counter

Counter({'deeds': 1,
         'reason': 1,
         'earthquake': 1,
         'may': 1,
         'allah': 1,
         'forgive': 1,
         'us': 1})

In [18]:
counter.most_common(5)

[('deeds', 1), ('reason', 1), ('earthquake', 1), ('may', 1), ('allah', 1)]

In [20]:
num_unique_words = len(counter)

In [21]:
#Splitting the dataset into training and validation set

train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

train_sentences = train_df.text.to_numpy()
train_labels = train_df.target.to_numpy()
val_sentences = val_df.text.to_numpy()
val_labels = val_df.target.to_numpy()

In [22]:
train_sentences.shape, val_sentences.shape

((6090,), (1523,))

In [23]:
#Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

In [24]:
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)

In [25]:
word_index = tokenizer.word_index

In [26]:
word_index

{'like': 1,
 'fire': 2,
 'get': 3,
 'news': 4,
 '2': 5,
 'via': 6,
 'new': 7,
 'people': 8,
 'emergency': 9,
 'would': 10,
 'one': 11,
 'disaster': 12,
 'body': 13,
 'video': 14,
 'us': 15,
 'burning': 16,
 'police': 17,
 'u': 18,
 'buildings': 19,
 'time': 20,
 'crash': 21,
 'first': 22,
 'man': 23,
 'day': 24,
 'california': 25,
 '3': 26,
 '1': 27,
 'still': 28,
 'got': 29,
 'two': 30,
 'know': 31,
 '2015': 32,
 'back': 33,
 'going': 34,
 'attack': 35,
 'world': 36,
 'full': 37,
 'love': 38,
 'accident': 39,
 'go': 40,
 'nuclear': 41,
 '4': 42,
 'see': 43,
 'today': 44,
 'youtube': 45,
 'may': 46,
 'rt': 47,
 'year': 48,
 'many': 49,
 'watch': 50,
 'cannot': 51,
 '5': 52,
 'collapse': 53,
 'hiroshima': 54,
 'car': 55,
 'dead': 56,
 'mass': 57,
 'let': 58,
 'could': 59,
 'life': 60,
 'want': 61,
 'good': 62,
 'years': 63,
 'work': 64,
 'best': 65,
 'death': 66,
 'train': 67,
 'last': 68,
 'fires': 69,
 'say': 70,
 'think': 71,
 'families': 72,
 'way': 73,
 'killed': 74,
 'hot': 75,
 '

In [28]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [29]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking']
[[], [], [], [], []]


In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [32]:
max_length = 20
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding='post', truncating='post')
train_padded.shape, val_padded.shape

((6090, 20), (1523, 20))

In [33]:
train_padded[10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)

In [34]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

three people died heat wave far
[]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [35]:
#Check reversing the indices

reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [37]:
reverse_word_index

{1: 'like',
 2: 'fire',
 3: 'get',
 4: 'news',
 5: '2',
 6: 'via',
 7: 'new',
 8: 'people',
 9: 'emergency',
 10: 'would',
 11: 'one',
 12: 'disaster',
 13: 'body',
 14: 'video',
 15: 'us',
 16: 'burning',
 17: 'police',
 18: 'u',
 19: 'buildings',
 20: 'time',
 21: 'crash',
 22: 'first',
 23: 'man',
 24: 'day',
 25: 'california',
 26: '3',
 27: '1',
 28: 'still',
 29: 'got',
 30: 'two',
 31: 'know',
 32: '2015',
 33: 'back',
 34: 'going',
 35: 'attack',
 36: 'world',
 37: 'full',
 38: 'love',
 39: 'accident',
 40: 'go',
 41: 'nuclear',
 42: '4',
 43: 'see',
 44: 'today',
 45: 'youtube',
 46: 'may',
 47: 'rt',
 48: 'year',
 49: 'many',
 50: 'watch',
 51: 'cannot',
 52: '5',
 53: 'collapse',
 54: 'hiroshima',
 55: 'car',
 56: 'dead',
 57: 'mass',
 58: 'let',
 59: 'could',
 60: 'life',
 61: 'want',
 62: 'good',
 63: 'years',
 64: 'work',
 65: 'best',
 66: 'death',
 67: 'train',
 68: 'last',
 69: 'fires',
 70: 'say',
 71: 'think',
 72: 'families',
 73: 'way',
 74: 'killed',
 75: 'hot',
 7

In [38]:
def decode(sequence):
  return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [40]:
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

[]



In [41]:
#Create LSTM model
from tensorflow.keras import layers

In [42]:
model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 32)            224       
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 25121 (98.13 KB)
Trainable params: 25121 (98.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
loss=keras.losses.BinaryCrossentropy(from_logits=False)
optimizer=keras.optimizers.Adam(lr=0.001)
metrics=['accuracy']

model.compile(loss=loss, optimizer=optimizer, metrics=metrics)



In [47]:
model.fit(train_padded, train_labels, epochs=20, validation_data=(val_padded, val_labels), verbose=2)

Epoch 1/20
191/191 - 3s - loss: 0.6627 - accuracy: 0.6138 - val_loss: 0.6856 - val_accuracy: 0.5522 - 3s/epoch - 15ms/step
Epoch 2/20
191/191 - 3s - loss: 0.6626 - accuracy: 0.6123 - val_loss: 0.6858 - val_accuracy: 0.5522 - 3s/epoch - 14ms/step
Epoch 3/20
191/191 - 3s - loss: 0.6628 - accuracy: 0.6133 - val_loss: 0.6901 - val_accuracy: 0.5509 - 3s/epoch - 17ms/step
Epoch 4/20
191/191 - 4s - loss: 0.6630 - accuracy: 0.6135 - val_loss: 0.6902 - val_accuracy: 0.5522 - 4s/epoch - 23ms/step
Epoch 5/20
191/191 - 3s - loss: 0.6625 - accuracy: 0.6133 - val_loss: 0.6857 - val_accuracy: 0.5515 - 3s/epoch - 15ms/step
Epoch 6/20
191/191 - 3s - loss: 0.6627 - accuracy: 0.6140 - val_loss: 0.6869 - val_accuracy: 0.5515 - 3s/epoch - 15ms/step
Epoch 7/20
191/191 - 3s - loss: 0.6626 - accuracy: 0.6136 - val_loss: 0.6866 - val_accuracy: 0.5509 - 3s/epoch - 16ms/step
Epoch 8/20
191/191 - 4s - loss: 0.6628 - accuracy: 0.6125 - val_loss: 0.6872 - val_accuracy: 0.5515 - 4s/epoch - 23ms/step
Epoch 9/20
191/1

<keras.src.callbacks.History at 0x7f3f7c17bcd0>

In [48]:
predictions = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]



In [50]:
print(train_sentences[10:20])
print(train_labels[10:20])
print(predictions[10:20])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking' 'man' 'love fruits'
 'summer lovely' 'car fast' 'goooooooaaaaaal']
[1 1 1 1 1 0 0 0 0 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
