In [9]:
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping

In [10]:
df = pd.read_csv("data/train.csv")

In [10]:
embeddings_index = {}
f = open('data/glove.6B.100d.txt',encoding="utf")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xdf in position 1: invalid continuation byte

## View the data:

In [11]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Let's perform some data cleaning

In [12]:
data = df.text

In [13]:
labels = df.target

In [14]:
data.shape

(7613,)

In [15]:
0.2*7613

1522.6000000000001

In [16]:
7613-1522

6091

In [17]:
x_train = data[0:6100]

In [18]:
x_test = data[6100:]

In [19]:
y_train = labels[0:6100] 

In [20]:
y_test = labels[6100:]

## Tokenizing the data

In [21]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train.values)
sequences = tokenizer.texts_to_sequences(x_train.values)
sequences = sequence.pad_sequences(sequences, maxlen=200)

In [22]:
sequences.shape

(6100, 200)

In [23]:
vocab_size = len(tokenizer.word_index)+1

In [24]:
embedding_dim = 100
max_words=1513

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

## Let's define the model:

In [25]:
input_layer = Input(shape=(None,), dtype='int32', name='tweet_input')
x = layers.Embedding(vocab_size, 100, input_length=200)(input_layer)
x = layers.LSTM(32,
dropout=0.1,
recurrent_dropout=0.5,
return_sequences=True)(x)
x = layers.LSTM(32,
dropout=0.1,
recurrent_dropout=0.5,
return_sequences=False)(x)


In [26]:
x = layers.Dense(100, activation='relu')(x)
output = layers.Dense(1, activation='sigmoid')(x)

In [27]:
model = Model(input_layer,output)

In [28]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tweet_input (InputLayer)    [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         1934600   
                                                                 
 lstm (LSTM)                 (None, None, 32)          17024     
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 100)               3300      
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 1,963,345
Trainable params: 1,963,345
Non-train

In [29]:
model.layers[1].set_weights([embedding_matrix])
model.layers[1].trainable = False

In [30]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

## Training the model

In [31]:
es = EarlyStopping(monitor='val_loss', mode='min')

In [32]:
history = model.fit(sequences, y_train.values, epochs=20, validation_split=0.2, callbacks = [es])

Epoch 1/20
Epoch 2/20


In [34]:
model.save("trained.h5")

## Evaluating the model:

In [234]:
x_test.reshape(1513,-1)

AttributeError: 'Series' object has no attribute 'reshape'

In [235]:
sequences = tokenizer.texts_to_sequences(x_test.values)
sequences = sequence.pad_sequences(sequences, maxlen=200)

In [236]:
x_test

6100    @AP \n Too slow report the sinking boat in the...
6101                  We walk the plank of a sinking ship
6102    The Sinking Ship (@sinkingshipindy): Scarlet L...
6103    that horrible sinking feeling when youÛªve be...
6104    In the movie 'Titanic' Jack and Rose both coul...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 1513, dtype: object

In [237]:
x_test = sequences

In [238]:
score = model.evaluate(x_test, y_test.values)



In [239]:
score

[0.4619843661785126, 0.7799074649810791]

## Now loading Kaggle's Test Set:

In [240]:
test = pd.read_csv("test.csv")

In [241]:
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [242]:
ids = test.id

In [243]:
test = test.text

In [244]:
sequences = tokenizer.texts_to_sequences(test)
sequences = sequence.pad_sequences(sequences, maxlen=200)

In [245]:
results = model.predict(sequences)

In [246]:
results = results.round()

In [247]:
results = results.squeeze()

In [248]:
csv_df = pd.DataFrame({
    "id": ids,
    "target": results
})

In [249]:
csv_df.index = csv_df.id

In [250]:
csv_df = csv_df["target"]

In [251]:
csv_df = csv_df.astype(int)

In [252]:
csv_df.to_csv(proj_dir+"results.csv", header=True)

## Trying the model in action

In [1]:
import numpy as np
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

In [2]:
def encoder(text):
    text = tokenizer.texts_to_sequences([text])
    text = sequence.pad_sequences(text, maxlen=200)
    return text

In [3]:
def predict(text):
    encoded_text = encoder(text)
#     print(encoded_text)
    prediction = (model.predict(encoded_text))
    print(prediction)
    prediction = np.round(prediction)
    if prediction==1:
        return "Disaster"
    return "Not a Disaster"

In [5]:
predict("OMG a blazing sky!")

[[0.21607375]]


'Not a Disaster'

In [6]:
predict("fire fighters are here")

[[0.21607375]]


'Not a Disaster'

# The End?

In [257]:
predict("earthquake is here")

[[0.57285625]]


'Disaster'

In [258]:
predict("life")

[[0.20029667]]


'Not a Disaster'

In [259]:
predict("fire")

[[0.26070035]]


'Not a Disaster'

In [260]:
predict("text me")

[[0.13923046]]


'Not a Disaster'

In [263]:
predict("run for your life emergency bomb bomb bomb")

[[0.6681422]]


'Disaster'

In [262]:
predict("earthquake bomb boom boom")

[[0.8116151]]


'Disaster'

In [264]:
predict("sunflowers daisies watermelons")

[[0.21449864]]


'Not a Disaster'

In [266]:
predict("tanay gupta")

[[0.21449864]]


'Not a Disaster'

In [267]:
predict("anshita palorkar")

[[0.21449864]]


'Not a Disaster'

In [268]:
predict("gautham prabhu")

[[0.21449864]]


'Not a Disaster'

In [269]:
predict("metta venkata srujan ")

[[0.21449864]]


'Not a Disaster'

In [270]:
predict("gujarat")

[[0.21449864]]


'Not a Disaster'

In [272]:
predict("fire fire help")

[[0.5752065]]


'Disaster'

In [273]:
predict("you are an asshole")

[[0.15648368]]


'Not a Disaster'

In [274]:
predict("earthquake bomb bomb")

[[0.761991]]


'Disaster'

In [275]:
predict("send help to refugees")

[[0.28959823]]


'Not a Disaster'

In [276]:
predict("send help we are stuck")

[[0.22379905]]


'Not a Disaster'

In [282]:
predict("flash flood is killing me")

[[0.62048584]]


'Disaster'

In [283]:
predict("flood")

[[0.2707003]]


'Not a Disaster'

In [284]:
predict("earthquake")

[[0.29822612]]


'Not a Disaster'

In [285]:
predict("an earthquake is happening")

[[0.56929356]]


'Disaster'

In [286]:
predict("I dont like earthquake")

[[0.15757793]]


'Not a Disaster'

In [287]:
predict("Your mom is an earthquake")



[[0.16240293]]


'Not a Disaster'

In [288]:
predict("Flash floods in Nepal")

[[0.70219666]]


'Disaster'

In [289]:
predict("Earthquakes are bad")

[[0.2099728]]


'Not a Disaster'

In [290]:
predict("my fit is fire")pr

[[0.16299]]


'Not a Disaster'

In [291]:
predict("boom boom")

[[0.21449864]]


'Not a Disaster'

In [292]:
predict("there lived a certain man in russia")

[[0.2867236]]


'Not a Disaster'

In [296]:
predict("Delhi riots buildings on fire everywhere")

[[0.5954698]]


'Disaster'

In [311]:
predict("Soumya")

[[0.21449864]]


'Not a Disaster'

In [317]:
predict("bridge destroys in delhi")

[[0.37445554]]


'Not a Disaster'

In [4]:
from tensorflow.keras.models import load_model
model = load_model('trained.h5')

In [9]:
predict("hello")

[[0.21607375]]


'Not a Disaster'