In [118]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import nltk
import re
from nltk.corpus import stopwords

### Baseline model training

In [119]:
df = pd.DataFrame(pd.read_csv('data/train.csv', index_col='id'))
df

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [120]:
df = df.drop(['location'], axis=1).dropna()
df

Unnamed: 0_level_0,keyword,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
48,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
49,ablaze,We always try to bring the heavy. #metal #RT h...,0
50,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
52,ablaze,Crying out for more! Set me ablaze,0
53,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...
10830,wrecked,@jt_ruff23 @cameronhacker and I wrecked you both,0
10831,wrecked,Three days off from work and they've pretty mu...,0
10832,wrecked,#FX #forex #trading Cramer: Iger's 3 words tha...,0
10833,wrecked,@engineshed Great atmosphere at the British Li...,0


In [121]:
X = df[['keyword', 'text']]
y = df[['target']]
X.head()

Unnamed: 0_level_0,keyword,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
48,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...
49,ablaze,We always try to bring the heavy. #metal #RT h...
50,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...
52,ablaze,Crying out for more! Set me ablaze
53,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...


In [122]:
y.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
48,1
49,0
50,1
52,0
53,0


In [123]:
voc_size = 5000

tweets = X.copy()
tweets.reset_index(inplace=True)
tweets['text'][1]

'We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw'

In [124]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


False

In [125]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(tweets)):
    review = re.sub('[^a-zA-Z]', ' ', tweets['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

corpus[:5]

['bbcmtd wholesal market ablaz http co lhyxeohi c',
 'alway tri bring heavi metal rt http co yao e xngw',
 'africanbaz break news nigeria flag set ablaz aba http co nndbgwyei',
 'cri set ablaz',
 'plu side look sky last night ablaz http co qqsmshaj n']

In [126]:
onehot_repr=[one_hot(words, voc_size) for words in corpus] 
print(onehot_repr[0])

[3387, 3415, 2936, 4895, 2701, 461, 2846, 3777]


In [127]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs[0])

[   0    0    0    0    0    0    0    0    0    0    0    0 3387 3415
 2936 4895 2701  461 2846 3777]


In [128]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 20, 40)            200000    
                                                                 
 dropout_4 (Dropout)         (None, 20, 40)            0         
                                                                 
 lstm_3 (LSTM)               (None, 100)               56400     
                                                                 
 dropout_5 (Dropout)         (None, 100)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [129]:
len(embedded_docs),y.shape

(7552, (7552, 1))

In [130]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [131]:
X_final.shape,y_final.shape

((7552, 20), (7552, 1))

In [132]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [133]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10


2022-12-23 23:29:19.175867: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-23 23:29:19.392530: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-23 23:29:19.586041: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-12-23 23:29:22.169165: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-23 23:29:22.234555: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2e6b0dd90>

In [134]:
y_pred = model.predict(X_test)

 1/78 [..............................] - ETA: 20s

2022-12-23 23:29:38.765666: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-23 23:29:38.813837: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [135]:
print([round(*n) for n in y_pred[:20]])
print([round(*n) for n in y_test[:20]])

[1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0]
[1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1]


In [136]:
#confusion_matrix(y_test, y_pred)

In [137]:
accuracy_score([round(*n) for n in y_test[:20]],[round(*n) for n in y_pred[:20]])

0.9

### Making predictions for the real data

In [148]:
df_out = pd.DataFrame(pd.read_csv('data/test.csv'))
df_out = df_out.drop(['location'], axis=1)
df_out.head()

Unnamed: 0,id,keyword,text
0,0,,Just happened a terrible car crash
1,2,,"Heard about #earthquake is different cities, s..."
2,3,,"there is a forest fire at spot pond, geese are..."
3,9,,Apocalypse lighting. #Spokane #wildfires
4,11,,Typhoon Soudelor kills 28 in China and Taiwan


In [158]:
df_out.shape

(3263, 3)

In [149]:
X_out = df_out[['keyword', 'text']]
X_out.head()

Unnamed: 0,keyword,text
0,,Just happened a terrible car crash
1,,"Heard about #earthquake is different cities, s..."
2,,"there is a forest fire at spot pond, geese are..."
3,,Apocalypse lighting. #Spokane #wildfires
4,,Typhoon Soudelor kills 28 in China and Taiwan


In [159]:
tweets_out = X_out.copy()
tweets_out.reset_index(inplace=True)
tweets_out['text'][0]

'Just happened a terrible car crash'

In [160]:
corpus_out = []
for i in range(0, len(tweets_out)):
    review = re.sub('[^a-zA-Z]', ' ', tweets_out['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_out.append(review)

corpus_out[:5]

['happen terribl car crash',
 'heard earthquak differ citi stay safe everyon',
 'forest fire spot pond gees flee across street cannot save',
 'apocalyps light spokan wildfir',
 'typhoon soudelor kill china taiwan']

In [161]:
onehot_repr_out = [one_hot(words, voc_size) for words in corpus_out] 
print(onehot_repr_out[0])

[467, 4053, 1196, 4509]


In [162]:
len(onehot_repr_out)

3263

In [164]:
sent_length = 20
embedded_docs_out = pad_sequences(onehot_repr_out, padding='pre', maxlen=sent_length)
print(embedded_docs_out[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0  467 4053 1196 4509]


In [165]:
y_pred_out = model.predict(np.array(embedded_docs_out))
print(y_pred_out[:5])

[[0.9758268 ]
 [0.40028554]
 [0.99999   ]
 [0.18965062]
 [0.9998191 ]]


In [166]:
df_submission = pd.DataFrame()
df_submission['id'] = df_out[['id']]
df_submission['target'] = [round(*n) for n in y_pred_out]
df_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [None]:
df_submission.to_csv('submission.csv', index=False)