# Projet - Real or Not? NLP with Disaster Tweets

## Project description 

* Competition - https://www.kaggle.com/c/nlp-getting-started

## Preprocessing

In [35]:
# Import Tensorflow & Pathlib librairies
import tensorflow as tf 
import pathlib 
import pandas as pd 
import os
import io
import warnings
import sys
import numpy
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
numpy.set_printoptions(threshold=sys.maxsize)

In [36]:
# Import dataset with Pandas 
dataset = pd.read_csv("/content/train.csv", error_bad_lines=False, encoding="utf-8")
dataset = dataset.dropna()
dataset= dataset[dataset.target.notnull()]
#dataset_test = pd.read_csv("/content/test.csv", error_bad_lines=False, encoding="utf-8")
dataset.head(100)

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
36,54,ablaze,Pretoria,@PhDSquares #mufc they've built so much hype a...,0
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1
39,57,ablaze,Paranaque City,Ablaze for you Lord :D,0
40,59,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0
42,62,ablaze,milky way,Had an awesome time visiting the CFC head offi...,0


In [37]:
dataset.keyword.isnull().values

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [38]:
# Let's take the columns we're interested in 
ds_train = dataset[[ "keyword","text","target"]]
ds_train["text"]=ds_train["keyword"]+" "+ ds_train["text"]
ds_train=ds_train[ds_train["keyword"] !="traumatised" ]
ds_train.drop(['keyword'], axis=1, inplace=True)

ds_train.head()

Unnamed: 0,text,target
31,ablaze @bbcmtd Wholesale Markets ablaze http:/...,1
32,ablaze We always try to bring the heavy. #meta...,0
33,ablaze #AFRICANBAZE: Breaking news:Nigeria fla...,1
34,ablaze Crying out for more! Set me ablaze,0
35,ablaze On plus side LOOK AT THE SKY LAST NIGHT...,0


In [39]:
!python -m spacy download en_core_web_md -q
!pip install spacy -q

[K     |████████████████████████████████| 96.4 MB 1.3 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [40]:
import en_core_web_md
nlp = en_core_web_md.load()
# Import Stop words 
from spacy.lang.en.stop_words import STOP_WORDS

**Cleaning and filtering**

In [41]:
import re
def clean_text(text):
  text = "".join([ch for ch in text if ch.isalnum() or ch == " "])
  text = re.sub(" +", " ", text).lower().strip()
  
  text = re.sub(" like ", "", text).lower().strip()
  text = re.sub(" video ", "", text).lower().strip()
  text = re.sub("'s", "", text).lower().strip()  
  text=  re.sub('%20',  ' ',text)
  text=  re.sub('httptco',  '',text)
  
  text = re.sub(r'\d+',  ' ',text)
  text= re.sub(r'@\w+',  '',text)
  text = re.sub("&amp",  '',text)
  
  text = " ".join([token.lemma_ for token in nlp(text) if token.text not in STOP_WORDS and token.lemma_ not in STOP_WORDS])
  return text

In [42]:
ds_train["text_clean"] = ds_train["text"].apply(lambda x: clean_text(x))


In [43]:
ds_train.head(100)

Unnamed: 0,text,target,text_clean
31,ablaze @bbcmtd Wholesale Markets ablaze http:/...,1,ablaze bbcmtd wholesale market ablaze lhyxeohy c
32,ablaze We always try to bring the heavy. #meta...,0,ablaze try bring heavy metal rt yao e xngw
33,ablaze #AFRICANBAZE: Breaking news:Nigeria fla...,1,ablaze africanbaze break newsnigeria flag set ...
34,ablaze Crying out for more! Set me ablaze,0,ablaze cry set ablaze
35,ablaze On plus side LOOK AT THE SKY LAST NIGHT...,0,ablaze plus look sky night ablaze qqsmshaj n
36,ablaze @PhDSquares #mufc they've built so much...,0,ablaze phdsquares mufc build hype new acquisit...
37,ablaze INEC Office in Abia Set Ablaze - http:/...,1,ablaze inec office abia set ablaze imaomknna
39,ablaze Ablaze for you Lord :D,0,ablaze ablaze lord d
40,ablaze Check these out: http://t.co/rOI2NSmEJJ...,0,ablaze check roi nsmejj tj zjin yduixefipe...
42,ablaze Had an awesome time visiting the CFC he...,0,ablaze awesome time visit cfc head office anco...


**Text Tokenizing**

In [44]:
import numpy as np
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000) # instanciate the tokenizer
tokenizer.fit_on_texts(ds_train.text_clean)

ds_train["text_encoded"] = tokenizer.texts_to_sequences(ds_train.text_clean)


ds_train["len_text"] = ds_train["text_encoded"].apply(lambda x: len(x))
ds_train = ds_train[ds_train["len_text"]!=0]

In [45]:
len(tokenizer.word_index)

16024

In [46]:
tokenizer.word_index

{'fire': 1,
 's': 2,
 'emergency': 3,
 'burn': 4,
 'amp': 5,
 'building': 6,
 'bomb': 7,
 'new': 8,
 'attack': 9,
 'body': 10,
 'disaster': 11,
 'storm': 12,
 'collapse': 13,
 'scream': 14,
 'flood': 15,
 'crash': 16,
 'people': 17,
 'accident': 18,
 'police': 19,
 'news': 20,
 'death': 21,
 'bag': 22,
 'suicide': 23,
 'destroy': 24,
 'weapon': 25,
 'derail': 26,
 'kill': 27,
 'drown': 28,
 'time': 29,
 'good': 30,
 'come': 31,
 'wound': 32,
 'injury': 33,
 'nuclear': 34,
 'collide': 35,
 'fatality': 36,
 'wreck': 37,
 'day': 38,
 'blow': 39,
 'rescue': 40,
 'evacuate': 41,
 'man': 42,
 'know': 43,
 'u': 44,
 'plan': 45,
 'mass': 46,
 'demolish': 47,
 'year': 48,
 'family': 49,
 'panic': 50,
 'w': 51,
 'survive': 52,
 'obliterate': 53,
 'think': 54,
 'watch': 55,
 'm': 56,
 'sink': 57,
 'casualty': 58,
 'fear': 59,
 'd': 60,
 'explode': 61,
 'x': 62,
 'deluge': 63,
 'dead': 64,
 'work': 65,
 'service': 66,
 'pm': 67,
 'blaze': 68,
 'c': 69,
 'siren': 70,
 'love': 71,
 'fall': 72,
 'cru

In [47]:

ds_train.head()


Unnamed: 0,text,target,text_clean,text_encoded,len_text
31,ablaze @bbcmtd Wholesale Markets ablaze http:/...,1,ablaze bbcmtd wholesale market ablaze lhyxeohy c,"[160, 4519, 1765, 328, 160, 4520, 69]",7
32,ablaze We always try to bring the heavy. #meta...,0,ablaze try bring heavy metal rt yao e xngw,"[160, 198, 359, 530, 851, 101, 4521, 105, 4522]",9
33,ablaze #AFRICANBAZE: Breaking news:Nigeria fla...,1,ablaze africanbaze break newsnigeria flag set ...,"[160, 4523, 168, 4524, 418, 188, 160, 716, 4525]",9
34,ablaze Crying out for more! Set me ablaze,0,ablaze cry set ablaze,"[160, 773, 188, 160]",4
35,ablaze On plus side LOOK AT THE SKY LAST NIGHT...,0,ablaze plus look sky night ablaze qqsmshaj n,"[160, 1482, 115, 621, 301, 160, 4526, 86]",8


In [48]:
ds_train_pad = tf.keras.preprocessing.sequence.pad_sequences(ds_train.text_encoded, padding="post")

In [49]:
ds_train_slices = tf.data.Dataset.from_tensor_slices((ds_train_pad, ds_train.target.values ))


In [50]:
ds_train.shape[0]

5059

**Spliting and Batching train data**

In [51]:
# Train Test Split
TAKE_SIZE = int(0.7*ds_train.shape[0])

train_data = ds_train_slices.take(TAKE_SIZE).shuffle(TAKE_SIZE)
train_data = train_data.batch(64)

val_data = ds_train_slices.skip(TAKE_SIZE)
val_data = val_data.batch(64)

In [52]:
 # Regardons un batch 
for text, target in train_data.take(1):
  print(text, target)

tf.Tensor(
[[  13 1203  209  742 1217  427 6924 1934   13 6925 6926    0    0    0
     0    0    0    0    0    0    0    0    0]
 [ 229  409 2618 1038  462  157 2165  831  117 4124    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [  73 3579 1107  730  649  472   73 1660 2375 7375 7376    0    0    0
     0    0    0    0    0    0    0    0    0]
 [  28  662  334 3843   87 3844 3845  829  332  701 1192  458 9107 9108
     0    0    0    0    0    0    0    0    0]
 [  13 6917  185   13  646 1291  668 6918  764 6919 2490 6920    0    0
     0    0    0    0    0    0    0    0    0]
 [  24   15   17   64  234   24 1683   17 2091   27  234 1690 8592    0
     0    0    0    0    0    0    0    0    0]
 [  93 1591 1528  251    8 3958   93 1052   75  402    1  849 1112    0
     0    0    0    0    0    0    0    0    0]
 [ 129 1014 1111 1536 2151  399  761 1196  129   30   80 2780 3403 2028
     0    0    0    0    0    0    0    0    0]
 [ 108    1  108    1  779  3

In [53]:
print(text)

tf.Tensor(
[[  13 1203  209  742 1217  427 6924 1934   13 6925 6926    0    0    0
     0    0    0    0    0    0    0    0    0]
 [ 229  409 2618 1038  462  157 2165  831  117 4124    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [  73 3579 1107  730  649  472   73 1660 2375 7375 7376    0    0    0
     0    0    0    0    0    0    0    0    0]
 [  28  662  334 3843   87 3844 3845  829  332  701 1192  458 9107 9108
     0    0    0    0    0    0    0    0    0]
 [  13 6917  185   13  646 1291  668 6918  764 6919 2490 6920    0    0
     0    0    0    0    0    0    0    0    0]
 [  24   15   17   64  234   24 1683   17 2091   27  234 1690 8592    0
     0    0    0    0    0    0    0    0    0]
 [  93 1591 1528  251    8 3958   93 1052   75  402    1  849 1112    0
     0    0    0    0    0    0    0    0    0]
 [ 129 1014 1111 1536 2151  399  761 1196  129   30   80 2780 3403 2028
     0    0    0    0    0    0    0    0    0]
 [ 108    1  108    1  779  3

**Model definition and training**

In [54]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Embedding
embedding_dim=10 # the dimensionality of the representation space
from keras.models import Sequential
from keras.layers import Activation, SimpleRNN,Dense,LSTM,GRU

from tensorflow.keras import regularizers

model = keras.Sequential([
  # This layers encodes the string as sequences of int
  Embedding(10001, embedding_dim,input_shape=[text.shape[1],], name="embedding"), # the embedding layer
  #LSTM(units=8, return_sequences=False, return_state=False), # returns the last output
  tf.keras.layers.GlobalAveragePooling1D(),
  keras.layers.Dropout(0.2),
  Dense(4, kernel_regularizer=regularizers.l2(0.01)),    
  Dense(2, activation='softmax') # the prediction layer ou sigmoid
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 23, 10)            100010    
                                                                 
 global_average_pooling1d_1   (None, 10)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_1 (Dropout)         (None, 10)                0         
                                                                 
 dense_2 (Dense)             (None, 4)                 44        
                                                                 
 dense_3 (Dense)             (None, 2)                 10        
                                                                 
Total params: 100,064
Trainable params: 100,064
Non-trainable params: 0
________________________________________________

In [55]:
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy,SparseCategoricalCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy, SparseCategoricalAccuracy
optimizer= tf.keras.optimizers.Adam()

model.compile(optimizer=optimizer,
                loss=SparseCategoricalCrossentropy(),
                 metrics=SparseCategoricalAccuracy()
              )

In [56]:
val_data

<BatchDataset shapes: ((None, 23), (None,)), types: (tf.int32, tf.int64)>

In [57]:
model.fit(
    train_data,
    epochs=40,
    validation_data = val_data
    #callbacks=[model_checkpoint_callback]
    )

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7feac4aa33d0>

In [58]:
model.save("model_tweet.h5")
import json
json.dump(model.history.history, open("/content/model_tweet.json", 'w'))

In [59]:
tweet_history = json.load(open("/content/model_tweet.json", 'r'))

In [60]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=tweet_history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=tweet_history["val_loss"],
                    mode='lines',
                    name='val_loss'))
fig.show()

In [61]:
predictions=model.predict(val_data)

In [62]:
print(val_data)

<BatchDataset shapes: ((None, 23), (None,)), types: (tf.int32, tf.int64)>


In [63]:
tokenizer.sequences_to_texts([[ 110,  127, 1452, 2448,  963, 1691,  110,  127, 2662, 1878,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]])

['thunderstorm spill practice demon worth broad thunderstorm spill zy fireman']

**Compare prediction to real value of target**

In [64]:
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
detokenizer = Detok()
i=0
for text, target in val_data.take(len(val_data)):
  #print(text, target)
  
  #rint("val data {} ".format(detokenizer.detokenize(text[i])))
  print("val data {} ".format(tokenizer.sequences_to_texts([text[i].numpy()])))
  print("target {}".format(target[i]))
  #print("pred {}".format(predictions[i]))
  print("predict {} ".format(np.where(predictions[i][0] > 0.5, 0, 1)))
  print("#######fin########")
  i +=1

val data ['oil spill watch speak bp oil spill birthday tn ax'] 
target 1
predict 1 
#######fin########
val data ['pandemonium know bout pandemonium album'] 
target 0
predict 1 
#######fin########
val data ['police person block se th ave portland portland police pp pdx'] 
target 0
predict 1 
#######fin########
val data ['rainstorm rainstorm downtown um'] 
target 1
predict 1 
#######fin########
val data ['rescue coastal german shepherd rescue oc share link animalrescue httpstcois idc ubj'] 
target 0
predict 1 
#######fin########
val data ['riot drag queen color lead riot know lesbian usually credit incite'] 
target 0
predict 1 
#######fin########
val data ['ruin time good day ruin'] 
target 0
predict 1 
#######fin########
val data ['scream scream lung guy come sa promise'] 
target 0
predict 1 
#######fin########
val data ['seismic u control future subject exploration seismic maintenance electrical'] 
target 0
predict 1 
#######fin########
val data ['siren game attack siren gt character n

***As conclusion we see that the input data contains some fake disaster tweets which lead our model to predict false labels for some tweets***