# Imports

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re
from tqdm import tqdm

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout, Bidirectional
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Import data

In [18]:
# Load the training set; the path is relative to the notebook's working directory.
train = pd.read_csv('data/train.csv')

#### Training dataframe visualization

In [32]:
# Bare last expression: the notebook renders the dataframe (truncated for long frames).
train

Unnamed: 0,id,keyword,location,text,target,tokens,clean_tweet
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquake, allah, forgive]",deed reason earthquake allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, ronge, sask, canada]",forest fire ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,"[resident, asked, shelter, place, notified, of...",resident asked shelter place notified officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfire, evacuation, order,...",people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi...",got sent photo ruby alaska smoke wildfire pour...
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,"[giant, crane, holding, bridge, collapse, near...",giant crane holding bridge collapse nearby home
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,"[control, wild, fire, california, northern, pa...",control wild fire california northern part sta...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[utc, volcano, hawaii]",utc volcano hawaii
7611,10872,,,Police investigating after an e-bike collided ...,1,"[police, investigating, bike, collided, car, l...",police investigating bike collided car little ...


#### Tweet class distribution

In [33]:
# Count tweets per class without a Python-level loop.
# Label 0 counts as non-disaster; every other label counts as disaster,
# which matches the original if/else split. The vectorized comparison does not
# rely on the dataframe having a default 0..n-1 index.
is_class_0 = train.target == 0
class_0 = int(is_class_0.sum())
class_1 = int(train.shape[0] - class_0)

print('In the training set, there are '+str(class_0)+' tweets non-disaster related, and '+str(class_1)+' tweets disaster related.')

In the training set, there are 4342 tweets non-disaster related, and 3271 tweets disaster related.


# Data preprocessing

#### Cleaning functions

In [34]:
# Lemmatizer used to collapse inflected forms of a word onto one lemma.
lemmatizer = WordNetLemmatizer()

# Fetch the NLTK resources used by the cleaning code (quiet=True hides progress output).
nltk.download( 'stopwords', quiet=True )
nltk.download( 'wordnet', quiet=True )

# First letter of a POS tag -> WordNet part-of-speech constant.
tag_dict = dict(
    J=wordnet.ADJ,
    N=wordnet.NOUN,
    V=wordnet.VERB,
    R=wordnet.ADV,
)

# Get the WordNet part of speech for a word.
def get_wordnet_pos( word ):
    '''Tags a single word with nltk.pos_tag and maps the tag's first letter
    (upper-cased) through tag_dict; falls back to wordnet.NOUN when the letter
    is not a key of tag_dict.'''
    tagged = nltk.pos_tag( [word] )
    pos_label = tagged[0][1]           # tag of the only (word, tag) pair
    initial = pos_label[0].upper()
    return tag_dict.get( initial , wordnet.NOUN )

# Remove web links from a tweet.
def remove_links(tweet):
    '''Takes a string and removes web links from it.

    Removes, in order:
      * anything starting with "http" up to the next whitespace,
      * "bit.ly/..." short links up to the next whitespace,
      * every literal "[link]" placeholder.

    Surrounding whitespace is left untouched. Returns the cleaned string.
    '''
    tweet = re.sub(r'http\S+', '', tweet)      # remove http/https links
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bitly links (dot escaped so it only matches ".")
    # str.strip('[link]') would strip any of the characters "[", "l", "i", "n", "k", "]"
    # from both ends (eating real words); remove the literal placeholder instead.
    tweet = tweet.replace('[link]', '')
    return tweet

# Remove retweet and @user information from a tweet.
def remove_users(tweet):
    '''Takes a string and removes retweet and @user information.

    A handle is matched only when it starts with an ASCII letter and is at
    least two characters long (letters, digits, "-" and "_" may follow).
    Returns the string with "RT @handle" markers and "@handle" mentions removed.
    '''
    # Raw strings keep "\\s" a regex escape instead of an invalid Python string escape.

    # remove retweet
    tweet = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)

    # remove tweeted at
    tweet = re.sub(r'(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)
    return tweet

#### Define stopwords and punctuation

In [35]:
# Get the default English stopwords list from nltk.
# NOTE: this rebinds the name `stopwords` (imported above as the nltk corpus module) to a plain list.
stopwords = nltk.corpus.stopwords.words('english')

# Merge in the extra stopwords from data/stopwords.txt (split on newlines).
# The context manager closes the file even if reading fails.
with open('data/stopwords.txt') as f:
    my_stopwords = set(f.read().split('\n'))
stopwords = list(set(stopwords).union(my_stopwords))

# Get a list of punctuation to clean from tweet.
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@#'

#### Tokenization and lemmatization process

In [36]:
def tokenization_process(tweet, bigrams=False, remove_hashtag=False):
    '''Clean a raw tweet and return its list of lemmatized tokens.

    Steps, in order: remove @handles and links, lower-case, replace runs of
    punctuation (my_punctuation) with a space, collapse whitespace, delete
    digits, keep words longer than two characters, lemmatize, drop stopwords.

    Parameters:
        tweet: the raw tweet text (a string).
        bigrams: when True, append "w1_w2" tokens for each pair of adjacent
            remaining tokens.
        remove_hashtag: when True, delete any "#" left inside tokens. Since "#"
            is part of my_punctuation it is already stripped above, so this
            normally changes nothing.

    Returns:
        A list of token strings.
    '''
    tweet = remove_users(tweet)  # remove users (handles)
    tweet = remove_links(tweet)  # remove web links
    tweet = tweet.lower()        # convert tweet to lower case

    # Escape the punctuation before putting it in a character class so every
    # character (including the backslash and "[") is matched literally.
    tweet = re.sub('[' + re.escape(my_punctuation) + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet) # remove double spacing
    tweet = re.sub(r'([0-9]+)', '', tweet) # remove numbers

    # tokenize the tweet; the length filter also drops the empty strings that
    # removing numbers can leave between spaces.
    tweet_token_list = [word for word in tweet.strip().split(' ')
                            if len(word)>2 ]

    # lemmatize the words in the tweet.
    tweet_token_list = [lemmatizer.lemmatize(word)
        for word in tweet_token_list]

    # remove stop words from lemma list.
    tweet_token_list = [word for word in tweet_token_list
                       if word not in stopwords]

    # deal with bigrams if requested.
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]

    # remove hashtags; strings are immutable, so the replaced tokens must be kept.
    if remove_hashtag:
        tweet_token_list = [token.replace('#', '') for token in tweet_token_list]

    return tweet_token_list

def to_list(tweet_token_list):
    '''Join the tokens of a processed tweet into one string, separated by single blanks.

    Despite its name, this returns a string, not a list.
    '''
    separator = ' '
    return separator.join(tweet_token_list)

In [37]:
# Tokenize every tweet, then rebuild a cleaned, space-separated string from its tokens.
train['tokens'] = train['text'].apply(tokenization_process)
train['clean_tweet'] = train['tokens'].apply(to_list)

In [38]:
# Show the cleaned tweets (the notebook truncates the rendered Series).
train['clean_tweet']

0                    deed reason earthquake allah forgive
1                           forest fire ronge sask canada
2       resident asked shelter place notified officer ...
3       people receive wildfire evacuation order calif...
4       got sent photo ruby alaska smoke wildfire pour...
                              ...                        
7608      giant crane holding bridge collapse nearby home
7609    control wild fire california northern part sta...
7610                                   utc volcano hawaii
7611    police investigating bike collided car little ...
7612    latest home razed northern california wildfire...
Name: clean_tweet, Length: 7613, dtype: object

## Training

With the help of https://www.kaggle.com/shahules/basic-eda-cleaning-and-glove

In [72]:
# Labels as an integer column array (double-bracket selection keeps it 2-D: one column).
y_raw = train[['target']].to_numpy(dtype=int)

# One-hot version of the labels: row k gets a 1 in the column given by its label.
y = np.zeros((train.shape[0], 2))
row_idx = np.arange(y_raw.shape[0])
y[row_idx, y_raw[:, 0]] = 1

In [73]:
# Build a word -> vector lookup from the GloVe text file.
# Each line is split on whitespace: the first field is the word,
# the remaining fields are parsed as float32 components.
embedding_dict={}
# Explicit UTF-8 so decoding does not depend on the platform's default encoding;
# the `with` block closes the file, so no separate close() is needed.
with open('../glove.6B.100d.txt','r',encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word = values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors

#### Training

In [74]:
# The longest token list sets the padded sequence length.
MAX_LEN=max(map(len, train['tokens']))

# Fit the Keras tokenizer on the token lists and turn each tweet into a sequence of integer ids.
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(train['tokens'])
sequences=tokenizer_obj.texts_to_sequences(train['tokens'])

# Pad (and, if ever needed, truncate) at the end of each sequence up to MAX_LEN.
tweet_pad=pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [79]:
# One row per tokenizer index plus one extra row
# (presumably index 0 is reserved for padding — confirm against the Keras Tokenizer docs).
num_words=len(tokenizer_obj.word_index)+1
embedding_matrix=np.zeros((num_words,100))

# Copy the GloVe vector of every known word into its row; unknown words keep an all-zero row.
for word,i in tqdm(tokenizer_obj.word_index.items()):
    if i >= num_words:
        continue
    emb_vec=embedding_dict.get(word)
    if emb_vec is None:
        continue
    embedding_matrix[i]=emb_vec

In [76]:
# Frozen embedding layer initialised from the GloVe-based embedding_matrix.
embedding=Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)

# Embedding -> bidirectional LSTM -> single sigmoid unit for the binary target.
model=Sequential([
    embedding,
    Bidirectional(LSTM(25, dropout=0.2, recurrent_dropout=0.2)),
    Dense(1, activation='sigmoid'),
])

optimzer=Adam(learning_rate=1e-4)

model.compile(
    loss='binary_crossentropy',
    optimizer=optimzer,
    metrics=['accuracy'],
)

In [77]:
# Hold out 20% of the padded tweets for validation.
# A fixed random_state makes the split identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(tweet_pad, y_raw, test_size=0.2, random_state=42)

In [79]:
# Train for 20 epochs in mini-batches of 16, evaluating on the held-out split after each epoch.
history=model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=20,
    validation_data=(X_test,y_test),
)