### Importing the required NLP, Keras, and ML libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
import re
import spacy

In [48]:
# importing tf-keras libraries for using word-embeddings
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

In [15]:
# Load the spaCy pipeline `en_core_web_sm` once; the preprocessing helpers below
# use it for stop-word flags, part-of-speech tags, lemmas and tokenisation.
nlp = spacy.load("en_core_web_sm")

### Loading the dataset from local machine

In [3]:
# Folder holding the Kaggle CSV files. Keeping it in one constant means only this
# line has to change when the notebook is run on another machine.
DATA_DIR = r'C:\Users\DELL\Desktop\Kaggle Playground\Disaster prediction using Tweets'

# Labelled tweets (with `target`) and the unlabelled tweets to predict.
train = pd.read_csv(DATA_DIR + r'\train.csv')
test = pd.read_csv(DATA_DIR + r'\test.csv')

In [4]:
# (rows, columns) of the training frame
train.shape

(7613, 5)

In [5]:
# Peek at the first three labelled tweets
train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [6]:
# Count missing values per column
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

### Preprocessing the data 
#### 1. Removing some columns.
#### 2. Removing all links, stopwords, punctuation, and extra spaces present in the data.
#### 3. Lower-casing the text with .lower() so that all words share the same case.
#### 4. Creating word corpus from data, i.e., converting strings into list of words for feeding into Neural Network

In [16]:
# `location` is missing for about a third of the training rows and is not used as a feature.
# Rebinding (rather than inplace=True) with errors='ignore' keeps this cell re-runnable:
# executing it a second time is a no-op instead of a KeyError.
train = train.drop(columns='location', errors='ignore')
test = test.drop(columns='location', errors='ignore')

In [17]:
# ACCEPTS STRING => RETURNS STRING
def link_hashtag_remover(text):
  """Clean a raw tweet down to plain alphanumeric words.

  Each of the following is replaced by a single space:
    * @mentions made of ASCII letters/digits,
    * URLs of the form ``scheme://...`` (up to the next whitespace),
    * any character that is not an ASCII letter, digit, space or tab
      (so the '#' of a hashtag is dropped while the tag word is kept).
  Runs of whitespace are then collapsed and the ends are trimmed.

  Parameters
  ----------
  text : str
      The raw tweet.

  Returns
  -------
  str
      The cleaned tweet, words separated by single spaces.
  """
  # reference : https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
  # Raw string so that \w, \/ and \S reach the regex engine untouched instead of
  # being parsed as (invalid) Python string escapes.
  pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"

  # re.sub() swaps every match for a space; split()/join() normalises whitespace.
  preprocessed = ' '.join(re.sub(pattern, " ", text).split())
  return preprocessed


# ACCEPTS STRING => RETURNS LIST
def stopwords_punct_remover(text):
  """Tokenise `text` with spaCy and keep only the informative tokens.

  A token is discarded when spaCy flags it as a stop word, or when its
  part-of-speech tag is "PUNCT" or "SPACE".

  Returns the text of the surviving tokens as a list, in their original order.
  """
  skipped_pos = ["PUNCT", "SPACE"]
  return [
    tok.text
    for tok in nlp(text)
    if not tok.is_stop and tok.pos_ not in skipped_pos
  ]

# ACCEPTS LIST => RETURNS STRING
def lemmatizer(text):
  """Lemmatise a list of tokens and return them as one string.

  The tokens are joined with spaces, parsed again with spaCy, and every
  resulting token is replaced by its lemma. Each lemma is followed by a
  single space, so a non-empty result ends with a trailing space.
  """
  parsed = nlp(" ".join(text))
  return "".join(f"{tok.lemma_} " for tok in parsed)

In [18]:
def pipeline(text):
  """Run the full tweet clean-up: strip links/mentions/symbols, drop stop words
  and punctuation, then lemmatise. Takes a raw string and returns a string."""
  cleaned = link_hashtag_remover(text)
  kept_tokens = stopwords_punct_remover(cleaned)
  return lemmatizer(kept_tokens)

In [37]:
def create_corpus(df):
    """Turn the preprocessed tweets in df['input'] into a list of token lists.

    Every tweet is tokenised with spaCy and each token is lower-cased, giving
    one list of words per row of `df`.
    """
    return [
        [tok.text.lower() for tok in nlp(tweet)]
        for tweet in df['input']
    ]

In [28]:
# Clean every training tweet; the result is stored alongside the raw text.
train['input'] = train['text'].map(pipeline)

In [38]:
# Inspect the cleaned tweets
train['input']

0                   deed Reason earthquake ALLAH Forgive 
1                  forest fire near La Ronge Sask Canada 
2       resident ask shelter place notify officer evac...
3       13 000 people receive wildfire evacuation orde...
4       got send photo Ruby Alaska smoke wildfire pour...
                              ...                        
7608        giant crane hold bridge collapse nearby home 
7609    ahrary control wild fire California northern s...
7610               M1 94 01 04 utc 5 km S Volcano Hawaii 
7611    Police investigate e bike collide car Little P...
7612    late Homes Razed Northern California Wildfire ...
Name: input, Length: 7613, dtype: object

In [39]:
# One lower-cased token list per training tweet
corpus = create_corpus(train)

In [41]:
# Sanity check: should equal the number of training rows
len(corpus)

7613

### Word Embeddings using GloVe
#### 1. Mapping each word in the GloVe vocabulary to its 50-dimensional word vector.
#### 2. Post-padding: appending 0's to the end of each sentence so that all sentences have equal length.
#### 3. Create embedding and convert the text corpus into numerical data and ignoring the words which are not present in the dictionary.

In [24]:
# Read the GloVe file line by line: the first field of each line is the word,
# the remaining fields are the components of its vector.
embeddings_dict = {}
with open(r"C:\Users\DELL\Desktop\glove.6B\glove.6B.50d.txt", 'r', encoding="utf8") as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

In [49]:
# Every tweet is cut or zero-padded (both at the end) to this many word ids.
MAX_LEN = 50

# Build the vocabulary from the training corpus and encode each tweet as word ids.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

tweet_pad = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [50]:
# word_index maps every unique word seen by the tokenizer to an integer id.
# The ids are not in alphabetical order, and they start at 1, not 0
# (0 is the value used for padding in tweet_pad).
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 13182


In [54]:
# Word ids start at 1, so the matrix needs one extra row: row 0 stays all-zero
# and lines up with the padding value 0 in the padded sequences.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 50))

for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, so any id >= num_words must be skipped
    # (a `>` comparison would let i == num_words through to an IndexError).
    if i >= num_words:
        continue

    emb_vec = embeddings_dict.get(word)
    # Words without a GloVe vector keep their all-zero row.
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

In [56]:
embedding_matrix.shape # one 50-dimensional row per word id, plus row 0 for padding (len(word_index) + 1 rows)

(13183, 50)

### Building the neural network for tweet classification: an embedding layer, a dropout layer, an LSTM layer, and a dense sigmoid output layer

In [57]:
# Embedding layer initialised from the GloVe matrix; trainable=False keeps
# these weights fixed during training.
embedding = Embedding(
    num_words,
    50,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)

# Embedding -> dropout -> LSTM -> one sigmoid unit for the binary target.
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

optimzer = Adam(learning_rate=1e-5)

model.compile(
    loss='binary_crossentropy',
    optimizer=optimzer,
    metrics=['accuracy'],
)

In [58]:
# Layer-by-layer overview with parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            659150    
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 50)           0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 64)                29440     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 688,655
Trainable params: 29,505
Non-trainable params: 659,150
_________________________________________________________________


In [59]:
# Inspect the padded id sequences (trailing zeros are padding)
tweet_pad

array([[3767,  447,  197, ...,    0,    0,    0],
       [ 132,    5,  171, ...,    0,    0,    0],
       [1592,  485, 1593, ...,    0,    0,    0],
       ...,
       [2332, 2006,  544, ...,    0,    0,    0],
       [  26,  783,  425, ...,    0,    0,    0],
       [ 141,  742,  795, ...,    0,    0,    0]])

In [60]:
# Cleaned tweets again, for comparison with the id sequences above
train['input']

0                   deed Reason earthquake ALLAH Forgive 
1                  forest fire near La Ronge Sask Canada 
2       resident ask shelter place notify officer evac...
3       13 000 people receive wildfire evacuation orde...
4       got send photo Ruby Alaska smoke wildfire pour...
                              ...                        
7608        giant crane hold bridge collapse nearby home 
7609    ahrary control wild fire California northern s...
7610               M1 94 01 04 utc 5 km S Volcano Hawaii 
7611    Police investigate e bike collide car Little P...
7612    late Homes Razed Northern California Wildfire ...
Name: input, Length: 7613, dtype: object

In [67]:
# Spot-check the encoding: these two ids should match the first two entries
# of tweet_pad[0]. Keys are lower-case because create_corpus lower-cases tokens.
print(word_index['deed'])
print(word_index['reason'])

3767
447


### Splitting the labelled data into training and validation sets using the train_test_split function from the sklearn library

In [69]:
# NOTE(review): tweet_pad was built from the training corpus only, so the first
# slice selects every row and `test_data` comes out empty. The real test tweets
# are encoded separately into `test_tweet_pad` further down.
train_data = tweet_pad[:train.shape[0]]
test_data = tweet_pad[train.shape[0]:]

In [70]:
# Hold out 15% of the labelled tweets for validation. A fixed random_state makes
# the split (and therefore the reported validation metrics) reproducible across runs.
# Note: x_test / y_test are the validation split, not the Kaggle test set.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    train['target'].values,
    test_size=0.15,
    random_state=42,
)
print('Shape of train',x_train.shape)
print("Shape of Validation ",x_test.shape)

Shape of train (6471, 50)
Shape of Validation  (1142, 50)


### Training the model

In [71]:
# Fit for 15 epochs in mini-batches of 4, evaluating on the held-out split
# after every epoch; verbose=2 prints one summary line per epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=4,
    epochs=15,
    validation_data=(x_test, y_test),
    verbose=2,
)

Epoch 1/15
1618/1618 - 29s - loss: 0.6911 - accuracy: 0.5693 - val_loss: 0.6882 - val_accuracy: 0.5709 - 29s/epoch - 18ms/step
Epoch 2/15
1618/1618 - 27s - loss: 0.6397 - accuracy: 0.6268 - val_loss: 0.5504 - val_accuracy: 0.7443 - 27s/epoch - 17ms/step
Epoch 3/15
1618/1618 - 27s - loss: 0.5430 - accuracy: 0.7401 - val_loss: 0.5297 - val_accuracy: 0.7574 - 27s/epoch - 17ms/step
Epoch 4/15
1618/1618 - 27s - loss: 0.5255 - accuracy: 0.7549 - val_loss: 0.5176 - val_accuracy: 0.7609 - 27s/epoch - 17ms/step
Epoch 5/15
1618/1618 - 27s - loss: 0.5179 - accuracy: 0.7585 - val_loss: 0.5105 - val_accuracy: 0.7671 - 27s/epoch - 17ms/step
Epoch 6/15
1618/1618 - 27s - loss: 0.5116 - accuracy: 0.7651 - val_loss: 0.5049 - val_accuracy: 0.7715 - 27s/epoch - 17ms/step
Epoch 7/15
1618/1618 - 27s - loss: 0.5099 - accuracy: 0.7631 - val_loss: 0.5010 - val_accuracy: 0.7776 - 27s/epoch - 17ms/step
Epoch 8/15
1618/1618 - 27s - loss: 0.5000 - accuracy: 0.7747 - val_loss: 0.4994 - val_accuracy: 0.7767 - 27s/ep

### Making the predictions

In [73]:
# Kaggle's submission template; its `id` column is reused when writing the predictions.
sample_sub=pd.read_csv(r'C:\Users\DELL\Desktop\Kaggle Playground\Disaster prediction using Tweets\sample_submission.csv')

In [76]:
# Inspect the raw test frame (`location` has already been dropped)
test

Unnamed: 0,id,keyword,text
0,0,,Just happened a terrible car crash
1,2,,"Heard about #earthquake is different cities, s..."
2,3,,"there is a forest fire at spot pond, geese are..."
3,9,,Apocalypse lighting. #Spokane #wildfires
4,11,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...
3258,10861,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,Storm in RI worse than last hurricane. My city...
3260,10868,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,MEG issues Hazardous Weather Outlook (HWO) htt...


### Preprocessing the test-data, as we need to convert it into model-readable format.

In [77]:
# Apply the same clean-up used for the training tweets to the test tweets.
test['input'] = test['text'].map(pipeline)

In [78]:
# Test frame with the new cleaned `input` column
test

Unnamed: 0,id,keyword,text,input
0,0,,Just happened a terrible car crash,happen terrible car crash
1,2,,"Heard about #earthquake is different cities, s...",hear earthquake different city stay safe
2,3,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese flee street save
3,9,,Apocalypse lighting. #Spokane #wildfires,apocalypse light Spokane wildfire
4,11,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kill 28 China Taiwan
...,...,...,...,...
3258,10861,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,earthquake safety LOS ANGELES SAFETY fastener ...
3259,10865,,Storm in RI worse than last hurricane. My city...,storm ri bad hurricane city amp 3others hard h...
3260,10868,,Green Line derailment in Chicago http://t.co/U...,Green Line derailment Chicago
3261,10874,,MEG issues Hazardous Weather Outlook (HWO) htt...,meg issue Hazardous Weather Outlook HWO


In [79]:
# One lower-cased token list per test tweet
test_corpus = create_corpus(test)

In [86]:
# Encode the test tweets with the tokenizer that was fitted on the training
# corpus, then pad / truncate them exactly like the training sequences.
test_sequences = tokenizer_obj.texts_to_sequences(test_corpus)
test_tweet_pad = pad_sequences(
    test_sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [87]:
# (number of test tweets, MAX_LEN)
test_tweet_pad.shape

(3263, 50)

In [89]:
# Inspect the padded id sequences of the test tweets
test_tweet_pad

array([[ 200, 1621,   49, ...,    0,    0,    0],
       [ 137,  197,  985, ...,    0,    0,    0],
       [ 132,    5,  582, ...,    0,    0,    0],
       ...,
       [ 724,  480,  289, ...,    0,    0,    0],
       [5127,  187,  340, ...,    0,    0,    0],
       [1314, 1315,   19, ...,    0,    0,    0]])

In [91]:
# Spot-check: the id of 'car' should appear as the third entry of test_tweet_pad[0]
word_index['car']

49

### Final prediction

In [92]:
# Sigmoid outputs of the trained model, one value per test tweet
y_pre = model.predict(test_tweet_pad)



In [93]:
# Check what kind of object predict() returned
type(y_pre)

numpy.ndarray

In [95]:
# One row per test tweet, one column for the single output unit
y_pre.shape

(3263, 1)

In [97]:
# First five raw predictions (values between 0 and 1)
y_pre[0:5]

array([[0.8249626 ],
       [0.8091696 ],
       [0.86521316],
       [0.8212333 ],
       [0.8981525 ]], dtype=float32)

### Saving the submission file, which can be uploaded directly to Kaggle.

In [100]:
# Round the sigmoid outputs to 0/1 labels and flatten (n, 1) -> (n,).
# reshape(-1) infers the length from the data, so the cell keeps working if the
# number of test rows changes.
y_pre = np.round(y_pre).astype(int).reshape(-1)

# Pair each label with the id from the submission template and write the CSV
# without the DataFrame index.
sub = pd.DataFrame({'id': sample_sub['id'].values.tolist(), 'target': y_pre})
sub.to_csv('submission_word_embeddings.csv', index=False)

In [99]:
# Inspect y_pre (integer class labels once the rounding cell above has run)
y_pre

array([1, 1, 1, ..., 1, 0, 1])