# Natural Language Processing with Disaster Tweets
- Hola amigos, this is my Kaggle Notebook for the Kaggle competition Natural Language Processing with Disaster Tweets, which can be found [here](https://www.kaggle.com/c/nlp-getting-started/overview)
- In this notebook, I have primarily used **Google pre-trained Word2Vec** for transforming the text into embeddings and I have used a **single layer bi-directional LSTM neural network** for modelling purposes.
- If you like my work, do upvote it, and if you find a better result producing configuration, do let me know in the comments section. Would love to know about it!

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Installing & Importing Packages

In [None]:
import re
import gensim
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from scipy.sparse import vstack
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

from tensorflow import one_hot
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy
from tensorflow_addons.metrics import F1Score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, Input, Dropout, LSTM, SimpleRNN, Bidirectional

# Importing the Dataset

In [None]:
# Load the labelled training data and the sample submission file
# (df_sub is filled with predictions and written out at the end of the notebook).
df_train = pd.read_csv('../input/nlp-getting-started/train.csv')
df_sub = pd.read_csv("../input/nlp-getting-started/sample_submission.csv") 
# Print column dtypes and non-null counts of the training frame
df_train.info()

In [None]:
# Keep only the first occurrence of every (text, target) pair
df_train = df_train.drop_duplicates(subset=["text", "target"], keep="first")
# Number of training rows; used later to split the combined frame back into train / test
sep = len(df_train)
df_train.info()

In [None]:
# Detach the label column from the features in a single step
Y = df_train.pop("target")
print(df_train.shape, Y.shape)

In [None]:
# Load the unlabelled test data and print its column dtypes / non-null counts
df_test = pd.read_csv("../input/nlp-getting-started/test.csv")
df_test.info()

In [None]:
# Stack train on top of test (train rows first) and discard the location column
df = pd.concat([df_train, df_test], axis=0).drop(columns=["location"])
df.info()

In [None]:
# Preview the first rows of the combined (train + test) frame
df.head()

# Pre-processing the Dataset
- First, we perform the **decontraction** of all the contracted words like "won't", "can't", and many others.
- Then, we remove all the words that contain digits, replace every non-letter character with a space, lowercase the text, and remove all the stopwords (keeping "no", "nor" and "not").
- Finally, we save all the pre-processed sentences in a list (`pre_text`), which is used for featurization in the following sections.

In [None]:
# Decontraction
# Ordered (pattern, replacement) rules. The irregular forms must run before the
# generic suffix rules, otherwise "won't" would be rewritten by the "n't" rule.
# Matching is case-insensitive so that "Can't" / "WON'T" are expanded as well.
_CONTRACTION_RULES = (
    # Specific
    (re.compile(r"won't", re.IGNORECASE), "will not"),
    (re.compile(r"can't", re.IGNORECASE), "can not"),
    # General
    (re.compile(r"n't", re.IGNORECASE), " not"),
    (re.compile(r"'re", re.IGNORECASE), " are"),
    (re.compile(r"'s", re.IGNORECASE), " is"),
    (re.compile(r"'d", re.IGNORECASE), " would"),
    (re.compile(r"'ll", re.IGNORECASE), " will"),
    (re.compile(r"'t", re.IGNORECASE), " not"),
    (re.compile(r"'ve", re.IGNORECASE), " have"),
    (re.compile(r"'m", re.IGNORECASE), " am"),
)


def decontracted(phrase):
    """Expand English contractions in ``phrase`` and return the new string.

    "won't" and "can't" are replaced by "will not" and "can not"; the
    remaining suffixes (n't, 're, 's, 'd, 'll, 't, 've, 'm) are replaced by
    their expanded word preceded by a space. Note that every "'s" is expanded
    to " is", including possessives.

    Typographic apostrophes (U+2019) are normalised to "'" first so that
    contractions written with them are expanded too.
    """
    phrase = phrase.replace("\u2019", "'")
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase

In [None]:
# https://gist.github.com/sebleier/554280
# We are removing the words from the stop words list: 'no', 'nor', 'not' as they generally hold
# a lot of information regarding the meaning of the sentence.

# Words dropped during pre-processing. 'no', 'nor' and 'not' are intentionally
# absent so that negations survive.
stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
    "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

In [None]:
# Combining all the above steps
# Patterns are compiled once, outside the loop, and written as raw strings so
# the backslashes reach the regex engine unchanged.
# Remove words with numbers python: https://stackoverflow.com/a/18082370/4084039
_WORDS_WITH_DIGITS = re.compile(r"\S*\d\S*")
# Remove special characters: https://stackoverflow.com/a/5843547/4084039
_NON_LETTERS = re.compile(r"[^A-Za-z]+")

pre_text = []

# tqdm is for printing the status bar
for sen in tqdm(df['text'].values):
    # Decontraction
    sen = decontracted(sen)
    # Drop every whitespace-delimited token that contains a digit
    sen = _WORDS_WITH_DIGITS.sub("", sen)
    # Replace each run of non-letter characters with a single space
    sen = _NON_LETTERS.sub(" ", sen)
    # Lowercase each token once, then drop the stopwords
    tokens = (tok.lower() for tok in sen.split())
    # split()/join() already discard leading, trailing and repeated whitespace
    pre_text.append(' '.join(tok for tok in tokens if tok not in stopwords))

In [None]:
# Spot-check a few of the pre-processed sentences
for idx in (10, 20, 30):
    print(pre_text[idx])

# Featurizing the Dataset
- We will be using Word2Vec to convert all the words into vectors, and then we will feed all the word embeddings into a bi-directional sequence network (RNN/GRU/LSTM).
- We would be using Google pre-trained word embeddings, which can be found [here](https://code.google.com/archive/p/word2vec/)

## Word2Vec

In [None]:
# Load the pre-trained Google News word2vec vectors (binary format)
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    '../input/nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Length, in space-separated tokens, of the longest pre-processed sentence
max_len = max((len(sentence.lower().split(" ")) for sentence in pre_text), default=0)
print(max_len)

In [None]:
# Number of token positions kept per sentence (shorter sentences are
# zero-padded, tokens beyond Tx are ignored).
Tx = 24

# Start from an all-zero buffer so padding and out-of-vocabulary positions need
# no explicit write. float32 matches the precision of the word2vec vectors and
# halves the memory of the default float64 buffer.
fea_text = np.zeros((len(pre_text), Tx, 300), dtype=np.float32)

for i, text in enumerate(pre_text):
    for j, word in enumerate(text.split(" ")[:Tx]):
        try:
            fea_text[i, j] = w2v[word]
        except KeyError:
            # Word (or the empty string of an empty sentence) is not in the
            # word2vec vocabulary: keep the zero vector.
            continue

In [None]:
# Shape is (len(pre_text), Tx, 300):
# 10784 is the total number of data-points (Train + Test) recorded by the author for their run
# 24 is Tx, the fixed number of token positions per sentence (shorter sentences are zero-padded)
# 300 is the dimension of the embedding of a single word
fea_text.shape

# Preparing the dataset for Training

In [None]:
# The first `sep` rows of the combined data are the training rows, the rest are test rows
ids = df['id']
id_train, id_test = ids.iloc[:sep], ids.iloc[sep:]
print(ids.shape, id_train.shape, id_test.shape)

# From here on df_train / df_test hold the embedded feature arrays, not DataFrames
df_train, df_test = fea_text[:sep], fea_text[sep:]
print(df_train.shape, df_test.shape)

In [None]:
# Hold out 10% of the training rows for validation (fixed seed for reproducibility)
X_train, X_val, Y_train, Y_val = train_test_split(
    df_train,
    Y,
    test_size=0.1,
    random_state=7,
)
print(*(part.shape for part in (X_train, X_val, Y_train, Y_val)))

In [None]:
# One-hot encode the binary labels (two columns) for the softmax output layer
Y_train, Y_val = (one_hot(labels, depth=2) for labels in (Y_train, Y_val))
print(Y_train.shape, Y_val.shape)

# Training the Model
- While modelling the dataset, I tried out several approaches including **uni-directional RNN/GRU/LSTM** and **bi-directional RNN/GRU/LSTM**.
- Also, I tried out various possible structures of neural networks consisting of these layers, such as using a single layer of LSTM, 2 layers of RNN, 5 Dense layers, 3 layers of GRU, etc.
- Among all the configurations that I tried, the below configuration gave me the best results.

In [None]:
# Create the TensorFlow model and return it
def create_model(input_shape):
    """Build the (uncompiled) bi-directional LSTM classifier.

    input_shape: shape of one embedded sentence, i.e. (Tx, 300).

    Returns a Keras Model that maps the embedded sentence to a 2-way softmax.
    """
    inputs = Input(shape=input_shape)

    # Sentence encoder: forward and backward final LSTM states, concatenated
    encoded = Bidirectional(LSTM(64, return_sequences=False), merge_mode="concat")(inputs)

    # Fully connected head with dropout after the first dense layer
    hidden = Dense(64, activation='relu')(encoded)
    hidden = Dropout(0.2)(hidden)
    hidden = Dense(64, activation='relu')(hidden)
    hidden = Dense(32, activation='relu')(hidden)
    probs = Dense(2, activation='softmax')(hidden)

    return Model(inputs=inputs, outputs=probs)

In [None]:
# Instantiate the network for sentences of Tx tokens with 300-d embeddings
model = create_model(input_shape=(Tx, 300))
model.summary()

In [None]:
# Compiling the Model
opt = Adam(learning_rate=0.0001)
# NOTE: this rebinds the name `f1_score`, hiding sklearn.metrics.f1_score imported at the top of the notebook
f1_score = F1Score(num_classes=2)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy', f1_score],
)

In [None]:
# Training the Model
epochs, batch_size = 15, 16
model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
)

In [None]:
# Evaluate the trained model on the held-out validation split
y_pred_val = model.predict(X_val)

# Accuracy() counts exact element-wise matches, so it must be given class
# indices; comparing one-hot labels with softmax probabilities almost never matches.
y_true_labels = np.argmax(np.asarray(Y_val), axis=1)
y_pred_labels = np.argmax(y_pred_val, axis=1)
acc = Accuracy()
print("Accuracy = ", acc(y_true_labels, y_pred_labels))

# Use a fresh F1 metric: the instance compiled into the model is stateful and
# calling it here would add the validation batch on top of whatever it
# accumulated during fit().
val_f1 = F1Score(num_classes=2)
print("F1 Score = ", val_f1(Y_val, y_pred_val))

# Preparing the Submission

In [None]:
# One-hot labels for the complete training set (used to fit the submission model)
Y_oh = one_hot(Y, depth=2)
print(*(arr.shape for arr in (df_train, Y_oh)))

In [None]:
# Before making the submission, training the model on the entire training dataset
sub_model = create_model((Tx, 300,))

# Optimizers and metrics are stateful, so the second model gets its own
# instances, rebuilt from the configuration used for the first model, instead
# of sharing the objects already bound to `model`.
sub_opt = Adam.from_config(opt.get_config())
sub_f1 = F1Score.from_config(f1_score.get_config())

sub_model.compile(loss='categorical_crossentropy', optimizer=sub_opt, metrics=['accuracy', sub_f1])
sub_model.fit(df_train, Y_oh, epochs=epochs, batch_size=batch_size, shuffle=True)

In [None]:
# Predicted class index (0 or 1) for every test row: argmax over the two softmax outputs
y_pred = sub_model.predict(df_test).argmax(axis=1)

In [None]:
# Store the predicted labels in the submission frame.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv — confirm.
df_sub['target'] = y_pred
print(df_sub.shape)

In [None]:
# Write the submission file without the DataFrame index column
df_sub.to_csv('submission.csv', index=False)