# NLP Deep Learning Model with GRU

## Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
sns.set_style("whitegrid")
warnings.filterwarnings('ignore')

from string import digits
import regex as re
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob, Word
from collections import Counter

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, roc_auc_score

## Import Dataset

In [2]:
# Read the raw tweets from the relative input directory and preview the first rows.
DATA_PATH = "../input/disaster-tweets/tweets.csv"
df = pd.read_csv(DATA_PATH)
df.head()

## Statistical Check

In [3]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

Since `target` is a categorical variable stored as `int64`, we convert it to `str` so that it is summarised as a categorical (object) column in the statistics below.

In [4]:
# Store the label as strings so it is reported with the object columns below.
df["target"] = df["target"].astype(str)
df.info()

In [5]:
# Summary of the object (string) columns only.
df.describe(include = 'object')

In [6]:
# Summary of every column that is not of object dtype.
df.describe(exclude = 'object')

- The dataset contains 11,370 rows in total
- Most tweets are labelled as not being about a disaster
- The `id` column contains only unique values, so we drop it
- We also drop the `keyword` and `location` columns so that only the essential features remain

In [7]:
# Remove the identifier and the two metadata columns; `df` itself is left untouched.
unused_columns = ['id', 'keyword', 'location']
train = df.drop(columns=unused_columns)
train.info()

## Duplicated Text

In [8]:
# keep=False flags every row whose text occurs more than once (all copies, not just the extras).
duplicate_mask = train.duplicated(subset=['text'], keep=False)

print("Is there any duplicated text? ", duplicate_mask.any())
print("")
print("Total duplicated text = ", duplicate_mask.sum())

We can see that 292 rows in our dataset have a text that appears more than once. We keep a single copy of each text and remove the other duplicates to get a cleaner dataset.

In [9]:
# Keep the last occurrence of each text, then confirm that no duplicates remain.
train = train.drop_duplicates(subset=['text'], keep='last')
train.duplicated(subset=['text'], keep=False).any()

In [10]:
# Preview the last rows of the de-duplicated frame.
train.tail()

Now let's head into a quick EDA

# EDA

For this EDA, we use a word cloud to find the most frequent words.

In [11]:
from wordcloud import WordCloud

# Concatenate every tweet into one space-separated string for the word cloud.
texts = " ".join(train["text"])

In [12]:
# Word-cloud settings collected in one place so they are easy to tweak.
cloud_options = dict(
    collocations=False,
    background_color='white',
    max_font_size=70,
    max_words=250,
)
wordclouds = WordCloud(**cloud_options).generate(texts)

# Note: this changes the default figure size for every later figure as well.
plt.rcParams["figure.figsize"] = (20, 10)
plt.imshow(wordclouds)
plt.axis('off')
plt.show()

# Preprocessing

## Lowercasing and Text Cleaning

In [13]:
# English stop words held in a set; `clean` filters tokens against it.
stopword = set(stopwords.words('english'))
# Shared lemmatizer instance used by `lemmatizing`.
lemma = WordNetLemmatizer()
# Matches a letter (a word character that is neither a digit nor "_") followed by
# two or more repeats of itself, i.e. three or more identical letters in a row.
rx = re.compile(r'([^\W\d_])\1{2,}')

def pos_tagger(nltk_tag):
    """Map an NLTK POS tag to the corresponding WordNet part-of-speech constant.

    Parameters
    ----------
    nltk_tag : str
        Tag string such as those returned by ``nltk.pos_tag``.

    Returns
    -------
    The ``wordnet`` constant for adjectives, verbs, nouns or adverbs when the
    tag starts with ``J``, ``V``, ``N`` or ``R`` respectively; ``None`` for
    every other tag.
    """
    # (tag prefix, attribute on `wordnet`) pairs, checked in order.
    # The attribute is only looked up for the prefix that actually matches.
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None

def lemmatizing(text):
    """Lemmatize every token of ``text`` using its part-of-speech tag.

    The text is tokenized and POS-tagged with NLTK. Tokens whose tag maps to a
    WordNet category (see ``pos_tagger``) are lemmatized with that category;
    all other tokens are kept unchanged.

    Parameters
    ----------
    text : str
        Sentence to lemmatize.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for token, tag in tagged_tokens:
        wordnet_pos = pos_tagger(tag)
        # No WordNet category: leave the token as it is.
        lemmas.append(token if wordnet_pos is None else lemma.lemmatize(token, wordnet_pos))

    return ' '.join(lemmas)

def clean(text):
    """Normalise a raw tweet into a cleaned, lemmatized string.

    Steps, in order:
      1. lowercase the text;
      2. drop every whitespace-separated token containing ``'htt'``;
      3. delete all digits;
      4. replace every non-word character with a space;
      5. remove tokens found in ``stopword``;
      6. lemmatize with ``lemmatizing``;
      7. for letter runs containing a letter repeated three or more times,
         collapse the repetition to two letters and spell-correct the result
         with TextBlob's ``Word.correct``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    def _fix_elongated(match):
        # Only letter runs with a tripled (or longer) letter are rewritten.
        token = match.group()
        if rx.search(token):
            return Word(rx.sub(r'\1\1', token)).correct()
        return token

    lowered = text.lower()
    without_links = ' '.join([tok for tok in lowered.split() if 'htt' not in tok])
    without_digits = without_links.translate(str.maketrans('', '', digits))
    words_only = re.sub(r'[^\w]', ' ', without_digits)
    kept_words = [word for word in words_only.split() if word not in stopword]
    lemmatized = lemmatizing(' '.join(kept_words))
    return re.sub(r'[^\W\d_]+', _fix_elongated, lemmatized)

# Run the cleaning pipeline over every tweet and store the result next to the raw text.
train['clean'] = train['text'].apply(clean)
train.head()

## Counter

In [14]:
def counter_word(text_col):
    """Count how often each whitespace-separated word occurs in a text column.

    Parameters
    ----------
    text_col
        Object exposing a ``.values`` iterable of strings (e.g. a pandas Series).

    Returns
    -------
    collections.Counter
        Mapping of word to number of occurrences across all texts.
    """
    return Counter(word for text in text_col.values for word in text.split())

# Word frequencies over all cleaned tweets; the number of distinct words is the vocabulary size.
counter = counter_word(train['clean'])
num_unique_words = len(counter)
num_unique_words

## Tokenizer

In [15]:
# Collect labels and cleaned text into a fresh frame with a clean 0..n-1 index.
ds = {'label': train.target.values, 'text': train.clean.values}
dfs = pd.DataFrame(data=ds)

# Split dataset into training and validation set (80/20).
# A positional slice (first 80% / last 20%) is only representative if the rows
# are already in random order, which nothing above guarantees. Shuffle with a
# fixed seed and stratify on the label so both parts keep the same class balance
# and the split is reproducible.
train_df, val_df = train_test_split(
    dfs,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=dfs['label'],
)
# Kept for backward compatibility with code that reads the training-set size.
train_size = len(train_df)

# split text and labels (labels were cast to str earlier, so cast back to int)
train_sentences = train_df.text.to_numpy()
train_labels = train_df.label.to_numpy().astype(int)
val_sentences = val_df.text.to_numpy()
val_labels = val_df.label.to_numpy().astype(int)

In [16]:
# Vectorize the text corpus by turning each text into a sequence of integers.
# The vocabulary cap passed as num_words is the distinct-word count computed
# over all cleaned tweets (num_unique_words).
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit on the training sentences only, not on validation

In [17]:
# Word -> integer index mapping learned from the training sentences.
word_index = tokenizer.word_index

# Encode both splits with the same fitted tokenizer.
train_sequences, val_sequences = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (train_sentences, val_sentences)
)

In [18]:
# Bring every sequence to the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 20

# Identical settings for both splits: pad and truncate at the end of the sequence.
pad_options = dict(maxlen=max_length, padding="post", truncating="post")

train_padded = pad_sequences(train_sequences, **pad_options)
val_padded = pad_sequences(val_sequences, **pad_options)
train_padded.shape, val_padded.shape

## Model

In [29]:
# Start from a clean Keras state, then seed every random number generator in use.
# Seeding NumPy alone does not make training repeatable, because TensorFlow
# draws from its own generator; seed it as well.
SEED = 42

tf.keras.backend.clear_session()
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [30]:
from tensorflow.keras import layers

# Embedding -> bidirectional GRU (full sequence output) -> max over time -> dense classifier head.
model = keras.models.Sequential([
    layers.Embedding(num_unique_words, 300, input_length=max_length),
    layers.Bidirectional(layers.GRU(128, return_sequences=True)),
    GlobalMaxPooling1D(),
    layers.Dense(128, activation="relu"),
    layers.BatchNormalization(),
    layers.Dense(64, activation="relu"),
    # Single sigmoid unit: one output in (0, 1), matching the binary cross-entropy loss used at compile time.
    layers.Dense(1, activation="sigmoid"),
])

model.summary()

In [31]:
# Compile the model: Adam optimiser with a small fixed learning rate,
# binary cross-entropy on sigmoid outputs, accuracy as the reported metric.
lr = 3e-6

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    loss=keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=["accuracy"],
)

In [32]:
# Stop once the validation loss has not improved for 10 epochs and restore the
# weights from the best epoch seen.
callbacks = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# `fit` documents `callbacks` as a list of callback instances, so wrap the
# single callback rather than passing the bare object.
model_trained = model.fit(train_padded,
                          train_labels,
                          epochs=30,
                          callbacks=[callbacks],
                          validation_data=(val_padded, val_labels))

In [33]:
# Draw the four training curves recorded by `fit` on one figure.
plt.figure(figsize=(15, 10))

history = model_trained.history
curves = (
    ("accuracy", "train accuracy"),
    ("val_accuracy", "validation accuracy"),
    ("loss", "train loss"),
    ("val_loss", "validation loss"),
)
for key, label in curves:
    plt.plot(history[key], label=label)

plt.legend()

In [34]:
# Evaluate the trained model on the padded validation sequences and their labels.
model.evaluate(val_padded, val_labels)