In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras as keras 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, 
                                     LSTM, 
                                     Dense, 
                                     Dropout,
                                     GRU) 
from tensorflow.keras.callbacks import EarlyStopping
# Record the TensorFlow version in the notebook output for reproducibility.
tf_version = tf.__version__
print('You are using TensorFlow version: ', tf_version)

# Load Datasets

In [None]:
# Location of the competition CSV files.
dirs = '/kaggle/input/nlp-getting-started/'

# Read both splits with identical parser settings.
# NOTE(review): files are decoded as ISO-8859-1 — confirm this matches the
# dataset's actual encoding.
df_train, df_test = (
    pd.read_csv(dirs + csv_name, sep=',', encoding='ISO-8859-1')
    for csv_name in ('train.csv', 'test.csv')
)

# Show full cell contents instead of truncating long text columns.
pd.set_option('display.max_colwidth', None)
df_train.head(10)

In [None]:
# For each split, print its shape and the number of rows that contain at
# least one null value.
for header, frame in (("======= TRAIN DATA =======", df_train),
                      ("======= TEST DATA =======", df_test)):
    print(header)
    print("shape : ", frame.shape)
    print("null  : ", frame.isnull().any(axis=1).sum())

In [None]:
# Hand-maintained English stopword list (lower-case, apostrophes already
# stripped, e.g. "youre", "shouldve").
stopwordlist = [
    'a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an', 'and',
    'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before', 'being',
    'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do', 'does',
    'doing', 'down', 'during', 'each', 'few', 'for', 'from', 'further',
    'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself',
    'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it',
    'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'more', 'most', 'my',
    'myself', 'now', 'o', 'of', 'on', 'once', 'only', 'or', 'other', 'our',
    'ours', 'ourselves', 'out', 'own', 're', 's', 'same', 'she', 'shes',
    'should', 'shouldve', 'so', 'some', 'such', 't', 'than', 'that',
    'thatll', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up', 've', 'very', 'was', 'we', 'were', 'what',
    'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
    'won', 'y', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours',
    'yourself', 'yourselves',
]

# Set form of the list for O(1) membership checks.
STOPWORDS = {*stopwordlist}

In [None]:
## Utility functions for text cleaning

def remove_stopwords(text):
    """Return `text` with every whitespace-separated word found in STOPWORDS removed.

    The input is coerced with `str()` first; the surviving words are re-joined
    with single spaces. Matching is exact (case-sensitive) against STOPWORDS.
    """
    kept_words = filter(lambda token: token not in STOPWORDS, str(text).split())
    return " ".join(kept_words)

def remove_urls(text):
    """Strip URLs and @mentions from a tweet and normalise its case.

    Steps:
      1. Delete every ``http://`` / ``https://`` URL. Only the URL token
         itself (up to the next whitespace) is removed, so any words that
         follow a link are preserved.
      2. Drop every whitespace-separated token that starts with ``@``.
      3. Casefold and strip the result.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, casefolded tweet with single spaces between tokens.
    """
    # `\S+` stops at the first whitespace, unlike `.*`, which would swallow
    # the remainder of the line after the URL.
    without_urls = re.sub(r'https?://\S+', '', text)
    without_mentions = ' '.join(
        token for token in without_urls.split() if not token.startswith('@')
    )
    return without_mentions.casefold().strip()

def remove_special_chrs(text):
    """Remove every character that is not an ASCII letter or whitespace.

    Digits, punctuation and symbols (including ``#``) are deleted; both
    lower-case and upper-case ASCII letters are kept.

    Args:
        text: The input ``str``.

    Returns:
        The filtered text with leading/trailing whitespace stripped.
    """
    # The class must be `a-zA-Z`; `a-zA` would also delete B-Z.
    return re.sub(r"[^a-zA-Z\s]", '', text).strip()

In [None]:
# Clean the tweet text of both splits with the same function.
# remove_urls returns a str for every row, so no NaN handling is needed
# after apply().
df_train['text'] = df_train['text'].apply(remove_urls)
df_test['text'] = df_test['text'].apply(remove_urls)
df_train.sample(10)

In [None]:
## Order both frames by ascending id
X = df_train.sort_values('id')
Xtest = df_test.sort_values('id')

## Extract the tweet text of each split as a 1-D array (same order as X / Xtest)
tweets = X['text'].to_numpy(copy=True)
tweets_test = Xtest['text'].to_numpy(copy=True)

In [None]:
# Peek at the first ten cleaned training tweets.
first_ten_tweets = tweets[:10]
print(first_ten_tweets)

# Applying NLP Techniques

In [None]:
# Tokenizer shared by every later cell (fitting, text-to-sequence, vocab size).
u_token = Tokenizer()


def preprocess(text):
    """Fit the shared `u_token` tokenizer on `text` and return that call's result.

    NOTE(review): this returns whatever `fit_on_texts` returns, which looks
    like it is always None — confirm whether callers need a return value.
    """
    return u_token.fit_on_texts(text)

In [None]:
## Fit the tokenizer on the training tweets only, then show preprocess()'s return value
tokens = preprocess(text=tweets)
print(tokens)

In [None]:
## Text to Sequences: encode both splits with the tokenizer fitted above
X_train, X_test = (
    u_token.texts_to_sequences(corpus) for corpus in (tweets, tweets_test)
)
print(X_train[0])

In [None]:
## Pad Sequences
# Pad train and test to the SAME width (the longest training sequence) so
# the model sees test inputs laid out exactly like its training inputs.
# Longer test sequences are truncated to that width.
max_len = max(len(seq) for seq in X_train)
print("Array Sequence: ", X_train[0])
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_len)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)
print("Pad Sequence:\n",X_train[0])

In [None]:
## Size of the vocabulary = number of distinct words the tokenizer indexed
word_to_index = u_token.word_index
vocab_size = len(word_to_index)
print('Size of vocabulary:', vocab_size)

In [None]:
# Target training variable.
# Read the labels from X (the id-sorted frame that `tweets` was extracted
# from) so that label i always belongs to tweet i, even when df_train is
# not already in id order.
y_train = X['target'].values
print(y_train[0])

# Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded training data for validation (fixed seed).
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
x_train, x_val, y, y_val = split_parts

print("Training data shape: ", x_train.shape)
print("Validation data shape: ", x_val.shape)

In [None]:
# Two stacked GRU layers over a learned embedding, ending in one sigmoid unit.
model = keras.Sequential([
    Embedding(input_dim = vocab_size+2, output_dim = 128, name='Embedding'),
    # First GRU returns the full sequence so the second GRU can consume it.
    GRU(128, return_sequences=True),
    GRU(128),
    # Sigmoid output: the model is compiled with 'binary_crossentropy', which
    # expects probabilities, and predictions are thresholded at 0.5.
    Dense(1, activation='sigmoid')
])

In [None]:
# Binary classification setup: Adam optimizer, BCE loss, accuracy tracked
# under the name 'binary_accuracy'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
model.summary()

# Training the Model

In [None]:
# Stop when validation accuracy has not improved by at least 0.001 for 5
# epochs, then roll back to the best weights seen.
# The model is compiled with metrics=['binary_accuracy'], so the validation
# metric is logged as 'val_binary_accuracy'.
early_stopping = EarlyStopping(
    monitor = 'val_binary_accuracy',
    mode = 'max',
    min_delta = .001,
    patience = 5,
    restore_best_weights = True,
)

history = model.fit(
    x_train, y,
    validation_data=(x_val, y_val),
    batch_size=120,
    epochs=10,
    callbacks=[early_stopping],
)

In [None]:
# NOTE: despite the variable name, this is the score on the validation split.
train_score = model.evaluate(x=x_val, y=y_val)

In [None]:
# Plot history: Binary Cross-entropy per epoch, training vs. validation
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='BCE (training data)')
ax.plot(history.history['val_loss'], label='BCE (validation data)')
ax.set_title('BCE for disaster tweets')
ax.set_ylabel('BCE value')
ax.set_xlabel('Epochs')
ax.legend(loc="upper left")
# Render the figure explicitly so the Legend repr is not emitted as cell output.
plt.show()

In [None]:
# Plot history: binary accuracy per epoch, training vs. validation
fig, ax = plt.subplots()
ax.plot(history.history['binary_accuracy'], label='acc (training data)')
ax.plot(history.history['val_binary_accuracy'], label='acc (validation data)')
ax.set_title('Accuracy for disaster tweets')
ax.set_ylabel('Accuracy value')
ax.set_xlabel('Epochs')
ax.legend(loc="upper left")
plt.show()

# Metric Validation

In [None]:
from sklearn.metrics import f1_score

# Threshold the model outputs at 0.5 to obtain hard 0/1 labels, then score
# them against the held-out validation labels.
val_outputs = model.predict(x_val)
y_pred = (val_outputs > 0.5).astype('int32').flatten()
y_true = y_val
f1_score(y_true, y_pred)

In [None]:
# Hard 0/1 predictions for the test set (same 0.5 threshold as validation).
test_outputs = model.predict(X_test)
preds = (test_outputs > 0.5).astype('int32').flatten()

In [None]:
# Pair each cleaned test tweet with its predicted label for a quick visual check.
results = dict(text=tweets_test, target=preds)
df_results = pd.DataFrame(data=results)
df_results.sample(20)

In [None]:
# Build the submission by joining predictions on `id`.
# `preds` is in the row order of Xtest (the id-sorted test frame), so pair it
# with Xtest's ids, then left-join onto df_test's ids to keep df_test's order.
# `text` is not a safe join key: tweets that are identical after cleaning
# would multiply rows in the join.
predictions_by_id = pd.DataFrame({'id': Xtest['id'].to_numpy(), 'target': preds})
submission = df_test[['id']].merge(predictions_by_id, on='id', how='left')

# Exactly one prediction per test row is expected.
assert len(submission) == len(df_test), "submission row count does not match test set"
submission.to_csv('submission_2.csv', index=False)