## This NLP-based classification problem is solved using a simple bidirectional LSTM-based RNN.
## If you like my work, please consider upvoting this notebook.

In [None]:
import numpy as np
import pandas as pd
import os, re, unidecode, random, math
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import L1L2

In [None]:
_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        # Loading the corpus list on every call is wasteful; a frozenset also
        # gives constant-time membership checks in the token filter.
        _STOPWORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE


def data_cleaner(data):
    """Normalise one raw text string into space-separated lowercase tokens.

    Steps, in order: flatten escaped/real newlines, tabs and backslashes;
    strip HTML markup; remove ``http...`` URLs and `` <letters>.com``
    mentions; transliterate to ASCII; lowercase; replace every character
    that is not a letter or digit with a space; tokenise; drop English
    stop words.

    Args:
        data: Raw text as a ``str``.

    Returns:
        The cleaned text as one string with the kept tokens joined by a
        single space.
    """
    # '. com' is glued back together so the '.com' pattern below can match it.
    data = data.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')

    soup = BeautifulSoup(data, 'html.parser')
    data = soup.get_text(separator=' ')

    remove_http = re.sub(r'http\S+', '', data)
    data = re.sub(r"\ [A-Za-z]*\.com", " ", remove_http)

    data = unidecode.unidecode(data)
    data = data.lower()
    # NOTE: inside these character classes `$-,` is a range ('$' through ','),
    # not three literals. The first pass keeps letters, digits and that
    # punctuation set; the second pass then blanks the very same punctuation
    # set, so together they leave only letters, digits and spaces.
    data = re.sub(r"[^a-zA-Z0-9:$-,()%.?!]+", ' ', data)
    data = re.sub(r"[:$-,()%.?!]+", ' ',data)

    stoplist = _english_stopwords()
    data = [word for word in word_tokenize(data) if word not in stoplist]
    data = " ".join(data)

    return data

def seed_everything(SEED = 13):
    """Seed the NumPy, Python ``random`` and TensorFlow generators with SEED.

    Also sets the ``TF_CPP_MIN_LOG_LEVEL`` environment variable to ``'2'``
    and ``PYTHONHASHSEED`` to the seed value (as a string).
    """
    for apply_seed in (np.random.seed, random.seed, tf.random.set_seed):
        apply_seed(SEED)
    os.environ.update({
        'TF_CPP_MIN_LOG_LEVEL': '2',
        'PYTHONHASHSEED': str(SEED),
    })


# Seed once with the default value before any data or model work happens.
seed_everything()
print('seeded everything to get same output')

In [None]:
# Read train.csv into a DataFrame; the trailing bare `head()` is the cell's
# displayed output (first rows of the frame).
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
train_df.head()

In [None]:
# Remove, in place, the columns that are not used further on in this notebook.
for unused_column in ('id', 'keyword', 'location'):
    del train_df[unused_column]
train_df.head()

In [None]:
# Show the frame's (rows, columns) shape and the number of rows per target value.
print(train_df.shape)
print(train_df['target'].value_counts())

In [None]:
# Clean every training text with data_cleaner. Iterating the column's values
# directly (instead of looking up label `i` for i in range(n)) gives the same
# result on the default 0..n-1 index and keeps working if the index is ever
# filtered, shuffled or otherwise no longer a plain range.
train_df['text'] = [data_cleaner(raw_text) for raw_text in tqdm(train_df['text'])]
train_df.head()

In [None]:
def myModel(vocab_size=1000, embedding_dim=256, max_len=256):
    """Build the (uncompiled) stacked bidirectional-LSTM binary classifier.

    Layers, in order: Embedding, three Bidirectional LSTM layers with
    128, 64 and 32 units (each L2-regularised on its kernel and followed by
    Dropout), Dense(256, relu), Dense(16, relu) and a single sigmoid output
    unit. The model summary is printed as a side effect.

    Args:
        vocab_size: Input dimension of the embedding. It should agree with
            the ``num_words`` given to the tokenizer that produces the inputs.
        embedding_dim: Size of each embedding vector.
        max_len: Length of the padded input sequences. It should agree with
            the ``maxlen`` passed to ``pad_sequences``.

    Returns:
        The assembled ``Sequential`` model named ``'Bidirectional_RNN'``.
    """
    model = Sequential(name='Bidirectional_RNN')
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))

    # (LSTM units, return_sequences, dropout rate applied after the layer).
    # Only the last recurrent layer collapses the sequence to a single vector.
    recurrent_stack = (
        (128, True, 0.5),
        (64, True, 0.5),
        (32, False, 0.4),
    )
    for units, keep_sequence, drop_rate in recurrent_stack:
        model.add(Bidirectional(LSTM(
            units,
            return_sequences=keep_sequence,
            recurrent_dropout=0.0,
            kernel_regularizer=L1L2(l1=0.0, l2=0.01),
        )))
        model.add(Dropout(drop_rate))

    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.1))

    model.add(Dense(1, activation='sigmoid'))
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

# Build the network and configure it for binary classification.
model = myModel()
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    # The capitalised string 'Accuracy' resolves to keras.metrics.Accuracy,
    # which counts exact equality between the labels and the raw sigmoid
    # outputs and therefore reports a meaningless value. BinaryAccuracy
    # thresholds the probabilities (0.5 by default) before comparing. The
    # metric is still named 'Accuracy' so the history keys 'Accuracy' and
    # 'val_Accuracy' stay available.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='Accuracy')],
)

In [None]:
# Fit the vocabulary on the cleaned training text (num_words=1000), then turn
# each text into a sequence of word indices padded/truncated to length 256.
text = train_df['text']
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(text.values)
x_train = pad_sequences(
    tokenizer.texts_to_sequences(text.values),
    maxlen=256,
)
print('generated pad sequences')

In [None]:
# Binary labels for the training texts.
y_train = train_df['target']

# Multiply the learning rate by 0.25 when val_loss has not improved for 3 epochs.
lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.25,
    patience=3,
    verbose=0,
    mode='min'
)

# Keep only the model with the lowest val_loss on disk. The path is relative
# so that it is the same file that is later read back with
# load_model('./best_model.h5'), and so the notebook does not depend on the
# /kaggle/working directory existing.
chk_point = ModelCheckpoint(
    './best_model.h5',
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    mode='min'
)

# Stop once val_loss has not improved for 5 consecutive epochs.
es = EarlyStopping(
    patience=5,
    min_delta=0,
    monitor='val_loss',
    #restore_best_weights=True,
    verbose=0,
    mode='min',
    baseline=None
)

# Train with 20% of the data held out for validation; `history` is used by
# the plotting cells.
history = model.fit(
    x_train, y_train,
    validation_split=0.2,
    batch_size=64,
    epochs = 100,
    callbacks=[es,lr,chk_point],
    shuffle=True,
    verbose=1
)

In [None]:
# Reference: https://www.pluralsight.com/guides/data-visualization-deep-learning-model-using-matplotlib
# Training vs. validation accuracy, one point per epoch.
for metric_key in ('Accuracy', 'val_Accuracy'):
    plt.plot(history.history[metric_key])
plt.title('Model accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss, one point per epoch.
loss_curves = history.history
plt.plot(loss_curves['loss'])
plt.plot(loss_curves['val_loss'])
plt.title('Model loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Read test.csv and clean its text with the same data_cleaner used for training.
# The column's values are iterated directly rather than looked up by label `i`
# for i in range(n), so the result does not depend on the index being 0..n-1.
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
test_df['text'] = [data_cleaner(raw_text) for raw_text in tqdm(test_df['text'])]
test_df.head()

In [None]:
# Encode the test text with the tokenizer fitted on the training text and pad
# to the same length (256) the model was trained on.
text = test_df['text']
x_test = pad_sequences(
    tokenizer.texts_to_sequences(text.values),
    maxlen=256,
)
print('generated pad sequences')

In [None]:
# Reload the checkpointed model and score the test sequences.
model = load_model('./best_model.h5')
preds = model.predict(x_test)
# Threshold the sigmoid outputs at 0.5 (strictly greater -> class 1) and
# flatten to a plain list of ints, one per test row.
# Assumes preds has one value per row (single sigmoid unit) — TODO confirm.
pred = np.where(preds > 0.5, 1, 0).ravel().tolist()

In [None]:
# Two-column submission frame: the test ids and the predicted class for each.
submission = pd.DataFrame({
    'id': test_df['id'].to_list(),
    'target': pred,
})

In [None]:
# Write the predictions to submission.csv without the DataFrame index column,
# then display the first rows as the cell output.
submission.to_csv('submission.csv',index=False)
submission.head()