# Import libs

In [None]:
import numpy as np
import pandas as pd
import random
import nltk
import string
import re
import emoji
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [None]:
SEED = 40


def set_seed(seed=SEED):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Value handed to each generator's seeding function; defaults
            to the module-level SEED.
    """
    # Same order as before: Python's random, then NumPy, then TensorFlow.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


set_seed()

# Loading files

In [None]:
# Read the three CSV files of the data set from the relative ../input folder
# (presumably a Kaggle input directory - adjust the paths when running elsewhere).
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
# Submission template: its 'target' column is overwritten with predictions
# at the end of the notebook.
sample = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

# Show the first three training rows as the cell output.
train_df.head(3)

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train_df.info()

# Preprocessing

In [None]:
# Remove training rows whose 'text' value duplicates another row.
# inplace=True mutates train_df directly instead of returning a new frame.
train_df.drop_duplicates('text', inplace=True)

In [None]:
# Lazily filled cache so the stop-word corpus is read at most once.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def _is_emoji_char(char):
    """Tell whether a single character is an emoji, across emoji package versions."""
    if hasattr(emoji, 'is_emoji'):
        return emoji.is_emoji(char)
    # Older emoji releases only expose the per-language lookup table.
    return char in emoji.UNICODE_EMOJI['en']


def preprocess_text(text):
    """Clean a raw tweet and return it as a space-separated string of tokens.

    Steps, in order: strip t.co links, turn newlines into spaces, remove
    digits, expand '&amp;' to 'and', collapse repeated '!', '?' and '.',
    drop whitespace-separated words containing an emoji, tokenize with NLTK,
    lowercase, and discard punctuation tokens and English stop words.

    Args:
        text: The tweet text as a string.

    Returns:
        The cleaned text: remaining lowercase tokens joined by single spaces.
    """
    text = re.sub(r'https?://t\.co/[A-Za-z0-9]+', '', text)
    # Replace newlines with a space so words on adjacent lines are not glued
    # together; surplus whitespace is normalised by the split/join below.
    text = re.sub(r'\n', ' ', text)
    # All digits are removed here, so no later "word containing a digit"
    # pattern can match any more (the former r'\w*\d\w*' pass was a no-op).
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'&amp;?', 'and', text)
    text = re.sub(r'[!]+', '!', text)
    text = re.sub(r'[?]+', '?', text)
    text = re.sub(r'[.]+', '.', text)

    # Delete every whitespace-separated word that contains an emoji character.
    words = [
        word for word in text.split()
        if not any(_is_emoji_char(char) for char in word)
    ]
    text = ' '.join(words)

    tokens = nltk.word_tokenize(text)
    # Substring test against string.punctuation, kept exactly as before.
    tokens = [token.lower() for token in tokens if token not in string.punctuation]
    stop_words = _english_stopwords()
    tokens = [token for token in tokens if token not in stop_words]

    # Optional heavier normalisation, left disabled:
    #     tokens = [PorterStemmer().stem(word) for word in tokens]
    #     tokens = [WordNetLemmatizer().lemmatize(word) for word in tokens]

    return ' '.join(tokens).strip()

In [None]:
# Store the cleaned version of every tweet in a new 'tokenized' column,
# for the training frame first and then the test frame.
for frame in (train_df, test_df):
    frame['tokenized'] = frame['text'].apply(preprocess_text)

In [None]:
# Pull the cleaned texts and the labels out of the frames.
train_labels = train_df['target']
train_text = train_df['tokenized']
test_text = test_df['tokenized']

# The vocabulary is fitted on the training texts only.
tok = Tokenizer()
tok.fit_on_texts(train_text)

# Convert both text collections to integer sequences with the fitted tokenizer.
train_text, test_text = [
    tok.texts_to_sequences(texts) for texts in (train_text, test_text)
]

In [None]:
# Length of every encoded training text; the longest one sets the pad length.
seq_list = list(map(len, train_text))
MAX_SEQ_LEN = np.max(seq_list)


def _pad_post(sequences):
    """Pad or truncate sequences at the end so each has MAX_SEQ_LEN entries."""
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=MAX_SEQ_LEN,
        padding='post',
        truncating='post',
    )


train_text = _pad_post(train_text)
test_text = _pad_post(test_text)

# Model

In [None]:
# Input dimension of the Embedding layer: one entry per word known to the
# tokenizer plus one extra (index 0 is treated as padding, see mask_zero=True
# in the model definition).
VOC_SIZE = len(tok.index_word) + 1
# Maximum number of training epochs passed to model.fit.
EPOCHS = 100
# Number of samples per gradient update passed to model.fit.
BATCH_SIZE = 512
# Used both as the embedding dimension and as the size of each LSTM layer.
UNITS = 64

In [None]:
keras_layers = tf.keras.layers


def _bilstm(return_sequences):
    """Build a bidirectional LSTM of UNITS cells with 0.5 input and recurrent dropout."""
    return keras_layers.Bidirectional(
        keras_layers.LSTM(
            UNITS,
            return_sequences=return_sequences,
            dropout=0.5,
            recurrent_dropout=0.5,
        )
    )


# Embedding -> two stacked bidirectional LSTMs -> dense head with a sigmoid output.
model = tf.keras.Sequential([
    keras_layers.Embedding(VOC_SIZE, UNITS, input_length=MAX_SEQ_LEN, mask_zero=True),
    _bilstm(return_sequences=True),
    _bilstm(return_sequences=False),
    keras_layers.Dense(512, activation='relu'),
    keras_layers.Dropout(0.5),
    keras_layers.Dense(256, activation='relu'),
    keras_layers.Dropout(0.5),
    keras_layers.Dense(128, activation='relu'),
    keras_layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Callbacks handed to model.fit. None of them names a metric, so each uses the
# Keras default monitor (presumably val_loss - confirm for the installed version).
# Stop training after 13 epochs without improvement.
early_stopping = EarlyStopping(patience=13, verbose=1)
# Write the model to 'model.h5', keeping only the best one seen so far.
checkpoint = ModelCheckpoint('model.h5', save_best_only=True, verbose=1)
# Lower the learning rate after 5 epochs without improvement.
lr_reduce = ReduceLROnPlateau(patience=5, verbose=1)

# Training

In [None]:
# Binary classification setup: Adam with the AMSGrad variant, cross-entropy
# loss, accuracy reported under the name 'acc'.
optimizer = tf.keras.optimizers.Adam(amsgrad=True)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])

# Train with 20% of the training data held out for validation; the callbacks
# handle early stopping, checkpointing and learning-rate reduction.
history = model.fit(
    x=train_text,
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[early_stopping, checkpoint, lr_reduce],
)

In [None]:
# Sequential.predict_classes is gone from newer TensorFlow releases, so the
# sigmoid probabilities are thresholded at 0.5 explicitly to get 0/1 labels.
# The model ends in Dense(1), so predict() returns one column per sample;
# ravel() flattens it into the 1-D vector written to the submission.
pred = (model.predict(test_text) > 0.5).astype('int32').ravel()

In [None]:
# Put the predicted labels into the submission template and save it without
# the DataFrame index column.
sample['target'] = pred
sample.to_csv('submission.csv', index=False)

Feel free to comment on my notebook, and don't forget to hit the like button!