## Import modules

In [None]:
import string
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf


from keras.preprocessing.text import Tokenizer
from keras.metrics import Recall, Precision
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split

## Configuration

In [None]:
class Config:
    """Central place for the hyper-parameters, Keras objects and data paths.

    Every value is a class attribute, so the optimizer, metric and callback
    instances below are created once, when the class body is executed, and are
    shared by every ``Config()`` instance.
    """

    # NOTE(review): `seed` is declared but not applied anywhere in the visible
    # code (no numpy / tensorflow seeding) — confirm whether that is intended.
    seed = 44
    n_epochs = 100
    embedding_dim = 256
    maxlen = 220

    validation_rate = 0.2
    dropout_rate = 0.1

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = Adam(learning_rate=1e-3)
    loss = 'binary_crossentropy'
    metrics = ['accuracy', Precision(), Recall()]

    # Multiply the learning rate by 0.8 after 2 epochs without a val_loss
    # improvement of at least `min_delta`, then wait `cooldown` epochs before
    # monitoring again. The original spelled these `epsilon` (deprecated name
    # of `min_delta`) and `coldown` (typo: silently swallowed by **kwargs, so
    # the cooldown never took effect).
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  mode='auto',
                                  factor=0.8,
                                  patience=2,
                                  min_delta=1e-4,
                                  cooldown=5,
                                  min_lr=1e-5)

    # Full model saved only when val_loss improves.
    checkpoint_best = ModelCheckpoint('best_model.h5',
                                      monitor='val_loss',
                                      mode='min',
                                      verbose=1,
                                      save_best_only=True,
                                      save_weights_only=False)

    # Full model saved after every epoch, regardless of val_loss.
    checkpoint_last = ModelCheckpoint('last_model.h5',
                                      monitor='val_loss',
                                      mode='min',
                                      verbose=1,
                                      save_best_only=False,
                                      save_weights_only=False)

    # Stop training after 15 epochs without a val_loss improvement.
    early_stop = EarlyStopping(monitor='val_loss', mode='min', patience=15)

    callbacks = [reduce_lr, checkpoint_best, checkpoint_last, early_stop]

    paths = {'train': '../input/nlp-getting-started/train.csv',
             'test': '../input/nlp-getting-started/test.csv'}
    

In [None]:
# Handle on the settings; Config only defines class attributes, so the
# instance is just a convenient namespace.
config = Config()

## Check and load data

In [None]:
# Load the train and test CSV files from the locations listed in config.paths.
train_df = pd.read_csv(config.paths['train'])
test_df = pd.read_csv(config.paths['test'])

In [None]:
# Bare expression: shown as the cell output in a notebook.
train_df

In [None]:
# Bare expression: shown as the cell output in a notebook.
test_df

In [None]:
# Separate the raw `text` column (model input) from the `target` column (label).
x_train_raw = train_df.text
y_train = train_df.target

In [None]:
# Bare expression: shown as the cell output in a notebook.
x_train_raw

In [None]:
# Bare expression: shown as the cell output in a notebook.
y_train

In [None]:
# Class-balance check: number of rows per distinct target value.
y_train.value_counts()

## Clean text

In [None]:
# Words that clean_text drops; built once so every call reuses the same set.
_STOP_WORDS = frozenset({
    'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once',
    'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some',
    'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other',
    'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each',
    'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through',
    'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our',
    'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no',
    'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in',
    'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why',
    'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just',
    'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't',
    'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further',
    'was', 'here', 'than',
})


def clean_text(text: str) -> str:
    """Normalise one piece of raw text for tokenisation.

    Steps, in order: strip the known mis-encoded character sequences,
    lower-case, remove ``[...]`` spans, URLs, ``<...>`` tags and punctuation,
    turn newlines into spaces, remove every token that contains a digit, and
    finally drop stop words.

    Args:
        text: The raw text.

    Returns:
        The remaining words joined by single spaces (``""`` if none remain).
    """
    # Remove the mis-encoded sequences first: once the text is lower-cased
    # ('Û' -> 'û') and punctuation ('_') is stripped, these patterns can no
    # longer match.
    text = re.sub(r"\x89Û_", "", text)
    text = re.sub(r"\x89ÛÒ", "", text)
    text = re.sub(r"\x89ÛÓ", "", text)

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+|http:?//\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words; deleting it outright would glue them together.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Drop stop words entirely so they leave no empty tokens (and therefore
    # no runs of spaces) in the result.
    kept_words = [word for word in text.split() if word not in _STOP_WORDS]
    return " ".join(kept_words)


In [None]:
# Clean both corpora with the same routine: the training Series and the
# `text` column of the test frame (overwritten in place).
x_train_raw = x_train_raw.apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)
x_train_raw

In [None]:
# Bare expression: shows the cleaned test text as the cell output in a notebook.
test_df.text

## Tokenize text

In [None]:
# Build the vocabulary from the cleaned training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_raw)

In [None]:
# Encode both corpora with the tokenizer fitted on the training text.
x_train = tokenizer.texts_to_sequences(x_train_raw)
x_test = tokenizer.texts_to_sequences(test_df.text)
# Sanity check: first encoded sample next to the text it was produced from.
print(x_train[0])
print(x_train_raw[0])


In [None]:
# Bring every training sequence to length config.maxlen, using
# pad_sequences' default padding/truncating side.
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=config.maxlen)
x_train

In [None]:
# Same fixed length for the test sequences, so both arrays share the
# second dimension config.maxlen.
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=config.maxlen)
x_test

## Create the model

In [None]:
import keras.backend as K

from keras.models import Sequential
from keras.layers import Layer
from keras.layers import Embedding, Bidirectional, Dense, LSTM, merge
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
class Attention(Layer):
    """Attention pooling over the time axis of a 3-D sequence tensor.

    For an input of shape ``(batch, step_dim, features)`` the layer scores
    each timestep with a learned vector ``W`` (plus an optional per-step bias
    ``b``), applies ``tanh``, normalises the exponentiated scores over the
    time axis and returns the score-weighted sum of the timesteps, i.e. a
    tensor of shape ``(batch, features)``.

    Args:
        step_dim: Number of timesteps of the input (size of its axis 1).
        W_regularizer: Regularizer for ``W`` (anything ``regularizers.get`` accepts).
        b_regularizer: Regularizer for ``b``.
        W_constraint: Constraint for ``W`` (anything ``constraints.get`` accepts).
        b_constraint: Constraint for ``b``.
        bias: Whether to add the per-timestep bias ``b`` to the scores.
        **kwargs: Forwarded to ``Layer.__init__``.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base layer first: some Keras versions reset
        # `supports_masking` inside Layer.__init__, which would undo a value
        # assigned before this call.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Real value is only known in build(), from the input shape.
        self.features_dim = 0

    def build(self, input_shape):
        """Create ``W`` (one weight per feature) and, optionally, ``b`` (one per timestep)."""
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return ``None``: the time axis is reduced away, so no mask is passed on."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over its time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every timestep: flatten to (batch * steps, features), project
        # onto W, then fold the scores back to (batch, steps).
        flat_inputs = K.reshape(x, (-1, features_dim))
        projection = K.reshape(self.W, (features_dim, 1))
        eij = K.reshape(K.dot(flat_inputs, projection), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weight of masked timesteps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon guards against division by zero when every step is masked.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(batch, features_dim)``."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the base config plus every constructor argument.

        Without these entries (``step_dim`` in particular, which is a required
        argument) the layer cannot be re-created from its config.
        """
        config = super(Attention, self).get_config()
        config.update({
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        })
        return config

In [None]:
# Embedding -> bidirectional LSTM -> attention pooling -> dense classifier.
model = Sequential([
    Embedding(len(tokenizer.word_index) + 1, config.embedding_dim, input_length=config.maxlen),
    Bidirectional(LSTM(64, return_sequences=True, dropout=config.dropout_rate)),
    # The LSTM returns one vector per timestep; the attention layer collapses
    # them into a single vector per sample. Without it the network emits
    # (batch, maxlen, 1) predictions, which match neither the per-sample
    # labels nor the one-value-per-row submission column.
    Attention(config.maxlen),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")])
model.summary()

In [None]:
# Attach the loss, optimizer and metrics defined in Config.
model.compile(loss=config.loss, optimizer=config.optimizer, metrics=config.metrics)
# Repeats the summary already printed when the model was built.
model.summary()

In [None]:
# Train with a held-out validation split; the callbacks listed in Config
# (LR reduction, the two checkpoints, early stopping) run during training.
model.fit(
    x_train,
    y_train,
    epochs=config.n_epochs,
    validation_split=config.validation_rate,
    shuffle=True,
    callbacks=config.callbacks,
    verbose=1,
)

In [None]:
# Restore the weights from 'best_model.h5', the file checkpoint_best writes
# whenever val_loss improves.
model.load_weights('./best_model.h5')

In [None]:
# Build the submission: one row per test id, with the predicted probability
# thresholded at 0.5 into a 0/1 label, then write it without the index.
result_dataframe = pd.DataFrame({
    'id': test_df['id'],
    'target': (np.asarray(model.predict(x_test, verbose=1)) >= 0.5).astype(int).ravel(),
})
result_dataframe.to_csv('submission.csv', index=False)

In [None]:
# Bare expression: shows the submission frame as the cell output in a notebook.
result_dataframe