# Welcome to my Kernel ! 

# Introduction

This particular challenge is perfect for data scientists looking to get started with Natural Language Processing.

Competition Description
Twitter has become an important communication channel in times of emergency.
The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies (e.g. disaster relief organizations and news agencies) are interested in programmatically monitoring Twitter.

But, it’s not always clear whether a person’s words are actually announcing a disaster. 

Take an example:
The author explicitly uses the word “ABLAZE” but means it metaphorically. This is clear to a human right away, especially with the visual aid. But it’s less clear to a machine.

In this competition, you’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which ones aren’t. You’ll have access to a dataset of 10,000 tweets that were hand classified.

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import xticks
from nltk.corpus import stopwords
import nltk
import re
from nltk.stem import WordNetLemmatizer
import string
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
import tensorflow as tf
from sklearn.metrics import f1_score
from wordcloud import WordCloud,STOPWORDS
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from keras.preprocessing.sequence import pad_sequences
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Flatten,Embedding,Activation,Dropout
from keras.layers import Conv1D,MaxPooling1D,GlobalMaxPooling1D,LSTM
from keras.layers import Bidirectional

In [None]:
# load train and test datasets
train= pd.read_csv('../input/nlp-getting-started/train.csv')
test=pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# check the no. of rows and columns in the dataset
train.shape, test.shape

In [None]:
train.head()

In [None]:
train.isnull().sum().sort_values(ascending = False)

We can see a lot of null values in the "keyword" and "location" columns.

In [None]:
sns.countplot(x=train.target)

**We have a balanced dataset, which is good**

### Data Cleaning

In order to get accurate results from the predictive model, we need to remove stop words and punctuation from the tweets.

Apart from removing stop words and punctuation, we will also convert all the tweets to lowercase so that words like "Go" and "go" are treated as the same word rather than as different words.

We will also convert the words to its lemma form (for example, lemma of word "running" would be run), converting words to their lemmas would also help improving the predictive power of our model.

We would also remove embedded special characters from the tweets, for example, #earthquake should be replaced by earthquake

We also need to remove the "URLs" from the tweets

And then finally we remove the digits from the tweets

Let's write a small function, "preprocess", to achieve all these tasks.

In [None]:
#lets save stopwords in a variable
stop = list(stopwords.words("english"))

In [None]:
# save list of punctuation/special characters in a variable
punctuation = list(string.punctuation)

In [None]:
# create an object to convert the words to its lemma form
lemma = WordNetLemmatizer()

In [None]:
# lets make a combine list of stopwords and punctuations
sw_pun = stop + punctuation

In [None]:
# function to preprocess the messages
def preprocess(tweet):
    """Clean a raw tweet and return its kept words as a single string.

    Steps, in order: strip ``t.co`` short links, replace every non-word
    character with a space (so ``#earthquake`` becomes ``earthquake``),
    delete digits, lowercase, drop tokens found in ``sw_pun``, lemmatize the
    remaining tokens as verbs, and keep only lemmas longer than three
    characters.

    Args:
        tweet: The raw tweet text.

    Returns:
        The surviving lemmas, each followed by one space (so a non-empty
        result ends with a trailing space), or ``""`` if nothing survives.
    """
    # Raw strings keep \w and \d from being parsed as (invalid) string escapes,
    # and the dot in "t.co" is escaped so it only matches a literal dot.
    tweet = re.sub(r"https?://t\.co/[A-Za-z0-9]+", "", tweet)  # removing urls
    tweet = re.sub(r"[^\w]", " ", tweet)  # special characters, e.g. the '#' in #earthquake
    tweet = re.sub(r"\d", "", tweet)  # numeric characters

    kept = []
    for word in tweet.lower().split():
        if word in sw_pun:  # stopwords & punctuation
            continue
        word = lemma.lemmatize(word, pos='v')  # converting to lemma
        if len(word) > 3:  # only words longer than 3 characters are kept
            kept.append(word)
    # Same output format as before: every kept word is followed by a space.
    return "".join(word + " " for word in kept)

In [None]:
# apply preprocessing functions on the train and test datasets
train['text'] = train['text'].apply(lambda s : preprocess(s))
test ['text'] = test ['text'].apply(lambda s : preprocess(s))

In [None]:
# function to remove emojis
def remove_emoji(text):
    """Return *text* with every run of emoji-range characters deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without any character that falls in the code point
        ranges listed below.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        # NOTE(review): this last range runs from U+24C2 up to U+1F251, so it
        # also removes many non-emoji characters (e.g. CJK ideographs).
        u"\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

In [None]:
# applying the function on the train and the test datasets
train['text'] = train['text'].apply(lambda s : remove_emoji(s))
test ['text'] = test ['text'].apply(lambda s : remove_emoji(s))


# Vocabulary creation


Lets create our own vocabulary

In [None]:
# function to create vocab
from collections import Counter
def create_vocab(df):
    """Count how often each whitespace-separated token occurs in ``df.text``.

    Args:
        df: A DataFrame with a ``text`` column of strings.

    Returns:
        A ``Counter`` mapping each token to its total number of occurrences
        across all rows.
    """
    vocab = Counter()
    # Iterate over the column values instead of ``df.text[i]`` for i in
    # range(n): that lookup is label-based, so it only works when the index is
    # exactly 0..n-1 (it raises KeyError otherwise).
    for text in df["text"]:
        vocab.update(text.split())
    return vocab

In [None]:
# concatenate training and testing datasets
master=pd.concat((train,test)).reset_index(drop=True)

# call vocabulary creation function on master dataset
vocab = create_vocab(master)

# lets check the no. of words in the vocabulary
len(vocab)

In [None]:
# lets check the most common 50 words in the vocabulary
vocab.most_common(50)


lets consider only those words which have appeared more than once in the corpus


In [None]:
# create the final vocab by considering words with more than one occurence
# Minimum number of occurrences a word needs in order to stay in the vocabulary.
min_occur = 2
# Words are kept in the order in which `vocab` yields them.
final_vocab = [word for word, count in vocab.items() if count >= min_occur]

In [None]:
# lets check the no. of the words in the final vocabulary
vocab_size = len(final_vocab)
vocab_size

vocab size reduced drastically from 16k to 6k

Now lets apply this vocab on our train and test datasets, we will keep only those words in training and testing datasets which appear in the vocabulary

In [None]:
# function to filter the dataset, keep only words which are present in the vocab
def filter(tweet):
    """Return *tweet* reduced to the words that appear in ``final_vocab``.

    Args:
        tweet: A whitespace-separated string of words.

    Returns:
        The retained words in their original order, each followed by one
        space; ``""`` when no word is retained.
    """
    # NOTE(review): this name shadows the built-in ``filter``.
    retained = (token + ' ' for token in tweet.split() if token in final_vocab)
    return "".join(retained)

In [None]:
# apply filter function on the train and test datasets
train['text'] = train['text'].apply(lambda s : filter(s))
test ['text'] = test ['text'].apply(lambda s : filter(s))

In [None]:
# lets take a look at the update training dataset
train.text.head()

# Data Preprocessing

In [None]:
# the different units into which you can break down text (words, characters, or n-grams) are called tokens, 
# and breaking text into such tokens is called tokenization, this can be achieved using Tokenizer in Keras

from keras.preprocessing.text import Tokenizer
# fit a tokenizer
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` fitted on *lines*.

    Args:
        lines: The texts used to build the word index.

    Returns:
        The fitted tokenizer, configured with ``num_words=vocab_size``.
    """
    # num_words caps how many words the tokenizer takes into account.
    fitted = Tokenizer(num_words=vocab_size)
    # Builds the word index from the given texts.
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
# create and apply tokenizer on the training dataset
tokenizer = create_tokenizer(train.text)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Now we will apply texts_to_matrix() function to convert text into vectors.

The texts_to_matrix() function on the Tokenizer can be used to create one vector per document provided per input. The length of the vectors is the total size of the vocabulary, which is 6025 here (we passed 6025 as num_words into tokenizer)

This function provides a suite of standard bag-of-words model text encoding schemes that can be provided via a mode argument to the function.

The modes available include:

* ‘binary‘: Whether or not each word is present in the document. This is the default.
* ‘count‘: The count of each word in the document.
* ‘tfidf‘: The Term Frequency-Inverse Document Frequency (TF-IDF) score for each word in the document.
* ‘freq‘: The frequency of each word as a ratio of words within each document.

In [None]:
# converting texts into vectors
train_text = tokenizer.texts_to_matrix(train.text, mode = 'freq')

# Model Building & Evaluation

### 1. Neural Network

We will create an Artificial Neural Network, this competition is evaluated on f1 scores,which is not shown by default after every epoch, so lets create a function to  achieve the same.

In [None]:
# Test train split 
X_train, X_test, y_train, y_test = train_test_split(train_text, train.target, test_size = 0.2, random_state = 42)

In [None]:
# function to calculate f1 score for each epoch
import keras.backend as K
def get_f1(y_true, y_pred):
    """Keras metric: F1 score of *y_pred* against *y_true*.

    Adapted from old Keras source code. Values are clipped to [0, 1] and
    rounded before counting, and ``K.epsilon()`` is added to each denominator
    to avoid division by zero.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted scores.

    Returns:
        A scalar tensor holding the F1 score of the tensors passed in.
    """
    def _count_positives(tensor):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count_positives(y_true * y_pred)
    actual = _count_positives(y_true)
    predicted = _count_positives(y_pred)

    precision = hits / (predicted + K.epsilon())
    recall = hits / (actual + K.epsilon())
    return 2*(precision*recall)/(precision+recall+K.epsilon())


In [None]:
# define the model
def define_model(n_words):
    """Build, compile and summarise the dense bag-of-words classifier.

    The network stacks three ReLU layers (1024, 512 and 256 units), each
    followed by 30% dropout, and ends in a single sigmoid unit.

    Args:
        n_words: Length of each input vector.

    Returns:
        The compiled ``Sequential`` model.
    """
    dropout_rate = 0.3
    network = Sequential([
        Dense(1024, input_shape=(n_words,), activation='relu'),
        Dropout(dropout_rate),
        Dense(512, activation='relu'),
        Dropout(dropout_rate),
        Dense(256, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ])

    # Binary cross-entropy loss; the F1 metric is reported during training.
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=[get_f1])

    # Print the layer summary before handing the model back.
    network.summary()
    return network

In [None]:
X_train.shape

In [None]:
# Early stopping watches the validation F1 and maximises it. With mode left at
# 'auto', Keras does not treat 'get_f1' as a score to maximise, and monitoring
# the training metric says nothing about generalisation.
# The checkpoint keeps the weights of the epoch with the lowest validation loss.
callbacks_list = [
    EarlyStopping(monitor='val_get_f1', mode='max', patience=10),
    ModelCheckpoint(filepath='./NN.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# create the model
n_words = X_train.shape[1]
model = define_model(n_words)

In [None]:
#fit network
history = model.fit(X_train,y_train,epochs=100,verbose=2,callbacks=callbacks_list,validation_split=0.2)

In [None]:
# The history keys come from the get_f1 metric, so these curves are F1 scores
# (not accuracies, despite the variable names); the plot labels say so.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

There is a huge difference between training and validation accuracies and losses

In [None]:
import keras

dependencies = {
    'get_f1': get_f1
}


# load the model from disk
loaded_model_NN = keras.models.load_model('./NN.h5',custom_objects=dependencies)

In [None]:
# prediction on the held-out split
# Sequential.predict_classes() is deprecated and missing from newer Keras
# releases; thresholding the sigmoid output at 0.5 gives the same 0/1 labels.
y_pred = (loaded_model_NN.predict(X_test) > 0.5).astype("int32").ravel()

In [None]:
# important metrices
print(classification_report(y_test, y_pred))

# Predictions on the test dataset

In [None]:
# Keep the ids for the submission file before dropping the unused columns.
test_id = test.id
# `axis` is passed by keyword: the positional form is deprecated and was
# removed in pandas 2.0.
test.drop(["id", "location", "keyword"], axis=1, inplace=True)

# apply tokenizer on the test dataset
test_set = tokenizer.texts_to_matrix(test.text, mode='freq')

In [None]:
# make predictions on the test dataset
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_test_pred = (loaded_model_NN.predict(test_set) > 0.5).astype("int32").ravel()

In [None]:
# lets prepare for the prediction submission
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
sub.to_csv('submission_NN.csv',index=False)

# Model using Word Embeddings

Another popular and powerful way to associate a vector with a word is the use of dense word vectors, also called `word embeddings`. 

The Embedding layer is best understood as a dictionary that maps integer indices (which stand for specific words) to dense vectors. It takes integers as input, it looks up these integers in an internal dictionary, and it returns the associated vectors. It’s effectively a dictionary lookup.

Whereas the vectors obtained through one-hot encoding are binary, sparse (mostly made of zeros), and very high-dimensional (same dimensionality as the number of words in the vocabulary), word embeddings are low dimensional floating-point vectors (that is, dense vectors, as opposed to sparse vectors); 

Unlike the word vectors obtained via one-hot encoding, word embeddings are learned from data. It’s common to see word embeddings that are 256-dimensional, 512-dimensional, or 1,024-dimensional when dealing with very large vocabularies. 

On the other hand, one-hot encoding words generally leads to vectors that are 20,000-dimensional or greater (in our case 6,025-dimensional, one dimension per token in the vocabulary above). So, word embeddings pack more information into far fewer dimensions.

### There are two ways to obtain word embeddings:

* Learn word embeddings jointly with the main task you care about (such as document classification or sentiment prediction). In this setup, you start with random word vectors and then learn word vectors in the same way you learn the
weights of a neural network.

* Load into your model word embeddings that were precomputed using a different machine-learning task than the one you’re trying to solve. These are called
pretrained word embeddings.

In [None]:
from keras.layers import Embedding
# The Embedding layer takes at least two arguments: the number of possible tokens (here, 5,000: 1 + maximum word index)
#and the dimensionality of the embeddings (here, 64).
#embedding_layer = Embedding(5000, 64)

In [None]:
# Number of words to consider as features
max_features = vocab_size

# Cuts off the text after this number of words (among the max_features most common words)
maxlen = 100

In [None]:
# create and apply tokenizer on the training dataset
tokenizer = create_tokenizer(train.text)

In [None]:
from keras import preprocessing
# conver text to sequences
sequences = tokenizer.texts_to_sequences(train.text)
#print(sequences)

In [None]:
# Turns the lists of integers into a 2D integer tensor of shape (samples, maxlen), padding shorter sequences with 0s
train_text = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

In [None]:
# Test train split 
X_train, X_test, y_train, y_test = train_test_split(train_text, train.target, test_size = 0.2, random_state = 42)

### 2. Neural Network with Embedding Layer

In [None]:
# build the model
model = Sequential()
# Specifies the maximum input length to the Embedding layer so you can later flatten the embedded inputs. 

# After the Embedding layer, the activations have shape (samples, maxlen, 8)
model.add(Embedding(vocab_size, 8, input_length=maxlen))

# Flattens the 3D tensor of embeddings into a 2D tensor of shape (samples, maxlen * 8)
model.add(Flatten())

# Dense layer for classification
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[get_f1])
model.summary()

In [None]:
# Early stopping watches the validation F1 and maximises it. With mode left at
# 'auto', Keras does not treat 'get_f1' as a score to maximise, so training
# was cut short almost immediately.
# The checkpoint keeps the weights of the epoch with the lowest validation loss.
callbacks_list = [
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    ModelCheckpoint(filepath='./embd.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# train the model
history = model.fit(X_train, y_train,
epochs=100,
batch_size=32,
callbacks=callbacks_list,
validation_split=0.2)

In [None]:
# check model performance
# The history keys come from the get_f1 metric, so these curves are F1 scores
# (not accuracies, despite the variable names); the plot labels say so.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
dependencies = {
    'get_f1': get_f1
}


# load the model from disk
loaded_model_embd = keras.models.load_model('./embd.h5',custom_objects=dependencies)

In [None]:
# prediction on the held-out split
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_pred = (loaded_model_embd.predict(X_test) > 0.5).astype("int32").ravel()

# important metrics
print(classification_report(y_test, y_pred))

We got to an F1 score of 81%, which is pretty good considering how simple the model is and that every tweet is cut or padded to 100 tokens (`maxlen`). But note that merely flattening the embedded sequences and training a single Dense layer on top leads to a model that treats each word in the input sequence separately, without considering inter-word relationships and sentence structure (for example, a model like this trained on movie reviews would likely treat both “this movie is a bomb” and “this movie is the bomb” as being negative reviews).

It’s much better to add recurrent layers or 1D convolutional layers on top of the embedded sequences to learn features that take into account each sequence as a whole. We will do this later.

In [None]:
# conver text to sequences
sequences = tokenizer.texts_to_sequences(test.text)
# Turns the lists of integers into a 2D integer tensor of shape (samples, maxlen)
test_text = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

In [None]:
# make predictions on the test dataset
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_test_pred = (loaded_model_embd.predict(test_text) > 0.5).astype("int32").ravel()

# lets prepare for the prediction submission
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
sub.to_csv('submission_embedding.csv',index=False)

### 3. Neural Network with pre trained Embedding Layer(GLOVE)

In [None]:
# Number of rows in the Embedding layers and in the GloVe embedding matrix.
# It must be larger than every index the padded sequences can contain. The
# tokenizer was created with num_words=vocab_size, so the previous hard-coded
# 5000 left token indices outside the embedding table whenever vocab_size is
# larger than that.
max_words = vocab_size

We’ll build an embedding matrix that you can load into an Embedding layer. 

It must be a matrix of shape (max_words, embedding_dim), where each entry i contains the embedding_dim-dimensional vector for the word of index i in the reference word index (built during tokenization). 

Note that index 0 isn’t supposed to stand for any word or token; it’s a placeholder.

In [None]:
import os

glove_dir = "../input/glove6b100dtxt/"
# Maps each GloVe token to its vector (a float32 numpy array).
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line holds a token followed by its whitespace-separated coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
embedding_dim = 100

# Row i receives the GloVe vector of the word whose tokenizer index is i.
# Rows that are never assigned (words missing from GloVe) stay all zeros.
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    if i >= max_words:
        # index falls outside the matrix
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # no pretrained vector for this word
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# lets use the same model architecture we used earlier
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

### LOADING THE GLOVE EMBEDDINGS IN THE MODEL

The Embedding layer has a single weight matrix: a 2D float matrix where each entry i is the word vector meant to be associated with index i. Simple enough. 

Load the GloVe matrix we prepared into the Embedding layer, the first layer in the model

In [None]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [None]:
# Compile the model
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=[get_f1])

In [None]:
# Early stopping watches the validation F1 and maximises it. With mode left at
# 'auto', Keras does not treat 'get_f1' as a score to maximise, so training
# was cut short almost immediately.
# The checkpoint keeps the weights of the epoch with the lowest validation loss.
callbacks_list = [
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    ModelCheckpoint(filepath='./pre_embd.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# train the model
history = model.fit(X_train, y_train,
epochs=100,
batch_size=32,
callbacks=callbacks_list,
validation_split=0.2)

In [None]:
# check model performance
# The history keys come from the get_f1 metric, so these curves are F1 scores
# (not accuracies, despite the variable names); the plot labels say so.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
dependencies = {
    'get_f1': get_f1
}


# load the model from disk
loaded_model_pre_embd = keras.models.load_model('./pre_embd.h5',custom_objects=dependencies)

In [None]:
# prediction on the held-out split
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_pred = (loaded_model_pre_embd.predict(X_test) > 0.5).astype("int32").ravel()

# important metrics
print(classification_report(y_test, y_pred))

In [None]:
# make predictions on the test dataset
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_test_pred = (loaded_model_pre_embd.predict(test_text) > 0.5).astype("int32").ravel()

# lets prepare for the prediction submission
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
sub.to_csv('submission_pre_embedding.csv',index=False)

Embedding layer that learnt embeddings with the model training proved to be better than pre trained embedding.

### 4.SIMPLE RNN

In [None]:
from keras.layers import Embedding, SimpleRNN
model = Sequential()
model.add(Embedding(max_words, 32))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Early stopping watches the validation F1 and maximises it. With mode left at
# 'auto', Keras does not treat 'get_f1' as a score to maximise, so training
# was cut short almost immediately.
# The checkpoint keeps the weights of the epoch with the lowest validation loss.
callbacks_list = [
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    ModelCheckpoint(filepath='./SRNN.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[get_f1])

history = model.fit(X_train, y_train,
epochs=100,
batch_size=128,
callbacks=callbacks_list,
validation_split=0.2)

In [None]:
# check model performance
# The history keys come from the get_f1 metric, so these curves are F1 scores
# (not accuracies, despite the variable names); the plot labels say so.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
dependencies = {
    'get_f1': get_f1
}


# load the model from disk
loaded_model_SRNN = keras.models.load_model('./SRNN.h5',custom_objects=dependencies)

In [None]:
# prediction on the held-out split
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_pred = (loaded_model_SRNN.predict(X_test) > 0.5).astype("int32").ravel()

# important metrics
print(classification_report(y_test, y_pred))

In [None]:
# make predictions on the test dataset
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_test_pred = (loaded_model_SRNN.predict(test_text) > 0.5).astype("int32").ravel()

# lets prepare for the prediction submission
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
sub.to_csv('submission_SRNN.csv',index=False)

Model's performance not yet improved, lets stack some layers.

### 5. Stack multiple SimpleRNN layers

In [None]:
from keras.layers import Embedding, SimpleRNN
model = Sequential()
model.add(Embedding(max_words, 32))
model.add(SimpleRNN(32,return_sequences=True))
model.add(SimpleRNN(32,return_sequences=True))
model.add(SimpleRNN(32,return_sequences=True))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Early stopping watches the validation F1 and maximises it. With mode left at
# 'auto', Keras does not treat 'get_f1' as a score to maximise, so training
# was cut short almost immediately.
# The checkpoint keeps the weights of the epoch with the lowest validation loss.
callbacks_list = [
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    ModelCheckpoint(filepath='./STRNN.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[get_f1])

history = model.fit(X_train, y_train,
epochs=100,
batch_size=128,
callbacks=callbacks_list,
validation_split=0.2)

In [None]:
# check model performance
# The history keys come from the get_f1 metric, so these curves are F1 scores
# (not accuracies, despite the variable names); the plot labels say so.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
dependencies = {
    'get_f1': get_f1
}


# load the model from disk
loaded_model_STRNN = keras.models.load_model('./STRNN.h5',custom_objects=dependencies)

In [None]:
# prediction on the held-out split
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_pred = (loaded_model_STRNN.predict(X_test) > 0.5).astype("int32").ravel()

# important metrics
print(classification_report(y_test, y_pred))

In [None]:
# make predictions on the test dataset
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_test_pred = (loaded_model_STRNN.predict(test_text) > 0.5).astype("int32").ravel()

# lets prepare for the prediction submission
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
sub.to_csv('submission_stackRNN.csv',index=False)

Didn't help, lets try LSTM

### 6. LSTM

In [None]:
from keras.layers import LSTM

model = Sequential()
model.add(Embedding(max_features, 32))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))


In [None]:
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=[get_f1])

In [None]:
# Early stopping watches the validation F1 and maximises it. With mode left at
# 'auto', Keras does not treat 'get_f1' as a score to maximise, so training
# was cut short almost immediately.
# The checkpoint keeps the weights of the epoch with the lowest validation loss.
callbacks_list = [
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    ModelCheckpoint(filepath='./LSTM.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
history = model.fit(X_train, y_train,
epochs=100,
batch_size=128,
callbacks=callbacks_list,
validation_split=0.2)

In [None]:
# check model performance
# The history keys come from the get_f1 metric, so these curves are F1 scores
# (not accuracies, despite the variable names); the plot labels say so.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
dependencies = {
    'get_f1': get_f1
}


# load the model from disk
loaded_model_LSTM = keras.models.load_model('./LSTM.h5',custom_objects=dependencies)

In [None]:
# prediction on the held-out split
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_pred = (loaded_model_LSTM.predict(X_test) > 0.5).astype("int32").ravel()

# important metrics
print(classification_report(y_test, y_pred))

In [None]:
# make predictions on the test dataset
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_test_pred = (loaded_model_LSTM.predict(test_text) > 0.5).astype("int32").ravel()

# lets prepare for the prediction submission
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
sub.to_csv('LSTM.csv',index=False)

LSTM doing a decent job here, lets try Bi directional LSTM

### 7. Bi-Direction LSTM

In [None]:
model = Sequential()
model.add(Embedding(max_features, 32))
model.add(Bidirectional(LSTM(32,return_sequences=True)))
model.add(Bidirectional(LSTM(32,return_sequences=True)))
model.add(Bidirectional(LSTM(32,return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=[get_f1])

In [None]:
# Early stopping watches the validation F1 and maximises it. With mode left at
# 'auto', Keras does not treat 'get_f1' as a score to maximise, so training
# was cut short almost immediately.
# The checkpoint keeps the weights of the epoch with the lowest validation loss.
callbacks_list = [
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    ModelCheckpoint(filepath='./BILSTM.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
history = model.fit(X_train, y_train,
epochs=100,
batch_size=128,
callbacks=callbacks_list,
validation_split=0.2)

In [None]:
# check model performance
# The history keys come from the get_f1 metric, so these curves are F1 scores
# (not accuracies, despite the variable names); the plot labels say so.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
dependencies = {
    'get_f1': get_f1
}


# load the model from disk
loaded_model_BILSTM = keras.models.load_model('./BILSTM.h5',custom_objects=dependencies)

In [None]:
# prediction on the held-out split
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_pred = (loaded_model_BILSTM.predict(X_test) > 0.5).astype("int32").ravel()

# important metrics
print(classification_report(y_test, y_pred))

In [None]:
# make predictions on the test dataset
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_test_pred = (loaded_model_BILSTM.predict(test_text) > 0.5).astype("int32").ravel()

# lets prepare for the prediction submission
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
sub.to_csv('BiLSTM.csv',index=False)

The Neural Network with an Embedding layer seems to be the best model for this classification task.

# Please upvote if you like this kernel.

# GRU

In [None]:
from keras.layers import GRU

model = Sequential()
model.add(Embedding(max_features, 32))
model.add(GRU(32))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=[get_f1])

In [None]:
# Early stopping watches the validation F1 and maximises it. With mode left at
# 'auto', Keras does not treat 'get_f1' as a score to maximise, so training
# was cut short almost immediately.
# The checkpoint keeps the weights of the epoch with the lowest validation loss.
callbacks_list = [
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    ModelCheckpoint(filepath='./GRU.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
history = model.fit(X_train, y_train,
epochs=100,
batch_size=128,
callbacks=callbacks_list,
validation_split=0.2)

In [None]:
# check model performance
# The history keys come from the get_f1 metric, so these curves are F1 scores
# (not accuracies, despite the variable names); the plot labels say so.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
dependencies = {
    'get_f1': get_f1
}

# load the model from disk
loaded_model_GRU = keras.models.load_model('./GRU.h5',custom_objects=dependencies)

In [None]:
# prediction on the held-out split
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_pred = (loaded_model_GRU.predict(X_test) > 0.5).astype("int32").ravel()

# important metrics
print(classification_report(y_test, y_pred))

In [None]:
# make predictions on the test dataset
# (predict_classes() is deprecated/removed in newer Keras; threshold the
# sigmoid output at 0.5 instead)
y_test_pred = (loaded_model_GRU.predict(test_text) > 0.5).astype("int32").ravel()

# lets prepare for the prediction submission
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
sub.to_csv('GRU.csv',index=False)

# Stacked GRU

In [None]:
from keras.layers import GRU

# Stacked GRU classifier: three sequence-returning GRU layers feed a final
# GRU that emits only its last state, followed by a sigmoid unit.
model = Sequential(
    [Embedding(max_features, 32)]
    + [GRU(32, return_sequences=True) for _ in range(3)]
    + [GRU(32), Dense(1, activation='sigmoid')]
)

In [None]:
# Same training setup as the single-layer model: RMSprop on binary
# cross-entropy, with get_f1 reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=[get_f1],
)

In [None]:
# Training callbacks for the stacked GRU model: stop early when the
# validation F1 stalls, and keep only the weights with the lowest
# validation loss.
callbacks_list = [
    EarlyStopping(
        # Watch the validation F1 (available because fit() is called with a
        # validation split) instead of the training F1.
        monitor='val_get_f1',
        # The custom metric name gives Keras no hint about direction, so the
        # default mode='auto' would treat a *decreasing* F1 as improvement
        # and stop training as soon as the score gets better.
        mode='max',
        patience=1,
    ),
    ModelCheckpoint(filepath='./SGRU.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# Fit the stacked GRU for at most 100 epochs with a 20% validation hold-out;
# the callbacks handle early stopping and checkpointing.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=100,
    validation_split=0.2,
    callbacks=callbacks_list,
)

In [None]:
# check model performance
# NOTE: the tracked metric is get_f1, so these curves are F1 scores, not
# accuracy. The variable names are left unchanged in case other cells read them.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

# One x-axis tick per completed epoch, starting at 1.
epochs = range(1, len(acc) + 1)

# First figure: F1 per epoch (dots = training, line = validation).
plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

# Second figure: loss per epoch.
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
# get_f1 is a user-defined metric, so it is passed through custom_objects
# for the saved model to resolve it by name.
dependencies = dict(get_f1=get_f1)

# load the checkpointed stacked GRU model from disk
loaded_model_SGRU = keras.models.load_model(
    './SGRU.h5',
    custom_objects=dependencies,
)

In [None]:
# prediction on the test dataset
# Sequential.predict_classes() was removed in TensorFlow/Keras 2.6. For a
# single sigmoid output, thresholding predict() at 0.5 gives the same
# int32 class labels and works on both old and new versions.
y_pred = (loaded_model_SGRU.predict(X_test) > 0.5).astype('int32')

# important metrics
print(classification_report(y_test, y_pred))

In [None]:
# make predictions on the test dataset
# predict_classes() no longer exists in TensorFlow/Keras 2.6+; thresholding
# the sigmoid probabilities at 0.5 yields the same int32 labels.
y_test_pred = (loaded_model_SGRU.predict(test_text) > 0.5).astype('int32')

# lets prepare for the prediction submission
# Columns are assigned one at a time because the predictions come back as a
# single-column 2-D array.
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
# Save the stacked-GRU submission; the DataFrame index is not written to the file.
sub.to_csv(path_or_buf='SGRU.csv', index=False)

# Stacked GRU with Dropouts

In [None]:
from keras.layers import GRU

# Deeper stacked GRU classifier. The sequence-returning layers alternate
# between a GRU with 20% input/recurrent dropout and a plain GRU; the final
# GRU emits only its last state for the sigmoid output unit.
model = Sequential([
    Embedding(max_features, 32),
    GRU(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    GRU(32, return_sequences=True),
    GRU(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    GRU(32, return_sequences=True),
    GRU(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    GRU(32),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Same training setup as the other GRU variants: RMSprop on binary
# cross-entropy, with get_f1 reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=[get_f1],
)

In [None]:
# Training callbacks for the stacked GRU with dropout: stop early when the
# validation F1 stalls, and keep only the weights with the lowest
# validation loss.
callbacks_list = [
    EarlyStopping(
        # Watch the validation F1 (available because fit() is called with a
        # validation split) instead of the training F1.
        monitor='val_get_f1',
        # The custom metric name gives Keras no hint about direction, so the
        # default mode='auto' would treat a *decreasing* F1 as improvement
        # and stop training as soon as the score gets better.
        mode='max',
        patience=1,
    ),
    ModelCheckpoint(filepath='./DSGRU.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# Fit the dropout-regularised stack for at most 100 epochs with a 20%
# validation hold-out; the callbacks handle early stopping and checkpointing.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=100,
    validation_split=0.2,
    callbacks=callbacks_list,
)

In [None]:
# check model performance
# NOTE: the tracked metric is get_f1, so these curves are F1 scores, not
# accuracy. The variable names are left unchanged in case other cells read them.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

# One x-axis tick per completed epoch, starting at 1.
epochs = range(1, len(acc) + 1)

# First figure: F1 per epoch (dots = training, line = validation).
plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.legend()

# Second figure: loss per epoch.
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
# get_f1 is a user-defined metric, so it is passed through custom_objects
# for the saved model to resolve it by name.
dependencies = dict(get_f1=get_f1)

# load the checkpointed dropout stacked GRU model from disk
loaded_model_DSGRU = keras.models.load_model(
    './DSGRU.h5',
    custom_objects=dependencies,
)

In [None]:
# prediction on the test dataset
# Sequential.predict_classes() was removed in TensorFlow/Keras 2.6. For a
# single sigmoid output, thresholding predict() at 0.5 gives the same
# int32 class labels and works on both old and new versions.
y_pred = (loaded_model_DSGRU.predict(X_test) > 0.5).astype('int32')

# important metrics
print(classification_report(y_test, y_pred))

In [None]:
# make predictions on the test dataset
# predict_classes() no longer exists in TensorFlow/Keras 2.6+; thresholding
# the sigmoid probabilities at 0.5 yields the same int32 labels.
y_test_pred = (loaded_model_DSGRU.predict(test_text) > 0.5).astype('int32')

# lets prepare for the prediction submission
# Columns are assigned one at a time because the predictions come back as a
# single-column 2-D array.
sub = pd.DataFrame()
sub['Id'] = test_id
sub['target'] = y_test_pred
sub.head()

In [None]:
# Save the dropout stacked-GRU submission; the DataFrame index is not written to the file.
sub.to_csv(path_or_buf='GRUDropOut.csv', index=False)