# Sentence Classification with GloVe embeddings on Bi-LSTM Networks


**Data Set:** "Quora insincere questions" 

**Pre-trained Embeddings:** GloVe

In [None]:
# libraries
import os
import re 
import gc
import numpy as np 
import pandas as pd
pd.set_option('display.max_colwidth',None)

# scikit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

# gensim
import gensim
from gensim.models import KeyedVectors

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Process the data sets

def load_datasets(data_dir='../input/quora-insincere-questions-classification'):
    """Read the competition's train and test CSV files.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``train.csv`` and ``test.csv``. The default is
        the input path this notebook was written against, so calling the
        function without arguments behaves as before.

    Returns
    -------
    tuple
        ``(train, test)`` as two pandas DataFrames.
    """
    # Do not truncate long text cells when a frame is displayed.
    pd.set_option('display.max_colwidth', None)
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    return train, test

# Load both splits; the following cells overwrite their `question_text` column.
train, test = load_datasets()
# Show the training frame as the cell output.
display(train)

In [None]:
import re

# Compiled once at definition time rather than on every call.
_DIGITS_RE = re.compile(r'[0-9]+')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_data(data):
    """Remove digits and punctuation from a string and collapse whitespace.

    Steps, in order:
      1. delete every run of ASCII digits;
      2. delete every character that is neither a word character nor
         whitespace (characters are dropped, not replaced by a space, so
         ``"don't"`` becomes ``"dont"``);
      3. replace every run of whitespace with a single space.

    Leading/trailing whitespace is collapsed to one space, not stripped.

    A fourth substitution that used to follow step 3 was dropped: by then
    only word characters and single spaces remain, so its character sets
    could only match a space and replace it with a space. Its ``&&`` inside
    a character set is also a construct the ``re`` documentation lists as
    raising ``FutureWarning``.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    data = _DIGITS_RE.sub('', data)
    data = _NON_WORD_RE.sub('', data)
    return _WHITESPACE_RE.sub(' ', data)
    
# Overwrite the question column of each split with its cleaned version.
train['question_text'] = train['question_text'].apply(clean_data)
test['question_text'] = test['question_text'].apply(clean_data)

In [None]:
%%time

# stemming the text
from nltk.stem.porter import PorterStemmer
from nltk.stem import *

# Created on first use so that one stemmer instance is shared by every call.
_DEFAULT_STEMMER = None


def stem_corpus(data, stemmer=None):
    """Stem every whitespace-separated word of a sentence.

    The earlier version looped over ``data`` directly; iterating a string
    yields single characters, so the stemmer never received a whole word.
    The text is now split into words, each word is stemmed, and the stems
    are joined back with single spaces.

    NOTE(review): stems may not appear in a pre-trained embedding
    vocabulary; check the hit/miss counts printed when the embedding
    matrix is built after this change.

    Parameters
    ----------
    data : str
        Sentence to stem.
    stemmer : object, optional
        Any object with a ``stem(word)`` method. When omitted, a shared
        ``PorterStemmer`` is used.

    Returns
    -------
    str
        The stemmed words separated by single spaces (leading, trailing
        and repeated whitespace in ``data`` is not preserved).
    """
    global _DEFAULT_STEMMER
    if stemmer is None:
        if _DEFAULT_STEMMER is None:
            _DEFAULT_STEMMER = PorterStemmer()
        stemmer = _DEFAULT_STEMMER
    return " ".join(stemmer.stem(word) for word in data.split())


# Replace each question with the output of stem_corpus, for both splits.
train['question_text'] = train['question_text'].apply(stem_corpus)
test['question_text'] = test['question_text'].apply(stem_corpus)

In [None]:
#Converting uppercase letters to lowercase

def convert_2lowercase(data):
    """Return ``data`` converted to lowercase.

    The earlier version filtered on ``string.isupper`` without calling it;
    a bound method is always truthy, so the filter never excluded anything
    and every character was lowercased one by one. Lowercasing the whole
    string at once gives the intended result directly.

    Parameters
    ----------
    data : str or iterable of str
        Text to convert. A non-string iterable is still accepted: its
        items are lowercased and concatenated, as before.

    Returns
    -------
    str
        The lowercased text.
    """
    if isinstance(data, str):
        return data.lower()
    return ''.join(item.lower() for item in data)

# Lowercase the question column of both splits.
train['question_text'] = train['question_text'].apply(convert_2lowercase)
test['question_text'] = test['question_text'].apply(convert_2lowercase)

In [None]:
train.info()

In [None]:
test.info()

In [None]:
# Write the cleaned and preprocessed data sets to CSV files (relative paths,
# no index column) so they can be inspected outside the notebook.
train.to_csv("train_data.csv",sep=",",index=False)
test.to_csv("test_data.csv",sep=",",index=False)

## Split the train and validation data

In [None]:
# keras-tf 2.0
import tensorflow as tf
from tensorflow import keras

In [None]:
# keras-tf 2.0
from tensorflow.keras import regularizers
from keras import layers
from tensorflow.keras.layers import Input, Embedding, Bidirectional,LSTM,Dense,Flatten,Conv2D,Conv1D,GlobalMaxPooling1D,Concatenate,TimeDistributed
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import plot_model
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Inputs are the question strings; labels are the values of the `target` column.
X = train['question_text']
y = train['target'].values

# Hold out 20% of the rows for validation, with a fixed seed.
sentences_train, sentences_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=14
)

# Fit the Keras tokenizer on the training sentences only.
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(sentences_train)
word_index = tokenizer.word_index

# Number of distinct words seen by the tokenizer, plus 2 (the same size is
# used for the embedding matrix later on).
num_tokens = len(word_index) + 2
vocab_size = num_tokens

# Turn each sentence into a list of word indices and pad it at the end
# (padding='post') to a fixed length of `maxlen`.
maxlen = 128
X_train = pad_sequences(tokenizer.texts_to_sequences(sentences_train), padding='post', maxlen=maxlen)
X_val = pad_sequences(tokenizer.texts_to_sequences(sentences_val), padding='post', maxlen=maxlen)

print("Vocabulary Size / Unique Words in the Corpus:", num_tokens)

In [None]:
# Drop the name `train` and run a garbage-collection pass. The sentence and
# label arrays produced by the split are what the later cells use.
del train
gc.collect()

### Plot the history of the model

In [None]:
# generic function to plot the train Vs validation loss/accuracy:
def plot_history(history):
    """Plot training vs. validation accuracy and loss, one panel each.

    Only ``history.history`` is read: a mapping from metric name to a list
    of per-epoch values. Names containing 'loss' or 'acc' are plotted;
    names that also contain 'val' are drawn as the validation curve. If no
    training-loss entry exists, a message is printed and nothing is drawn.
    """
    logs = history.history

    def pick(fragment, validation):
        # Metric names containing `fragment`, split on whether they contain 'val'.
        return [name for name in logs if fragment in name and ('val' in name) == validation]

    train_loss, val_loss = pick('loss', False), pick('loss', True)
    train_acc, val_acc = pick('acc', False), pick('acc', True)

    if not train_loss:
        print('Loss is missing in history')
        return

    # The training loss fixes the x axis: one point per recorded epoch.
    epochs = range(1, len(logs[train_loss[0]]) + 1)

    def draw(names, colour, prefix):
        # One curve per metric; the legend shows the last epoch's value.
        for name in names:
            series = logs[name]
            plt.plot(epochs, series, colour, label=f'{prefix} ({series[-1]:.4f})')

    plt.figure(figsize=(25, 15))

    # Accuracy panel
    plt.subplot(2, 2, 1)
    draw(train_acc, 'b', 'Training accuracy')
    draw(val_acc, 'g', 'Validation accuracy')
    plt.title('Training Accuracy Vs Validation Accuracy\n')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    # Loss panel
    plt.subplot(2, 2, 2)
    draw(train_loss, 'b', 'Training loss')
    draw(val_loss, 'g', 'Validation loss')
    plt.title('Training Loss Vs Validation Loss\n')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.show()

### Generic function to plot the confusion matrix

In [None]:
def conf_matrix(actual, prediction, model_name):
    """Show a confusion-matrix heatmap for labels 0 and 1.

    `actual` and `prediction` are passed to sklearn's ``confusion_matrix``
    with ``labels=[0, 1]``; both axes are ticked 'Sincere' / 'Insincere'
    and `model_name` becomes the figure title.
    """
    class_names = ['Sincere', 'Insincere']
    counts = metrics.confusion_matrix(actual, prediction, labels=[0, 1])

    sns.set_context("notebook", font_scale=1.1)
    plt.figure(figsize=(5, 5))
    sns.heatmap(
        counts,
        annot=True,
        fmt='.0f',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted\n')
    plt.ylabel('True\n')
    plt.title(model_name)
    plt.show()

### Using the GloVe word embedding

In [None]:
import io
from tqdm import tqdm
import zipfile

## Build a dict mapping each GloVe token (string) to its NumPy vector.
embeddings_index = {}
glove_dim = 300       # coefficients per line in glove.840B.300d
skipped_lines = 0     # lines that do not have token + glove_dim fields

with zipfile.ZipFile("../input/quora-insincere-questions-classification/embeddings.zip") as zf:
    with io.TextIOWrapper(zf.open("glove.840B.300d/glove.840B.300d.txt"), encoding="utf-8") as f:
        for line in tqdm(f):
            # Split from the right so the last `glove_dim` fields are always
            # the coefficients. Splitting at the first whitespace puts part of
            # the token into the coefficient field whenever the token itself
            # contains whitespace, which leaves a wrong-length vector behind.
            fields = line.rstrip().rsplit(" ", glove_dim)
            if len(fields) != glove_dim + 1:
                skipped_lines += 1
                continue
            # float32 halves the memory of the default float64.
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype="float32")

if skipped_lines:
    print("Skipped %s malformed lines." % skipped_lines)
print("Found %s word vectors." % len(embeddings_index))

In [None]:
gc.collect()

### Designing Embedding Matrix

In [None]:
## Prepare the embedding matrix for a Keras Embedding layer: row i holds the
## pre-trained vector of the word whose tokenizer index is i. Rows of words
## without a usable vector stay all-zero.

word_index = tokenizer.word_index
num_tokens = len(word_index) + 2
embedding_dim = 300

hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # A usable vector must exist and have exactly `embedding_dim` entries.
    # This explicit check replaces a bare `except:` around the assignment,
    # which counted any exception raised in the loop as a miss.
    if embedding_vector is not None and np.shape(embedding_vector) == (embedding_dim,):
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1

print("Converted %d words (%d misses)" % (hits, misses))


# Load the pre-trained word embeddings matrix into a frozen Embedding layer.
# NOTE(review): the model cell builds its own Embedding from `embedding_matrix`
# and does not reference this layer - confirm whether it is still needed.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False
)

# Build the model

In [None]:
embedding_dim = 300


# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer as an `embedding_dim`-dimensional (300) vector taken from
# the pre-trained matrix; trainable=False keeps these weights frozen.
# NOTE(review): `layers` is imported from `keras` while `keras.Input` comes from
# `tensorflow.keras` - confirm both resolve to the same Keras installation.
x = layers.Embedding(num_tokens,embedding_dim,weights=[embedding_matrix],input_length=maxlen,trainable=False)(inputs)
# Add 2 bidirectional LSTMs: the first returns the full sequence so the second
# can consume it, the second returns only its final output.
x = layers.Bidirectional(layers.LSTM(64,return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64,return_sequences=False))(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.5)(x)
# Add a classifier: a single sigmoid unit for the binary target
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()


opt = Adam(learning_rate=0.01)


# Define the callbacks.
# Stop when val_loss has not improved for 3 epochs and restore the best weights.
early_stopping=tf.keras.callbacks.EarlyStopping(
                                                monitor="val_loss",
                                                patience=3,
                                                mode="min",
                                                restore_best_weights=True
                                              )
# Reduce the learning rate (multiply by 0.2) when val_loss has not improved for 2 epochs.
reduce_lr=tf.keras.callbacks.ReduceLROnPlateau(
                                                monitor="val_loss",
                                                factor=0.2,
                                                patience=2,
                                                verbose=1,
                                                mode="auto"
                                            )

my_callbacks=[early_stopping,reduce_lr]

model.compile(optimizer=opt,loss='binary_crossentropy',metrics=['accuracy'])

## Validate the fit 

In [None]:
%%time

# Number of fit/evaluate passes of the loop below (each pass itself trains
# for up to 5 Keras epochs).
epochs=2

for e in range(epochs):
    # Continue training the same model. `history` is overwritten on every pass,
    # so only the last pass is available to later cells.
    # NOTE(review): validation_split=0.2 takes a further validation slice out of
    # X_train for the callbacks; X_val is used only for the threshold search below.
    history = model.fit(X_train,y_train,batch_size=256,epochs=5,validation_split=0.2,callbacks=my_callbacks)
    pred_val_y = model.predict([X_val], batch_size=512, verbose=1)
    # Search thresholds 0.10 .. 0.50 in steps of 0.01 for the best validation F1.
    # The search restarts on every pass, so the values left after the loop
    # belong to the last pass.
    best_thresh = 0.5
    best_score = 0.0
    for thresh in np.arange(0.1, 0.501, 0.01):
        # Round away floating-point noise from arange
        thresh = np.round(thresh, 2)
        score = metrics.f1_score(y_val, (pred_val_y>thresh).astype(int))
        # Strict '>' keeps the lowest threshold among equal scores
        if score > best_score:
            best_thresh = thresh
            best_score = score
    print("Val F1 Score: {:.6f}".format(best_score))

In [None]:
gc.collect()

In [None]:
best_thresh

In [None]:
best_score

## Plot history

In [None]:
plot_history(history)

## Metrics on Validation Data

In [None]:
# Binarise the validation probabilities with the threshold found by the search
pred_y_val = (pred_val_y>best_thresh).astype(int)
print("Metrics\n")
# Precision / recall / F1 per class on the held-out validation rows
print(metrics.classification_report(y_val,pred_y_val))

## Confusion Matrix on Validation Data

In [None]:
conf_matrix(y_val,pred_y_val,'Bi-RNN Model Validation Scores\n')

### Prediction - Glove

In [None]:
# Encode the test questions the same way the training questions were encoded.
# `test` has already been through clean_data / stem_corpus / convert_2lowercase
# in the preprocessing cells, and `tokenizer` is the one fitted on the training
# sentences, so its word indices match the rows of the embedding matrix.
# Previously the raw CSV was read again (skipping that preprocessing) and a new
# Tokenizer was fitted on the test text, which gives words indices unrelated to
# the ones the model was trained on and also replaced the global `tokenizer`.
test_df    = test
test_input = np.array(test_df['question_text'])
ids        = test_df['qid']

sequences = tokenizer.texts_to_sequences(test_input)
# Same length and padding side as X_train / X_val
test_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post')
test_padded = test_sequences
test_padded.shape

In [None]:
gc.collect()

## Final Submission - Glove - BiLSTM

In [None]:
# Score the padded test sequences, then turn the probabilities into 0/1 labels
# with the threshold chosen on the validation data.
test_predictions = model.predict(test_padded)
predictions = (test_predictions > best_thresh).astype(int).flatten()

pd.set_option('display.max_colwidth', None)

# submission file: one row per question id with its predicted label
out_df = pd.DataFrame({'qid': ids, 'prediction': predictions})
out_df.to_csv("submission.csv", index=False)

In [None]:
out_df.head()