In [None]:
# Files supplied by Quora dataset:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)


In [None]:
import os.path
# print(os.getcwd())
if not os.path.isfile("/kaggle/working/glove.840B.300d/glove.840B.300d.txt") :
    print("Extracting")
    !unzip -j /kaggle/input/quora-insincere-questions-classification/embeddings.zip "glove.840B.300d/glove.840B.300d.txt" -d "/kaggle/working/glove.840B.300d/"

#### Install the attention module. Multiple implementations are available, both outside Keras and within Keras itself. Third-party packages are installed below for demo purposes

In [None]:
!pip install keras-self-attention
!pip install attention

In [None]:
# List the competition input folder (long format, sorted by time in reverse order, human-readable sizes).
!ls -ltrh /kaggle/input/quora-insincere-questions-classification/ 

#### Import all necessary modules in one go:

In [None]:
import os
import tensorflow as tf
# Switch the working directory to the competition folder; the CSV files are
# later opened by bare file name ('train.csv', 'test.csv') relative to it.
os.chdir('/kaggle/input/quora-insincere-questions-classification')
import pandas as pd
import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense, Input, Flatten
from keras.layers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

#### Read the dataset into train and test dataframes. In this notebook we will use only train, and split it for training and validation. The supplied test.csv is meant for the competition and is not useful for this attention demo

In [None]:
# Load both CSV files; the names are resolved against the current working directory.
train = pd.read_csv('train.csv')
trainAll = train  # second name bound to the same full training frame
test = pd.read_csv('test.csv')

In [None]:
# Preview the first five rows of the training frame
train.head(n=5)

#### Define a function below to plot the history of the model training output. Useful for understanding whether the model is overfitting

In [None]:
import matplotlib.pyplot as plt
# Visualize training history
def plot_history(history):
    """Plot the per-epoch training and validation curves of a fit run.

    Two figures are shown one after the other: accuracy first, then loss.

    Args:
        history: object whose ``history`` attribute is a mapping containing
            the keys ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``
            (as returned by ``model.fit`` in this notebook).
    """
    recorded = history.history
    # One row per figure: (training key, validation key, title, y-axis label)
    panels = (
        ('accuracy', 'val_accuracy', 'model accuracy', 'accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss'),
    )
    for train_key, val_key, title, y_label in panels:
        plt.plot(recorded[train_key])
        plt.plot(recorded[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        # The validation curve is labelled 'test' in the legend.
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

from IPython.display import Image, display
def plot_model(model):
    """Write a diagram of ``model`` to a PNG file and display that image.

    Args:
        model: the Keras model to draw; it is passed straight to
            ``tf.keras.utils.plot_model``.
    """
    png_path = "/kaggle/working/model.png"
    tf.keras.utils.plot_model(
        model,
        to_file=png_path,
        show_shapes=True,
        show_dtype=False,
        show_layer_names=True,
        rankdir="TB",
        expand_nested=True,
    )
    display(Image(filename=png_path))

### Read a portion of the dataset for training
#### Take just 1% of the dataset for quicker training. If the whole dataset is to be used, the model size needs to be increased.

In [None]:
def preprocessing_text(df):
    """Return the ``question_text`` column with contractions and quotes normalised.

    Every value is converted with ``str()`` and then, in this order:
    ``n't`` becomes `` not``, single quotes are removed, double quotes are
    removed.

    The cleaning is applied with a single ``Series.map`` call instead of a
    row-by-row ``iterrows``/``df.at`` loop, so it is much faster and never
    writes into the caller's data.

    Args:
        df: a Series named ``question_text`` or a DataFrame with a
            ``question_text`` column.

    Returns:
        A new Series with the cleaned text, keeping the input's index and
        the name ``question_text``.

    Raises:
        KeyError: if no ``question_text`` column is present.
    """
    def _clean(value):
        # str() first, so non-string cells (numbers, NaN) are turned into text.
        return str(value).replace('n\'t', ' not').replace('\'', '').replace('\"', '')

    # Wrapping in DataFrame lets both Series and DataFrame inputs be handled the same way.
    questions = pd.DataFrame(df)['question_text']
    return questions.map(_clean)
# preprocessing_text(X)

In [None]:
# Fixed seed so the 1% sample and the train/validation split are identical on
# every run (previously both were drawn from unseeded random state).
SEED = 42

# Take just 1% of the training data for this exercise
train = trainAll.sample(frac=0.01, random_state=SEED)
X = preprocessing_text(train["question_text"])
y = train["target"]

embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 200 # max number of words in a question to use

# Fit the tokenizer on the cleaned questions and turn each one into a list of word indices.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X))
X_sequences = tokenizer.texts_to_sequences(X)

# Hold out 10% of the sample for validation.
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y, test_size=0.1, random_state=SEED)
# Pad (at the end) every sequence to exactly maxlen entries.
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')
print(X_train[:10])

### Prepare embedding layer
#### Use the glove embedding file to prepare the Embedding layer for models below

In [None]:
# Path of the GloVe text file that construct_embedding_matrix reads.
glove_file="/kaggle/working/glove.840B.300d/glove.840B.300d.txt"
import tqdm

# Number of columns of the embedding matrix (values kept per word vector).
EMBEDDING_VECTOR_LENGTH = 300 #50 # <=200
# Words for which no GloVe vector was found; appended to by construct_embedding_matrix.
OOV=[]
def construct_embedding_matrix(glove_file, word_index):
    """Build the embedding matrix for ``word_index`` from a GloVe text file.

    Each line of the file is split on single spaces; the first field is the
    word and the remaining fields are parsed as float32 values. Only words
    present in ``word_index`` are kept.

    Lines whose values cannot be parsed as floats are reported and skipped.
    (Previously the failure was swallowed and the vector of the previously
    parsed word was stored under the current word.)

    Args:
        glove_file: path of the GloVe text file.
        word_index: mapping of word -> integer row index in the matrix.

    Returns:
        A numpy array of shape ``(len(word_index) + 1, EMBEDDING_VECTOR_LENGTH)``.
        Rows of words without a usable vector stay all-zero.

    Side effects:
        Appends every word without a vector to the module-level ``OOV`` list
        and prints its first 100 entries.
    """
    global OOV
    embedding_dict = {}
    # Explicit encoding so parsing does not depend on the platform default
    # (the file is assumed to be UTF-8 text).
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Drop the trailing newline so it cannot end up in the last value.
            values = line.rstrip().split(' ')
            # get the word
            word = values[0]
            if word not in word_index:
                continue
            # get the vector
            try:
                vector = np.asarray(values[1:], 'float32')
            except ValueError:
                # Non-numeric fields (e.g. a token that itself contains spaces):
                # skip the line instead of storing a stale or undefined vector.
                print("Error at:", values[:5])
                continue
            embedding_dict[word] = vector

    # OOV words (out-of-vocabulary words) will be mapped to zero vectors
    num_words = len(word_index) + 1
    # initialize it to 0
    embedding_matrix = np.zeros((num_words, EMBEDDING_VECTOR_LENGTH))

    for word, i in tqdm.tqdm(word_index.items()):
        if i < num_words:
            vect = embedding_dict.get(word)
            if vect is not None and len(vect) > 0:
                # Keep only the first EMBEDDING_VECTOR_LENGTH values of the vector.
                embedding_matrix[i] = vect[:EMBEDDING_VECTOR_LENGTH]
            else:
                OOV.append(word)

    print("OOV words.. first 100:", OOV[:100])
    print("OOV word printed above (if any), should be handled for good results.")

    return embedding_matrix
  
# Build the matrix only when it does not exist yet (or is None), so that
# re-running this cell does not parse the GloVe file again.
if globals().get('embedding_matrix') is None:
    embedding_matrix = construct_embedding_matrix(glove_file, tokenizer.word_index)

In [None]:
# Report how many tokenizer words ended up without a GloVe vector.
oov_total = len(OOV)
print("Number of words not understood by glove (need more preprocesing): ", oov_total)

In [None]:
from keras.initializers import Constant
# Frozen embedding layer whose weights are initialised from embedding_matrix.
vocabulary_rows = len(tokenizer.word_index) + 1  # one row per tokenizer word, plus row 0
glove_weights = Constant(embedding_matrix)
embedding = Embedding(
    vocabulary_rows,                      # number of rows (unique tokens)
    EMBEDDING_VECTOR_LENGTH,              # number of features per token
    embeddings_initializer=glove_weights,
    input_length=maxlen,
    trainable=False,
)

### I will run on a TPU, as I am a bit impatient

In [None]:
# Resolve the TPU attached to this session and connect to it.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# instantiate a distribution strategy
# tf.distribute.TPUStrategy is the stable API; the
# tf.distribute.experimental.TPUStrategy alias used before is deprecated.
tpu_strategy = tf.distribute.TPUStrategy(tpu)

# instantiating the model in the strategy scope creates the model on the TPU

#### As we are going to build multiple models below, let's define the common layers

In [None]:
# Module-level handles shared by the model cells; filled in by rebuild_layers().
lstm = None
dropout = None
dense_last = None
flatten = None
epochs = None
opt = None


def rebuild_layers():
    """Create fresh layer objects, a fresh optimizer and the epoch count.

    Rebinds the module-level names ``lstm``, ``dropout``, ``dense_last``,
    ``epochs`` and ``opt``. ``flatten`` is declared global here but is not
    assigned, so it keeps whatever value it had before the call.
    """
    global lstm, dropout, dense_last, epochs, opt, flatten
    # Training settings
    epochs = 7
    opt = keras.optimizers.Adam(learning_rate=0.001)
    # Layers
    lstm = LSTM(20, return_sequences=True)
    dropout = Dropout(0.05)
    dense_last = Dense(5, kernel_regularizer=keras.regularizers.l2())

## A simple LSTM model 
#### As the amount of data is very small, just a single small LSTM layer is used. The model overfits quickly, so a dropout layer (5% dropout) is added too

In [None]:
# Start from fresh layers / optimizer so nothing is shared with a previously trained model.
rebuild_layers()
with tpu_strategy.scope():
    model = tf.keras.Sequential() # define your model normally
    model.add(Input(shape=(maxlen,)))
    model.add(embedding)
    model.add(lstm)
    model.add(dropout)
    model.add(dense_last)
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    plot_model(model)

history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=64)
plot_history(history)

##### Note the accuracy of above model training in final epoch. For current run it was: 96.02% 
##### (Note: It may vary with each run due to presence of random sample of dataset and random dropout on each run)

## Bidirectional LSTM model without Attention layer

In [None]:
# Start from fresh layers / optimizer so nothing is shared with the previously trained model.
rebuild_layers()
with tpu_strategy.scope():
    model = tf.keras.Sequential() # define your model normally
    model.add(Input(shape=(maxlen,)))
    model.add(embedding)
    # Same LSTM layer as before, now wrapped so the sequence is read in both directions.
    model.add(Bidirectional(lstm))
    model.add(dropout)
    model.add(dense_last)
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
plot_model(model)

history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=64)
plot_history(history)

##### Note the accuracy of above model training in final epoch. For current run it was: 96.48%  - slightly better than non bidirectional version above

##### For the attention code, we have to use the Keras functional API instead of Sequential, because attention takes a different route in the flow, as seen in the model plot below

## Same bidirectional model as above with an Attention (self-attention) layer (for comparison of results)

In [None]:
from attention import Attention
# Start from fresh layers / optimizer (one call is enough; it was previously called twice).
rebuild_layers()
with tpu_strategy.scope():
    model_input   = Input(shape=(maxlen,))
    x = Embedding(len(tokenizer.word_index)+1, # number of unique tokens
                    EMBEDDING_VECTOR_LENGTH, #number of features
                    embeddings_initializer=Constant(embedding_matrix), # initialize 
                    input_length=maxlen, 
                    trainable=False)(model_input)
    x = Bidirectional(LSTM(20, return_sequences=True))(x) #model_input
    # Attention layer applied on the full sequence of bidirectional LSTM outputs.
    x = Attention(50)(x)
    x = dropout(x)
    x = dense_last(x)
    x = Flatten()(x)
    x = Dense(1, activation='sigmoid')(x)
    modelAtt = keras.models.Model(model_input, x)
    modelAtt.compile(loss='binary_crossentropy', optimizer=opt,metrics=["accuracy"], experimental_run_tf_function=False) #, run_eagerly=True
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    modelAtt.summary()
    plot_model(modelAtt)
    
history = modelAtt.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=64)
plot_history(history)

##### Note the accuracy of above model training in final epoch. For current run it was: 97.02%  better than non attention version above

# It is clear from the last 2 models above that applying the attention layer has improved the accuracy, as the model has learnt faster than the non-attention version