In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/test.csv


In [22]:
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.model_selection import train_test_split

# Work on a random subset of the training file; the sample is seeded so that a
# Restart & Run All reproduces the same rows (and therefore the same vocabulary).
# NOTE(review): the saved cell output reports 100000 rows while this samples 10000 —
# confirm which size is intended.
data = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv').sample(
    10000, random_state=1
)

# Lower-case BEFORE stripping: the pattern keeps only a-z and whitespace, so without the
# lower() every capital letter would be deleted ("What" -> "hat").
# The pattern is a raw string (avoids the invalid-escape warning for "\s") and regex=True is
# explicit because the default of `regex` differs between pandas versions.
docs = (
    data['question_text']
    .fillna('NA')
    .str.lower()
    .str.replace(r'[^a-z\s]', '', regex=True)
)

# 80/20 train/validation split with a fixed seed.
train_x, validate_x, train_y, validate_y = train_test_split(
    docs, data['target'], test_size=0.2, random_state=1
)
data.shape

  """


(100000, 3)

In [3]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
import tensorflow as tf

In [4]:
# Fit the word index on the training questions only, so the validation set contributes
# no vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_x)
# One extra slot beyond the number of indexed words (Keras word indices start at 1,
# leaving row 0 for padding).
vocab_size = 1 + len(tokenizer.word_index)

# Turn each question into a list of word indices.
encoded_docs_train_x, encoded_docs_validate_x = (
    tokenizer.texts_to_sequences(texts) for texts in (train_x, validate_x)
)

# Pad (at the end) every sequence to the length of the longest training sequence.
max_length = max(map(len, encoded_docs_train_x))
padded_docs_train_x, padded_docs_validate_x = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (encoded_docs_train_x, encoded_docs_validate_x)
)

### Model with trainable word embeddings

In [5]:
def recall_m(y_true, y_pred):
    """Recall of the rounded predictions against the labels.

    Both tensors are clipped to [0, 1] and rounded before counting, so the result is
    (number of true positives) / (number of positive labels + epsilon). The epsilon
    keeps the division defined when there are no positive labels.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the rounded predictions against the labels.

    Both tensors are clipped to [0, 1] and rounded before counting, so the result is
    (number of true positives) / (number of predicted positives + epsilon). The epsilon
    keeps the division defined when nothing is predicted positive.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: the harmonic mean of `precision_m` and `recall_m` on the same tensors.

    An epsilon is added to the denominator so the score is 0 rather than undefined
    when both precision and recall are 0.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2*((p*r)/(p+r+K.epsilon()))

In [54]:
# Size of the word vectors learned from scratch by this model.
embedding_dim = 300

# Embedding -> bidirectional LSTM -> dense hidden layer -> single sigmoid output.
model = Sequential([
    Embedding(vocab_size, embedding_dim, trainable=True),
    Bidirectional(LSTM(64, activation='tanh')),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', f1_m],
)

In [55]:
# Early stopping watches the validation loss: validation data is supplied to fit(), and
# stopping on the training loss (which keeps falling while the model overfits) would not
# protect against overfitting.
# Note: with epochs=2 and patience=2 the callback cannot trigger; it takes effect only
# if the epoch count is raised.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
model.fit(
    padded_docs_train_x, train_y,
    validation_data=(padded_docs_validate_x, validate_y),
    epochs=2,
    batch_size=1024,
    verbose=1,
    callbacks=[callback],
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f365de4cf10>

### Model using Pre-trained Word Embeddings

In [6]:
from zipfile import ZipFile
# Archive shipped with the competition that bundles several pre-trained embedding sets.
zip_path = '/kaggle/input/quora-insincere-questions-classification/embeddings.zip'
# `zf` is left open on purpose: a later cell streams the GloVe file straight out of it.
zf = ZipFile(zip_path, mode='r')
# Show the archive members.
zf.infolist()

[<ZipInfo filename='GoogleNews-vectors-negative300/' filemode='drwxrwxr-x' external_attr=0x10>,
 <ZipInfo filename='glove.840B.300d/' filemode='drwxrwxr-x' external_attr=0x10>,
 <ZipInfo filename='paragram_300_sl999/' filemode='drwxr-xr-x' external_attr=0x10>,
 <ZipInfo filename='wiki-news-300d-1M/' filemode='drwxrwxr-x' external_attr=0x10>,
 <ZipInfo filename='glove.840B.300d/glove.840B.300d.txt' compress_type=deflate filemode='-rw-rw-r--' file_size=5646236541 compress_size=2178478737>,
 <ZipInfo filename='GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin' compress_type=deflate filemode='-rw-rw-r--' file_size=3644258522 compress_size=1746270195>,
 <ZipInfo filename='wiki-news-300d-1M/wiki-news-300d-1M.vec' compress_type=deflate filemode='-rw-r--r--' file_size=2259088777 compress_size=682384991>,
 <ZipInfo filename='paragram_300_sl999/README.txt' compress_type=deflate filemode='-rw-r--r--' file_size=731 compress_size=441>,
 <ZipInfo filename='paragram_300_sl999/paragram

### Read GloVe Embeddings Text File
- Open the GloVe text file inside the zip archive
- Go through it line by line
    - Split the line by space
    - The first element is the word
    - The remaining elements are the vector representation of the word
    - If the word is in the tokenizer vocabulary, update the dictionary (key=word; value=vector)

In [13]:
# Words known to the fitted tokenizer (the keys view of `word_index`); used when reading
# the GloVe file to decide which pre-trained vectors are kept.
vocab = tokenizer.word_index.keys()

In [7]:
# Member of the embeddings archive that holds the GloVe vectors as text.
glove_path = 'glove.840B.300d/glove.840B.300d.txt'

# word -> list of floats, restricted to words present in the tokenizer vocabulary.
embeddings = {}
with zf.open(glove_path) as file:
    for raw_line in file:
        # Each line is "<word> <v1> <v2> ..." separated by single spaces.
        word, *values = raw_line.decode('utf-8').replace('\n', '').split(' ')
        if word not in vocab:
            continue
        # Only matched words are converted to floats; `vector` is assigned at module level
        # and is read again by the cell that builds the embedding matrix.
        vector = list(map(float, values))
        embeddings[word] = vector

In [19]:
# Take the vector size from the loaded embeddings themselves instead of from `vector`,
# a loop variable left over from the cell that reads the GloVe file: that name is undefined
# (or stale from an earlier run) when no vocabulary word was matched.
if not embeddings:
    raise ValueError(
        'No pre-trained vectors matched the tokenizer vocabulary; '
        'cannot build the embedding matrix.'
    )
embedding_dim = len(next(iter(embeddings.values())))

# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# Words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    pretrained = embeddings.get(word)
    if pretrained is not None:
        embedding_matrix[index] = pretrained

In [21]:
# Same architecture as the trainable-embedding model, but the embedding layer starts from
# the pre-trained matrix. No `trainable=False` is passed, so these weights are presumably
# still fine-tuned during training — confirm that this is intended.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix]))
model.add(Bidirectional(LSTM(64, activation='tanh')))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', f1_m])

# Early stopping watches the validation loss: validation data is supplied to fit(), and
# stopping on the training loss would not protect against overfitting.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
model.fit(
    padded_docs_train_x, train_y,
    validation_data=(padded_docs_validate_x, validate_y),
    epochs=5,
    batch_size=1024,
    verbose=1,
    callbacks=[callback],
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fe763047950>