In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip
/kaggle/input/quora-insincere-questions-classification/test.csv
/kaggle/input/quora-insincere-questions-classification/sample_submission.csv


In [2]:
# Open the embeddings archive, list its members, and get a readable stream
# for the GloVe vectors (the member is read in place; nothing is unpacked to disk).
import zipfile

archive_path = '/kaggle/input/quora-insincere-questions-classification/embeddings.zip'
glove_member = 'glove.840B.300d/glove.840B.300d.txt'

zip_ref = zipfile.ZipFile(file=archive_path, mode='r')
print(zip_ref.namelist())
embeddings = zip_ref.open(name=glove_member, mode='r')

['GoogleNews-vectors-negative300/', 'glove.840B.300d/', 'paragram_300_sl999/', 'wiki-news-300d-1M/', 'glove.840B.300d/glove.840B.300d.txt', 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin', 'wiki-news-300d-1M/wiki-news-300d-1M.vec', 'paragram_300_sl999/README.txt', 'paragram_300_sl999/paragram_300_sl999.txt']


In [3]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into its word and its vector.

    Args:
        word: The token the vector belongs to.
        *arr: The remaining fields of the line (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector
# Build the word -> vector lookup by streaming the GloVe member line by line:
# each line is decoded, split on single spaces, and handed to get_coefs
# (first field = word, remaining fields = vector components).
try:
    embeddings_index = dict(get_coefs(*o.decode().split(" ")) for o in embeddings)
finally:
    # The member stream and the archive are only needed for this single pass;
    # close both so the file handles are not leaked for the rest of the session.
    embeddings.close()
    zip_ref.close()

In [4]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

Stopwords list from https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js

In [5]:
# Lowercase English stop words (including contracted forms such as "he'd")
# that remove_stop_words strips from the question text before tokenization.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Load the labelled training questions and the unlabelled test questions.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/quora-insincere-questions-classification/train.csv',
        '/kaggle/input/quora-insincere-questions-classification/test.csv',
    )
)

In [7]:
print(train_data.columns)

# Pull the raw question strings and the labels out of the frames as plain Python lists.
train_input = train_data['question_text'].tolist()
train_label = train_data['target'].tolist()
test_input = test_data['question_text'].tolist()

Index(['qid', 'question_text', 'target'], dtype='object')


Remove stop words from the data

In [8]:
def remove_stop_words(x, stop_words=None):
    """Return ``x`` with every stop-word token removed.

    The text is split on single spaces and each token is compared
    case-insensitively against the stop-word collection, so stop words are
    dropped wherever they occur: at the start or end of the text, when
    capitalised ("What", "The"), and when the same word repeats back to back.
    (Matching the substring ``" word "`` misses all three of those cases.)
    Tokens with attached punctuation (e.g. ``"the,"``) are left untouched.

    Args:
        x: The text to filter.
        stop_words: Optional iterable of lowercase stop words. Defaults to the
            module-level ``stopwords`` list.

    Returns:
        The remaining tokens joined back together with single spaces.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set makes the per-token membership test O(1) instead of scanning the list.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)
    return " ".join(tok for tok in x.split(" ") if tok.lower() not in stop_words)

# Strip stop words from every training and test question.
train_input_rsw = [remove_stop_words(question) for question in train_input]
test_input_rsw = [remove_stop_words(question) for question in test_input]

Create an embedding matrix from the imported embeddings for the words that appear in the training data.

In [9]:
# Learn the vocabulary from the stop-word-filtered training questions.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_input_rsw)
word_index = tokenizer.word_index

# One row per word index, 300 columns per row; one extra row beyond
# len(word_index) (presumably because index 0 is reserved for padding - confirm).
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, 300))

# Copy in the pretrained vector for each known word; words without a
# pretrained vector keep their all-zero row.
for word, index in word_index.items():
    if word in embeddings_index:
        embedding_matrix[index] = embeddings_index[word]

print(embedding_matrix.shape)

(222161, 300)


In [10]:
import gc

# The vectors that are needed have been copied into embedding_matrix, so drop
# the full lookup and ask the garbage collector to reclaim what it can.
del embeddings_index
gc.collect()

42

In [11]:
# Fixed sequence length: questions are padded/truncated to this many tokens
# and the Embedding layer is built with the same input length.
max_length = 30

Convert text to numbers and pad them for processing

In [12]:
# Same padding rules for both splits: fixed length, pad and cut at the end.
pad_options = dict(maxlen=max_length, padding='post', truncating='post')

# Training questions -> integer ids -> fixed-length matrix.
sequences = tokenizer.texts_to_sequences(train_input_rsw)
train_input_padded = pad_sequences(sequences, **pad_options)
print(train_input_padded.shape)

# Test questions, encoded with the vocabulary fitted on the training data.
sequences = tokenizer.texts_to_sequences(test_input_rsw)
test_input_padded = pad_sequences(sequences, **pad_options)
print(test_input_padded.shape)

(1306122, 30)
(375806, 30)


`has_cv` controls how the training data is split: if it is `False`, the model trains on the whole dataset; otherwise the data is split 9:1 and the model trains on 90% of it, with the remaining 10% held out for validation.

In [13]:
# Switch: True holds out a validation split, False trains on every labelled row.
has_cv = False
test_split_size = 0.1 if has_cv else 0

if not has_cv:
    # No hold-out: use the padded inputs and labels as they are.
    train_text, train_target = train_input_padded, train_label
else:
    train_text, cv_text, train_target, cv_target = train_test_split(
        train_input_padded,
        train_label,
        test_size=test_split_size,
        random_state=2,
    )

print(f'Train Input Shape : {len(train_text)}')
print(f'Train Label Shape : {len(train_target)}')
if has_cv:
    print(f'CV Input Shape : {len(cv_text)}')
    print(f'CV label Shape : {len(cv_target)}')

Train Input Shape : 1306122
Train Label Shape : 1306122


In [14]:
tf.random.set_seed(123)

keras_layers = tf.keras.layers
vocab_rows = len(word_index) + 1

# Frozen pretrained embeddings -> BiLSTM -> Conv1D/max-pool -> LSTM -> sigmoid output.
layer_stack = [
    keras_layers.Embedding(
        vocab_rows,
        300,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    keras_layers.Bidirectional(keras_layers.LSTM(128, return_sequences=True)),
    keras_layers.Dropout(0.2),
    keras_layers.Conv1D(64, 5, activation='relu'),
    keras_layers.MaxPooling1D(pool_size=4),
    keras_layers.LSTM(64),
    keras_layers.Dropout(0.2),
    keras_layers.Dense(1, activation="sigmoid"),
]
model = tf.keras.models.Sequential(layer_stack)

model.summary()

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 300)           66648300  
_________________________________________________________________
bidirectional (Bidirectional (None, 30, 256)           439296    
_________________________________________________________________
dropout (Dropout)            (None, 30, 256)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 26, 64)            81984     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 6, 64)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0

In [15]:
epochs = 5

# Shared fit settings; validation data is only supplied when a CV split exists.
fit_options = {'epochs': epochs, 'batch_size': 1024}
if has_cv:
    fit_options['validation_data'] = (np.array(cv_text), np.array(cv_target))

history = model.fit(np.array(train_text), np.array(train_target), **fit_options)

Train on 1306122 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


When the model was trained on 90% of the data, the best threshold found was 0.29, so that value is reused for the model trained on the full training data.

In [16]:
if (has_cv):
    # Sweep decision thresholds on the held-out split and keep the one with
    # the highest F1 score.
    from sklearn.metrics import f1_score
    cv_predictions = model.predict(cv_text, batch_size=1024)

    # Each entry is a [threshold, f1] pair.
    thresholds = []
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        result = f1_score(cv_target, (cv_predictions>thresh).astype(int))
        thresholds.append([thresh, result])
        print("F1 score at threshold {0} is {1}".format(thresh, result))

    # Best F1 first.
    thresholds.sort(key=lambda x: x[1], reverse=True)
    print("Best value {0}".format(thresholds[0]))
    # Keep only the threshold, not the whole [threshold, f1] pair: the value
    # is later compared element-wise against the predicted probabilities.
    best_thresh = thresholds[0][0]
else:
    # No validation split to tune on: fall back to the fixed threshold.
    best_thresh = 0.29

To Analyse training

In [17]:
 if (has_cv):
     import matplotlib.image  as mpimg
     import matplotlib.pyplot as plt

     #-----------------------------------------------------------
     # Retrieve the per-epoch training and validation metrics
     # recorded by model.fit
     #-----------------------------------------------------------
     acc=history.history['accuracy']
     val_acc=history.history['val_accuracy']
     loss=history.history['loss']
     val_loss=history.history['val_loss']

     # x-axis values, one per recorded epoch. Kept under its own name so the
     # integer `epochs` passed to model.fit is not overwritten by a range.
     epoch_axis=range(len(acc))

     #------------------------------------------------
     # Plot training and validation accuracy per epoch
     #------------------------------------------------
     plt.plot(epoch_axis, acc, 'r')
     plt.plot(epoch_axis, val_acc, 'b')
     plt.title('Training and validation accuracy')
     plt.xlabel("Epochs")
     plt.ylabel("Accuracy")
     plt.legend(["Accuracy", "Validation Accuracy"])

     # Start a second figure for the loss curves.
     plt.figure()

     #------------------------------------------------
     # Plot training and validation loss per epoch
     #------------------------------------------------
     plt.plot(epoch_axis, loss, 'r')
     plt.plot(epoch_axis, val_loss, 'b')
     plt.title('Training and validation loss')
     plt.xlabel("Epochs")
     plt.ylabel("Loss")
     plt.legend(["Loss", "Validation Loss"])

     # Render both figures (instead of opening an empty third one).
     plt.show()

Print the confusion matrix for the cross-validation (CV) split.

In [18]:
if (has_cv):
    # Hard class labels for the held-out split. np.around rounds the predicted
    # probabilities, i.e. this table uses a 0.5 cut-off rather than best_thresh.
    cv_class_predictions = model.predict(cv_text, batch_size=1024)
    cv_class_predictions = np.around(cv_class_predictions).astype(int)
    df = pd.DataFrame({'pred': cv_class_predictions.flatten(), 'actual': cv_target})
    # Bare expressions inside an `if` body are not auto-displayed by the
    # notebook, so print the preview and the confusion matrix explicitly.
    print(df.head())
    print(pd.crosstab(df['pred'], df['actual'], margins=True))

In [19]:
# Predicted probabilities for the test questions. Use the same large batch
# size as training and CV prediction instead of the much smaller default,
# which only changes how fast inference runs.
predictions = model.predict(test_input_padded, batch_size=1024)

In [20]:
# Turn probabilities into 0/1 labels using the chosen decision threshold.
predictions = (predictions > best_thresh).astype(int)

# One row per test question: its id and the predicted label.
submission_columns = {'qid': test_data.qid, 'prediction': predictions.flatten()}
output = pd.DataFrame(submission_columns)

output.to_csv('submission.csv', index=False)