In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
print(tf.__version__)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    file_paths = (os.path.join(current_dir, name) for name in file_names)
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset Exploration

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Load the labelled training data and plot how often each target value occurs.
df = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')
target_counts = df['target'].value_counts()
target_counts.plot.bar(title='Target')
plt.show()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

def summarize_data(corpus):
    """
    Print summary statistics and draw two histograms describing a text corpus.

    Each document is converted with ``str()`` and split on runs of whitespace,
    so repeated spaces, tabs or newlines never produce empty-string tokens.

    Parameters
    ----------
    corpus : sequence
        Documents to summarise; must support ``len()`` and iteration.

    Returns
    -------
    None
        The statistics are printed and the histograms are drawn on a new
        matplotlib figure.
    """

    # tokenize every document exactly once and reuse the result below
    tokenized_docs = [str(doc).split() for doc in corpus]

    # get the documents size (number of tokens per document)
    df_doc_size = pd.Series([len(tokens) for tokens in tokenized_docs])

    # get the tokens in the corpus (one entry per token occurrence)
    df_tokens = pd.Series([token for tokens in tokenized_docs for token in tokens])

    # occurrences per distinct token; used for the unique count and the histogram
    token_counts = df_tokens.value_counts()

    print("---------------------------")
    print("num docs", len(corpus))
    print("median tokens", df_doc_size.median())
    print("num tokens", len(df_tokens))
    print("unique tokens", len(token_counts))
    print("---------------------------")

    # make plots: document lengths on the left, token frequencies on the right
    fig = plt.figure(figsize=(14,6))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)

    df_doc_size.plot.hist(ax=ax1, title='Document Sizes')
    ax1.set_xlabel('Tokens per document')
    token_counts.plot.hist(ax=ax2, title='Tokens Counts')
    ax2.set_xlabel('Occurrences per token')

summarize_data(df.question_text.values.tolist())

In [None]:
import shutil
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation, stratifying on the target.
# random_state pins the shuffle so the split, and everything derived from it
# (the exported CSV files, the vocabulary, the reported metrics), is the same
# on every run.
train_set, valid_set = train_test_split(df, test_size=0.2, stratify=df.target, random_state=42)

print(train_set.shape)
print(valid_set.shape)

In [None]:
# Destination files for the two splits.
train_path = "/kaggle/temp/train.csv"
valid_path = "/kaggle/temp/valid.csv"

# Start from an empty temp dir: drop any previous one, then recreate it.
temp_dir_exists = os.path.exists('/kaggle/temp/')
if temp_dir_exists:
    shutil.rmtree('/kaggle/temp/')
os.mkdir("/kaggle/temp/")

# Write each split to its CSV file without the index column.
for split_frame, split_path in ((train_set, train_path), (valid_set, valid_path)):
    split_frame.to_csv(split_path, index=False)

# Keras text preprocessing with Tokenizer

In [None]:
# Question texts as plain Python lists, one list per split.
train_sentences = train_set['question_text'].tolist()
valid_sentences = valid_set['question_text'].tolist()

# Matching target columns.
train_labels = train_set['target']
valid_labels = valid_set['target']

# Peek at the first five training questions.
train_sentences[:5]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Settings of the text encoding.
vocab_size = 100000       # passed to the Tokenizer as num_words
oov_tok = "<OOV>"         # passed to the Tokenizer as oov_token
sequence_length = 50      # maxlen used when padding / truncating
padding_type = 'post'
trunc_type = 'post'

# Fit the tokenizer on the training questions only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index


def _pad(sequences):
    """Pad or truncate integer sequences to ``sequence_length`` with the settings above."""
    return pad_sequences(sequences, maxlen=sequence_length, padding=padding_type, truncating=trunc_type)


# Encode both splits with the same tokenizer, then bring them to a fixed length.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
valid_sequences = tokenizer.texts_to_sequences(valid_sentences)
train_padded = _pad(train_sequences)
valid_padded = _pad(valid_sequences)

print(train_sentences[:4])
print(train_padded[:4])

In [None]:
from collections import Counter
# Decode the integer sequences back to text and count every resulting token.
token_sentences = tokenizer.sequences_to_texts(train_sequences)
vocabulary = Counter(
    token
    for sentence in token_sentences
    for token in sentence.split()
)

In [None]:
# Distinct tokens, ordered from most to least frequent.
ranked_tokens = vocabulary.most_common()
vocab = [entry[0] for entry in ranked_tokens]
len(vocab)

# tf.data API and Keras TextVectorization layer

In [None]:
import tensorflow as tf

# Reader options shared by both splits: the files have a header row and only
# columns 1 and 2 are selected (a string column, then an int32 column).
csv_options = dict(
    record_defaults=[""] + [tf.constant([], dtype=tf.int32)],
    select_cols=[1, 2],
    header=True,
)

# Training pipeline: shuffled, then batched.
train_ds = tf.data.experimental.CsvDataset(train_path, **csv_options)
train_ds = train_ds.shuffle(10000).batch(512).prefetch(1)

# Validation pipeline: batched in file order (no shuffling).
valid_ds = tf.data.experimental.CsvDataset(valid_path, **csv_options)
valid_ds = valid_ds.batch(512).prefetch(1)

# Print the first example of five training batches.
for text_batch, label_batch in train_ds.take(5):
    print(text_batch[0], label_batch[0])

In [None]:
# Vectorisation layer configured with the vocabulary size and sequence length set earlier.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Adapt the layer on the text column only (labels dropped).
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

# Minimal model that only applies the vectorisation layer, used to preview its output.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
])

# Show one raw question next to its integer encoding.
for text_batch in text_ds.take(1):
    print(text_batch[0].numpy())
    print(model.predict(text_batch)[0])

In [None]:
# Size of the adapted vocabulary and its first ten entries.
layer_vocabulary = vectorize_layer.get_vocabulary()
print(len(layer_vocabulary))
print(layer_vocabulary[:10])

# Create a classification model with pretrained Embeddings

In [None]:
import zipfile
# Unpack the embeddings archive into the temp directory.
local_zip = "/kaggle/input/quora-insincere-questions-classification/embeddings.zip"

# The context manager closes the archive even if extraction raises part-way
# through, so the file handle is never leaked.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('/kaggle/temp/')

In [None]:
# load word embeddings: maps each token to its 300-dimensional float32 vector
embeddings_index = {}
# Read explicitly as UTF-8 rather than relying on the platform default encoding.
with open('/kaggle/temp/glove.840B.300d/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if len(values) <= 300:
            # blank or truncated line: there is no token in front of the vector
            continue
        # The last 300 fields are the vector; everything before them is the
        # token. Rejoining those leading fields keeps a token that contains
        # whitespace whole instead of cutting it down to its first fragment
        # (which could silently overwrite another token's vector).
        word = ' '.join(values[:-300])
        coefs = np.asarray(values[-300:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# One 300-dimensional row per vocabulary index; rows stay all-zero for tokens
# that have no entry in embeddings_index.
embed_matrix = np.zeros((vocab_size, 300))
layer_vocabulary = vectorize_layer.get_vocabulary()
for row, token in enumerate(layer_vocabulary):
    pretrained_vector = embeddings_index.get(token)
    if pretrained_vector is None:
        continue
    embed_matrix[row] = pretrained_vector

embed_matrix.shape

In [None]:
# Embedding layer initialised from embed_matrix and kept frozen (trainable=False);
# mask_zero=True is set so index 0 is masked for the layers that follow.
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    300,
    input_length=sequence_length,
    weights=[embed_matrix],
    trainable=False,
    mask_zero=True,
)

# Raw strings -> token ids -> embeddings -> two stacked BiLSTMs -> dense head
# ending in a single sigmoid unit.
classifier_layers = [
    tf.keras.layers.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(classifier_layers)
model.summary()

### Calculate class weights

In [None]:
# Weight each class inversely to its frequency in the training split.
# Scaling by total/2 helps keep the loss to a similar magnitude:
# the sum of the weights of all examples stays the same.

class_counts = train_set.target.value_counts()
neg = class_counts.loc[0]
pos = class_counts.loc[1]
total = train_set.shape[0]

half_total = total / 2.0
weight_for_0 = (1 / neg) * half_total
weight_for_1 = (1 / pos) * half_total

class_weight = {0: weight_for_0, 1: weight_for_1}
class_weight

In [None]:
K = tf.keras.backend
# NOTE(review): the model is built in an earlier cell, so the seeds set here
# presumably cannot influence its initial weights -- confirm whether this
# reset/seed step should run before the model is created instead.
K.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

# Metrics reported for both the training and the validation data each epoch.
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc')
]

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

# Stop when val_loss has not improved for 5 epochs. restore_best_weights makes
# the callback put back the weights of the best epoch when it stops training,
# so later predictions do not come from the 5 worse epochs that followed it.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# class_weight re-weights the loss per class using the weights computed above.
history = model.fit(train_ds, epochs=30, validation_data=valid_ds, callbacks=[early_stopping_cb], class_weight=class_weight)

In [None]:
from sklearn.metrics import confusion_matrix

def plot_cm(labels, predictions, p=0.5):
    """
    Draw the confusion matrix of thresholded predictions as an annotated heatmap.

    labels: true class labels.
    predictions: predicted scores; values strictly greater than ``p`` are
        compared against the labels as the positive prediction.
    p: decision threshold, also shown in the plot title.
    """
    above_threshold = predictions > p
    cm = confusion_matrix(labels, above_threshold)

    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.xlabel('Predicted label')
    plt.ylabel('Actual label')
    plt.title('Confusion matrix @{:.2f}'.format(p))

# Score the validation questions (text only) and compare against the true targets.
valid_text_ds = valid_ds.map(lambda text, label: text)
labels = valid_set.target.values
predictions = model.predict(valid_text_ds)
plot_cm(labels, predictions)

In [None]:
# Per-epoch metric values recorded by model.fit.
train_auc = history.history['auc']
valid_auc = history.history['val_auc']
train_loss = history.history['loss']
valid_loss = history.history['val_loss']

epochs = range(len(train_auc))  # one x position per completed epoch

# (title, y-axis label, training values, validation values, legend entries)
curves = [
    ('Training and validation AUC', "AUC", train_auc, valid_auc,
     ["Training AUC", "Validation AUC"]),
    ('Training and validation loss', "Loss", train_loss, valid_loss,
     ["Loss", "Validation Loss"]),
]

# One figure per metric: training in red, validation in blue. A figure is
# opened only when there is something to draw, so no empty figure is rendered
# after the last chart.
for title, y_label, train_values, valid_values, legend_entries in curves:
    plt.figure()
    plt.plot(epochs, train_values, 'r')
    plt.plot(epochs, valid_values, 'b')
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.legend(legend_entries)

plt.show()


# Expected Output
# A chart where the validation loss does not increase sharply!

# Make Predictions

In [None]:
test_path = '/kaggle/input/quora-insincere-questions-classification/test.csv'

# Read only column 1 of the test file as strings, batch it, and score every row.
test_records = tf.data.experimental.CsvDataset(test_path, record_defaults=[""], select_cols=[1], header=True)
test_ds = test_records.batch(512).prefetch(1)
y_pred = model.predict(test_ds)

In [None]:
# Turn the predicted scores into 0/1 labels at a 0.5 threshold and write the
# qid / prediction pairs to the submission file.
test_set = pd.read_csv(test_path)
meets_threshold = y_pred >= 0.5
test_set['prediction'] = np.where(meets_threshold, 1, 0)
submission_columns = ['qid', 'prediction']
test_set[submission_columns].to_csv('submission.csv', index=False)

In [None]:
# Read the submission file back and display its first rows.
submission_preview = pd.read_csv('submission.csv')
submission_preview.head()