In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset Exploration

In [None]:
import zipfile
# Extract the training archive into the temp directory. The `with` block
# closes the archive handle even if extraction raises.
with zipfile.ZipFile('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip', 'r') as zip_ref:
    zip_ref.extractall('/kaggle/temp')

# Show what the temp directory now contains.
os.listdir('/kaggle/temp/')

In [None]:
# Load the extracted training data into a DataFrame.
df = pd.read_csv('/kaggle/temp/train.csv')
# Print the column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

def summarize_data(corpus):
    """
    Print summary statistics and plot two histograms for a text corpus.

    Each document is converted with ``str`` and tokenized on runs of
    whitespace, so newlines and tabs separate tokens and repeated spaces do
    not yield empty-string tokens.

    Parameters
    ----------
    corpus : list
        The documents to summarize.

    Returns
    -------
    None
        The number of documents, the median document size, the total number
        of tokens and the number of unique tokens are printed. If the corpus
        contains at least one token, a figure is drawn with the document
        sizes on the left and the distribution of per-token occurrence
        counts on the right.
    """
    # tokenize every document once and reuse the result for all statistics
    tokenized_docs = [str(doc).split() for doc in corpus]

    # get the documents size (explicit dtype keeps an empty corpus numeric)
    df_doc_size = pd.Series([len(tokens) for tokens in tokenized_docs], dtype="int64")

    # get the tokens in the corpus
    df_tokens = pd.Series(
        [token for tokens in tokenized_docs for token in tokens], dtype="object"
    )

    # occurrences of each distinct token, computed once for the print and the plot
    token_counts = df_tokens.value_counts()

    print("---------------------------")
    print("num docs", len(corpus))
    print("median tokens", df_doc_size.median())
    print("num tokens", len(df_tokens))
    print("unique tokens", len(token_counts))
    print("---------------------------")

    # nothing to plot when the corpus holds no tokens
    if token_counts.empty:
        return

    # make plots
    fig = plt.figure(figsize=(14,6))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)

    df_doc_size.plot.hist(ax=ax1, title='Document Sizes')
    token_counts.plot.hist(ax=ax2, title='Tokens Counts')
    
summarize_data(df.comment_text.values.tolist())

# Explore the Target

In [None]:
# Count how many label columns are set on each comment and keep the rows
# where more than one is set.
labels_per_comment = df.drop(columns=['id', 'comment_text']).sum(axis=1)
df[labels_per_comment > 1]

In [None]:
fig = plt.figure(figsize=(8,6))
# Sum each label column over the whole frame, most frequent class first.
class_totals = df.drop(columns=['id', 'comment_text']).sum().sort_values(ascending=False)
class_totals.plot.bar(title='Classes Counts')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_set, valid_set = train_test_split(df, random_state=42, test_size=0.2)

for subset in (train_set, valid_set):
    print(subset.shape)

In [None]:
# Side-by-side bar charts of the per-class totals in each split.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

train_totals = train_set.drop(columns=['id', 'comment_text']).sum().sort_values(ascending=False)
valid_totals = valid_set.drop(columns=['id', 'comment_text']).sum().sort_values(ascending=False)

train_totals.plot.bar(ax=ax1, title='Classes Counts | Train')
valid_totals.plot.bar(ax=ax2, title='Classes Counts | Valid')

In [None]:
# Write both splits to the temp directory as CSV files without the index column.
train_path = "/kaggle/temp/train_set.csv"
valid_path = "/kaggle/temp/valid_set.csv"

for frame, destination in ((train_set, train_path), (valid_set, valid_path)):
    frame.to_csv(destination, index=False)

os.listdir('/kaggle/temp/')

# Keras text preprocessing with TextVectorization

In [None]:
import tensorflow as tf

vocab_size = 100000
sequence_length = 150

train_sentences = train_set['comment_text'].tolist()
valid_sentences = valid_set['comment_text'].tolist()

# Map each comment to a fixed-length sequence of integer token ids.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)
# The vocabulary is learned from the training sentences only.
vectorize_layer.adapt(train_sentences)

# Wrap the layer in a model that takes raw strings, so `predict` can encode lists of text.
vectorizer = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
])

train_sequences = vectorizer.predict(train_sentences)
valid_sequences = vectorizer.predict(valid_sentences)

In [None]:
# Compare the first three raw comments with their integer-encoded sequences.
print(train_sentences[:3])
print(train_sequences[:3])

In [None]:
# Fetch the learned vocabulary once, then show its size and its first ten entries.
vocabulary = vectorize_layer.get_vocabulary()
print(len(vocabulary))
print(vocabulary[:10])

# Create the classification model

In [None]:
# Target columns of the training data.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# input layer: one integer-encoded comment of `sequence_length` token ids
inputs = tf.keras.layers.Input(shape=(sequence_length,))

# embeddings: 100-dimensional vectors; mask_zero=True makes downstream layers
# skip positions holding id 0. The sequence length is already fixed by
# `inputs`, so the deprecated `input_length` argument is not passed.
embed = tf.keras.layers.Embedding(vocab_size, 100, mask_zero=True)(inputs)

# lstm layers: the first returns the full sequence so the second can consume it,
# the second returns only its final state
z = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(embed)
z = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(z)

# output block
class OutputBlock(tf.keras.layers.Layer):
    """Classification head: Dense(32, relu) -> Dropout(0.5) -> Dense(1, sigmoid)."""

    def __init__(self, **kwargs):
        """Create the three sub-layers; keyword arguments go to the base ``Layer``."""
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(32, activation='relu')
        self.dropout = tf.keras.layers.Dropout(0.5)
        self.out = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        """Run ``inputs`` through dense, dropout and the sigmoid output layer."""
        hidden = self.dropout(self.dense(inputs))
        return self.out(hidden)
    
# One head per label, named after it; every head reads the same LSTM features `z`.
output_blocks = [OutputBlock(name=label) for label in labels]
outputs = [head(z) for head in output_blocks]

model = tf.keras.models.Model(inputs=[inputs], outputs=outputs)
model.summary()

In [None]:
# One target array per label, in the same order as `labels`.
y_train = [train_set[name].to_numpy() for name in labels]
y_valid = [valid_set[name].to_numpy() for name in labels]

In [None]:
# Reset Keras global state and seed TensorFlow and NumPy.
# NOTE(review): `model` was already built in an earlier cell, so this seeding
# cannot affect its initial weights — confirm whether the reset and seeds
# should run before the model is constructed.
K = tf.keras.backend
K.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

# Loss and metric are keyed by label, which is also the name given to each output head.
model.compile(loss={label:"binary_crossentropy" for label in labels},
              optimizer=tf.keras.optimizers.Adam(),
              metrics={label:tf.keras.metrics.AUC(name='auc') for label in labels})

# Stop after 5 epochs without improvement and keep only the best model in nlp.h5
# (both callbacks use their default monitored quantity).
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=5)
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("nlp.h5", save_best_only=True)

# Train for at most 30 epochs, validating on the held-out split after each one.
history = model.fit(train_sequences, y_train, epochs=30, validation_data=(valid_sequences, y_valid), batch_size=128,
                    callbacks=[early_stopping_cb, checkpoint_cb])

In [None]:
curves = history.history
loss, val_loss = curves['loss'], curves['val_loss']
epochs = range(len(loss))  # one x position per recorded epoch

# Plot training and validation loss per epoch
for values, fmt, legend in ((loss, 'r', "Training Loss"), (val_loss, 'b', "Validation Loss")):
    plt.plot(epochs, values, fmt, label=legend)
plt.legend()
plt.show()

# Make Predictions

In [None]:
# Reload the checkpoint written during training; the custom layer class is passed
# in `custom_objects` so the saved model can be rebuilt.
model = tf.keras.models.load_model('nlp.h5', custom_objects={'OutputBlock':OutputBlock}) # rollback to the best model
# Report the loss and metrics of the reloaded model on the validation split.
model.evaluate(valid_sequences, y_valid)

In [None]:
# Extract the test data, the sample submission and the test labels into the
# temp directory. Each archive is opened in its own `with` block so every
# handle is closed, not just the last one.
competition_dir = "/kaggle/input/jigsaw-toxic-comment-classification-challenge"
for archive_name in ("test.csv.zip", "sample_submission.csv.zip", "test_labels.csv.zip"):
    with zipfile.ZipFile(os.path.join(competition_dir, archive_name), 'r') as zip_ref:
        zip_ref.extractall('/kaggle/temp')

# Show what the temp directory now contains.
os.listdir('/kaggle/temp')

In [None]:
# Preview the first rows of the sample submission file.
pd.read_csv('/kaggle/temp/sample_submission.csv').head()

In [None]:
# Encode the test comments with the vectorizer fitted on the training text, then score them.
test_set = pd.read_csv('/kaggle/temp/test.csv')
test_sentences = test_set['comment_text'].tolist()
test_sequences = vectorizer.predict(test_sentences)
predictions = model.predict(test_sequences)

In [None]:
# Pair each label with the matching entry of `predictions` and store it as a column.
for column, scores in zip(labels, predictions):
    test_set[column] = scores

test_set.head()

In [None]:
# Save every column except the raw text, then read the file back to check it.
submission = test_set.drop(columns=['comment_text'])
submission.to_csv('submission.csv', index=False)
pd.read_csv('submission.csv').head()