In [None]:
import pandas as pd
import numpy as np

import string
import re
import zipfile
import os

import tensorflow as tf
from tensorflow import keras
# NOTE(review): this is set after tensorflow has already been imported above;
# confirm the runtime still honours it at this point.
os.environ["KMP_SETTINGS"] = "false"

train_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
test_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'

# Extract both archives into the current working directory. The context
# managers close each archive's file handle as soon as extraction finishes,
# instead of leaving it open for the lifetime of the kernel.
with zipfile.ZipFile(train_path, 'r') as train_zip:
    train_zip.extractall()
with zipfile.ZipFile(test_path, 'r') as test_zip:
    test_zip.extractall()

In [None]:
# Read the two CSV files from the working directory (train first, then test).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Bare expression so the notebook renders the training frame.
train_df

In [None]:
# Average comment length in characters, truncated to an int.
# Vectorized with the pandas string accessor instead of a Python-level loop;
# missing comments (NaN) are left out of both the sum and the count rather
# than raising on len().
comment_lengths = train_df['comment_text'].str.len()
sum_len = int(comment_lengths.sum())
count = int(comment_lengths.count())

avg_len = int(sum_len / count)
print('Average length:', avg_len)

In [None]:
batch_size = 126

raw_df = train_df.drop('id', axis=1)
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Pair each comment string with its six 0/1 label columns.
label_tensor = tf.convert_to_tensor(raw_df[labels], dtype=tf.int64)
dataset = tf.data.Dataset.from_tensor_slices((raw_df['comment_text'], label_tensor))

# reshuffle_each_iteration=False freezes the shuffled order after the first
# pass. This dataset is later split with take()/skip(); with the default
# (reshuffle on every iteration) each epoch would draw a different set of
# rows for validation, so validation rows would leak into training.
# Trade-off: the training batches are no longer reshuffled between epochs.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(batch_size)

In [None]:
# Hold out the first tenth of the batches for validation; train on the rest.
ds_batches = tf.data.experimental.cardinality(dataset)
val_batch_count = ds_batches // 10
val_ds = dataset.take(val_batch_count)
train_ds = dataset.skip(val_batch_count)

# Report how many batches ended up in each split.
train_cardinality = tf.data.experimental.cardinality(train_ds)
val_cardinality = tf.data.experimental.cardinality(val_ds)
print('Number of train batches: %d' % train_cardinality)
print('Number of validation batches: %d' % val_cardinality)

In [None]:
def custom_standardization(input_data):
    """Normalize raw comment text before tokenization.

    Applies, in order: lowercasing, removal of ``<...>`` spans, removal of
    ``http``-prefixed runs of non-whitespace, removal of characters outside
    the ``\\x00-\\x7F`` range, replacement of punctuation and newlines with a
    space, and collapsing of non-word and whitespace runs into a single
    space. Leading/trailing spaces are not trimmed.

    Args:
        input_data: String tensor (anything accepted by ``tf.strings.lower``).

    Returns:
        String tensor with the substitutions above applied.
    """
    # Patterns containing regex classes (\S, \W, \s) are raw strings so Python
    # does not treat them as (invalid) string escape sequences.
    s = tf.strings.lower(input_data)
    s = tf.strings.regex_replace(s, r'<.*?>', '')  # html
    s = tf.strings.regex_replace(s, r'http\S+', '')  # links
    s = tf.strings.regex_replace(s, '[^\x00-\x7F]+', '')  # non-ascii
    s = tf.strings.regex_replace(s, '[%s]' % re.escape(string.punctuation), ' ')
    s = tf.strings.regex_replace(s, '\n', ' ')
    s = tf.strings.regex_replace(s, r'\W+', ' ')  # non-word characters
    s = tf.strings.regex_replace(s, r'\s+', ' ')
    return s

# Preview: standardize the first batch and print its first comment.
preview_ds = dataset.take(1).map(lambda text, _labels: custom_standardization(text))
for cleaned_batch in preview_ds:
    first_comment = cleaned_batch[0].numpy().decode('ascii')
    print(first_comment)

In [None]:
max_features = 10000
# avg_len was computed from len() of each comment, i.e. in characters, while
# output_sequence_length pads/truncates in tokens.
sequence_length = avg_len

vectorize_layer = keras.layers.experimental.preprocessing.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Strip the labels so the layer only sees the comment text.
text = train_ds.map(lambda x, y: x)
# Build the vocabulary from every training batch. Adapting on
# next(iter(text)) would only look at a single batch of comments and leave
# most words mapped to the out-of-vocabulary token.
vectorize_layer.adapt(text)

In [None]:
# Display the vocabulary size reported by the vectorization layer after adapt().
vectorize_layer.vocabulary_size()

In [None]:
vocab_len = vectorize_layer.vocabulary_size()
embedding_dim = 300

# Layers in forward order: raw string -> token ids -> embeddings ->
# two bidirectional recurrent layers -> max over time -> six sigmoid outputs.
layer_stack = [
    keras.layers.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    keras.layers.Embedding(
        input_dim=vocab_len + 1,
        output_dim=embedding_dim,
        input_length=sequence_length,
    ),
    keras.layers.SpatialDropout1D(rate=0.3),
    keras.layers.Bidirectional(keras.layers.LSTM(units=156, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.GRU(units=156, return_sequences=True)),
    keras.layers.GlobalMaxPooling1D(),
    # Optional hidden layer, currently disabled:
    # keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(units=6, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

model.summary()

In [None]:
# Stop after one epoch without improvement and roll back to the best weights.
early_stop = keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=6,
    callbacks=[early_stop],
)

In [None]:
# Score the raw test comments and store one prediction column per label.
test_comments = test_df['comment_text']
y_pred = model.predict(test_comments)
test_df[labels] = y_pred

In [None]:
# Remove the text column from the frame. errors='ignore' keeps this cell
# re-runnable: test_df is reassigned here, so a second run would otherwise
# raise KeyError because the column is already gone.
test_df = test_df.drop(columns='comment_text', errors='ignore')
test_df.head()

In [None]:
# Write the frame to submission.csv; index=False leaves the row index out of the file.
test_df.to_csv('submission.csv', index=False)