In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# !pip3 install -q -U keras-tuner

In [None]:
import kerastuner as kt

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
# English stop-word list from NLTK, stored as a set for the membership test in remove_stopwords().
stop_words = set(stopwords.words('english')) 
  
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split


In [None]:
# Tokenizer `num_words` cap and the Embedding layer's `input_dim`.
vocab_size = 2_000
# Width of each learned word vector (Embedding `output_dim`).
embedding_dim = 100
# Fixed sequence length: used as `maxlen` when padding and as the model's input width.
max_length = 1403

In [None]:
# Load the training CSV (read straight from the zip archive) and preview it.
train_df = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip"
)
print(train_df.shape)
train_df.head(3)

In [None]:
# Load the test CSV (read straight from the zip archive) and preview it.
test_df = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip"
)
print(test_df.shape)
test_df.head(3)

In [None]:
# Names of the six label columns; the order here is the order of the model outputs
# and of the columns written to the submission file.
target_cols = np.array([
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
])

In [None]:
def remove_stopwords(sent):
    """Return `sent` with every token that appears in `stop_words` removed.

    The text is split with `word_tokenize`; tokens are compared to `stop_words`
    by exact membership, and the surviving tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(sent):
        if token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# train_df.comment_text=train_df.comment_text.apply(remove_stopwords)
# test_df.comment_text=test_df.comment_text.apply(remove_stopwords)

In [None]:
def concat_labels(row):
    """Join the index labels of the truthy entries of `row` with spaces.

    Returns the string "none" when no entry of `row` is truthy.
    """
    active_positions = np.asarray(row).nonzero()[0]
    if len(active_positions) == 0:
        return "none"
    active_names = row.index[active_positions].tolist()
    return " ".join(active_names)

def onehot_labels(row):
    """Convert `row` to integers and return it as a 2-D array of shape (1, len(row))."""
    as_ints = row.astype("int")
    return as_ints.values.reshape(1, -1)
    
# Preview the encoding on the first 12 rows (swap in concat_labels for a
# space-joined string view of the same rows).
train_df[target_cols].gt(0).head(12).apply(onehot_labels, axis=1)

In [None]:
# Row positions (idx_x) and column positions (idx_y) of every positive label;
# the length of either array is the total number of positive labels.
idx_x, idx_y = np.nonzero(train_df[target_cols].gt(0).to_numpy())
len(idx_x)

In [None]:
# Raw training comments as a plain Python list, the form the tokenizer is fed.
sentences = train_df["comment_text"].to_list()

In [None]:
# Raw test comments as a plain Python list, the form the tokenizer is fed.
test_sentences = test_df["comment_text"].to_list()

In [None]:
# Encode each row's six target flags as a (1, 6) integer array, then stack the
# per-row arrays into a single (n_rows, 6) label matrix.
# (concat_labels can be applied the same way to get string labels instead.)
labels = train_df[target_cols].gt(0).apply(onehot_labels, axis=1)
labels_npy = np.concatenate(labels.values, axis=0)

# Tokenize

### Sentence

In [None]:
# Fit the word-level tokenizer on the training comments only; words it cannot
# map are replaced by the "<OOV>" token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
# Number of entries in the fitted word index.
print(len(word_index))

In [None]:
# Show the word -> integer id mapping as a Series for easier inspection.
pd.Series(word_index)

In [None]:
# Convert the training comments to integer sequences and pad them to the fixed
# width `max_length`. The model's input layer and the test-set padding both use
# `max_length`, so the training matrix must be forced to that width too rather
# than relying on the longest training comment happening to be that long.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post', maxlen=max_length)
print(padded[0])
print(padded.shape)

In [None]:
# Encode the test comments with the tokenizer fitted on the training data and
# pad them (zeros appended at the end) to `max_length`.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post")
print(test_padded[0])
print(test_padded.shape)

In [None]:
# Sanity check: the train and test matrices should have the same sequence width.
print(len(padded[0]), len(test_padded[0]), sep="\n")

### Label

In [None]:
# v1 labels
# label_tokenizer = Tokenizer()
# label_tokenizer.fit_on_texts(labels)
# label_word_index = label_tokenizer.word_index
# label_seq = label_tokenizer.texts_to_sequences(labels)
# print(label_seq[:5])
# print(label_word_index)

# Model

In [None]:
# Reset Keras' global state so re-running this cell starts from a clean graph.
tf.keras.backend.clear_session()
model = tf.keras.Sequential([
    # `shape` is given in its documented tuple form (batch dimension excluded).
    tf.keras.layers.Input(shape=(max_length,)),
    # The sequence length comes from the Input layer, so the redundant
    # `input_length` argument is not passed.
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)),
    # First convolutional stage, halved in length by max pooling.
    tf.keras.layers.Conv1D(64, 3, padding='valid', strides=1, activation='relu'),
    tf.keras.layers.Conv1D(64, 3, padding='same', strides=1, activation='relu'),
    tf.keras.layers.MaxPooling1D(2),

    # Second convolutional stage, collapsed over the sequence axis.
    tf.keras.layers.Conv1D(64, 3, padding='valid', strides=1, activation='relu'),
    tf.keras.layers.Conv1D(64, 3, padding='same', strides=1, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),

    # Classification head: one sigmoid unit per target label.
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation = 'sigmoid')

])
# The AUC metric is named explicitly because the training callbacks monitor the
# literal key "val_auc"; an auto-generated name such as "auc_1" would break them.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['mae', tf.keras.metrics.AUC(name="auc")],
)
model.summary()

In [None]:
# Seed NumPy's global RNG for any later NumPy-based randomness.
np.random.seed(1291)
# Hold out 20% of the training data for the final evaluation. The seed is also
# passed explicitly so the split does not depend on the global RNG state at the
# moment this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded, labels_npy, test_size=0.2, random_state=1291
)

In [None]:
# AUC is a higher-is-better metric, so both callbacks must track its maximum.
# With mode="min", early stopping would keep (and restore) the weights of the
# worst epoch instead of the best one.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_auc",
        mode="max",
        patience=5,
        restore_best_weights=True,
        verbose=1,
    ),
    tf.keras.callbacks.ModelCheckpoint(
        filepath="best_model-{epoch:02d}-{val_auc:.4f}.hdf5",
        monitor="val_auc",
        mode="max",
        save_best_only=True,
        verbose=1,
    ),
]


In [None]:
# Train for at most 15 epochs; 20% of X_train is held back by Keras as the
# validation set that the callbacks monitor.
history = model.fit(
    X_train,
    y_train,
    batch_size=120,
    epochs=15,
    validation_split=0.2,
    callbacks=callbacks,
)

In [None]:
# Score the held-out split; evaluate() returns the loss followed by the
# compiled metrics in order (mae, then auc).
evel_loss, evel_mae, evel_auc = model.evaluate(X_test, y_test)

In [None]:
def plot_learning_curve(history):
    """Plot the training and validation loss against the epoch number.

    `history` must expose `.epoch` and a `.history` mapping containing the
    "loss" and "val_loss" series, as the object returned by `model.fit` does.
    """
    for curve_name in ("loss", "val_loss"):
        plt.plot(history.epoch, history.history[curve_name], ".:", label=curve_name)
    plt.legend()

# Draw the loss curves for the training run above.
plot_learning_curve(history)
# Validation AUC recorded for the last epoch that ran.
# NOTE(review): the early-stopping callback is configured with
# restore_best_weights=True, so the evaluated weights may come from a different
# epoch than this last value — confirm that [-1] is the intended comparison point.
val_auc = history.history["val_auc"][-1]

In [None]:
# Display the validation AUC, the held-out AUC, and a conservative score:
# the lower of the two minus the absolute gap between them.
val_auc, evel_auc, min(val_auc,evel_auc)-abs(val_auc - evel_auc)

# Prediction for submission

In [None]:
# Predict the six sigmoid outputs for every padded test comment.
test_pred = model.predict(test_padded)

In [None]:
# Inspect the raw prediction array.
test_pred

In [None]:
# (test_pred>0.5).astype("int")

In [None]:
# Build the submission file: the `id` column followed by one predicted-score
# column per target label, written without the DataFrame index.
pd.concat(
    [test_df["id"], pd.DataFrame(test_pred, columns=target_cols)],
    axis=1,  # must be a keyword: pandas 2.x no longer accepts `axis` positionally
).to_csv("sub.csv", index=False)

In [None]:
# !kaggle competitions submit -f sub.csv -m "" jigsaw-toxic-comment-classification-challenge

In [None]:
# !kaggle competitions submissions jigsaw-toxic-comment-classification-challenge

```
publicScore  privateScore
0.93708      0.93811
```