In [None]:
!pip install tensorflow_text

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import re
import os
import json
import shutil
import string
import joblib
from io import StringIO

import tensorflow_datasets as tfds
import tensorflow as tf
import tensorflow_text as text
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from sklearn import model_selection 
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.models import Model
from keras.layers import *
from tensorflow.python.keras.utils.vis_utils import plot_model


EMBEDDING_FILE = '../input/glove840b300dtxt/glove.840B.300d.txt'

In [None]:
# Data source: https://www.kaggle.com/subashgandyer/toxiccomments
def _load_csv(path):
    """Read a UTF-8 CSV file and drop every column that is entirely empty."""
    with open(path, 'r', encoding='utf-8') as handle:
        frame = pd.read_csv(handle, sep=',', engine='python')
    return frame.dropna(how='all', axis=1)


train = _load_csv('../input/toxiccomments/train.csv')
test = _load_csv('../input/toxiccomments/test.csv')
test_labels = _load_csv('../input/toxiccomments/test_labels.csv')
sub = _load_csv('../input/toxiccomments/sample_submission.csv')

In [None]:
# Print column dtypes and non-null counts for the frame loaded from test.csv.
test.info()

In [None]:
def dashboard_training(history):
    """Plot AUC and loss curves (train vs. validation) side by side.

    Parameters
    ----------
    history : object exposing a ``history`` dict
        The dict must contain the per-epoch sequences ``'AUC'``,
        ``'val_AUC'``, ``'loss'`` and ``'val_loss'``.

    Raises
    ------
    KeyError
        If one of the four keys above is missing.

    Notes
    -----
    Both panels are drawn on their own regular axes. The loss panel used to be
    drawn on a ``twinx()`` axis, whose x-axis is hidden, so the 'Epoch' label
    was never rendered and the left-hand axis showed an empty 0-1 scale.
    """
    logs = history.history
    epochs = range(len(logs['AUC']))
    fig, (ax_auc, ax_loss) = plt.subplots(1, 2, figsize=(15, 5))

    # --- Left panel: AUC, with the best validation epoch highlighted ---
    ax_auc.plot(epochs, logs['AUC'], '-o', label='Train AUC', color='#ff7f0e')
    ax_auc.plot(epochs, logs['val_AUC'], '-o', label='Val AUC', color='#1f77b4')
    best_epoch = np.argmax(logs['val_AUC'])
    best_auc = np.max(logs['val_AUC'])
    # Axis extents are used to offset the annotation relative to the plot size.
    xdist = ax_auc.get_xlim()[1] - ax_auc.get_xlim()[0]
    ydist = ax_auc.get_ylim()[1] - ax_auc.get_ylim()[0]
    ax_auc.scatter(best_epoch, best_auc, s=200, color='#1f77b4')
    ax_auc.text(best_epoch - 0.03 * xdist, best_auc - 0.13 * ydist,
                'max auc\n%.2f' % best_auc, size=14)
    ax_auc.set_ylabel('AUC', size=14)
    ax_auc.set_xlabel('Epoch', size=15)
    ax_auc.legend(loc=4)

    # --- Right panel: loss, with the lowest validation loss highlighted ---
    ax_loss.plot(epochs, logs['loss'], '-o', label='Train Loss', color='#2ca02c')
    ax_loss.plot(epochs, logs['val_loss'], '-o', label='Val Loss', color='#d62728')
    min_epoch = np.argmin(logs['val_loss'])
    min_loss = np.min(logs['val_loss'])
    xdist = ax_loss.get_xlim()[1] - ax_loss.get_xlim()[0]
    ydist = ax_loss.get_ylim()[1] - ax_loss.get_ylim()[0]
    ax_loss.scatter(min_epoch, min_loss, s=200, color='#d62728')
    ax_loss.text(min_epoch - 0.03 * xdist, min_loss + 0.05 * ydist, 'min loss', size=14)
    ax_loss.set_ylabel('Loss', size=14)
    ax_loss.set_xlabel('Epoch', size=15)
    ax_loss.legend(loc=2)

    plt.show()
    
def plot_lr_history(history):
    """Plot the learning rate recorded for each epoch on a log-scaled y-axis.

    Parameters
    ----------
    history : object exposing a ``history`` dict
        The dict must contain the per-epoch learning rates under ``'lr'``.
        ``'learning_rate'`` is accepted as a fallback.
        NOTE(review): the fallback key is what newer Keras releases appear to
        log; confirm against the installed version.

    Raises
    ------
    KeyError
        If no learning-rate series was recorded.

    Notes
    -----
    The log scale is applied to the learning rate (y), not to the epoch index
    (x): a log-scaled x-axis cannot display epoch 0. The epoch range is derived
    from the learning-rate series itself rather than from an unrelated
    ``'accuracy'`` entry.
    """
    logs = history.history
    lr_values = logs['lr'] if 'lr' in logs else logs['learning_rate']
    epochs = range(len(lr_values))

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(epochs, lr_values, '-o', label='Learning Rate', color='#d62728')

    # Highlight the smallest learning rate reached during training.
    min_epoch = np.argmin(lr_values)
    min_lr = np.min(lr_values)
    ax.scatter(min_epoch, min_lr, s=200, color='#d62728')
    # Offset in points so the label placement does not depend on the axis scale.
    ax.annotate('Min lr\n%.2E' % min_lr, (min_epoch, min_lr),
                textcoords='offset points', xytext=(-10, 15), size=14)

    ax.set_ylabel('Learning Rate', size=15)
    ax.set_xlabel('Epoch', size=15)
    ax.legend(loc=2)
    # Style the grid locally instead of mutating the global rc settings.
    ax.grid(True, linestyle="--", color='black')
    plt.show()

In [None]:
# Preview the first 7 rows of the training frame.
train.head(7)

In [None]:
# Build the working frame: drop the `id` column, shuffle the rows and tag every
# row with a stratified fold number in the new `kfold` column.
N_splits = 20
# Fixed seed so fold membership (and therefore every validation score and
# checkpoint) is reproducible across kernel restarts.
SPLIT_SEED = 42

df = train.drop(columns=["id"])  # drop() returns a new frame; `train` is left untouched
df["kfold"] = -1  # placeholder, overwritten for every row below
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Folds are stratified on the `toxic` column only.
y = df.toxic.values
kf = model_selection.StratifiedKFold(n_splits=N_splits)
for fold, (_, valid_idx) in enumerate(kf.split(X=df, y=y)):
    df.loc[valid_idx, "kfold"] = fold


In [None]:
# --- Text preprocessing ---
vocab_size = 100_000            # passed as the tokenizer's `num_words` and the embedding input size
embedding_dim = 300             # width of each embedding vector
max_length = 150                # sequences are padded / truncated to this many tokens
trunc_type = padding_type = 'post'   # both truncation and padding use 'post'
oov_tok = "<OOV>"               # tokenizer placeholder for out-of-vocabulary words

# --- Hardware / training ---
DEVICE = "GPU"  # or "TPU"
BUFFER_SIZE = 10_000            # shuffle buffer of the training dataset
BATCH_SIZE = 1024
EPOCHS = 10
LR = 0.01                       # learning rate handed to the optimizer instances

In [None]:
# Select the tf.distribute strategy.
# The TPU path is attempted only when DEVICE == "TPU"; any failure falls back
# to the default strategy by resetting DEVICE to "GPU". The original guard was
# `DEVICE == "GPU"`, which skipped the TPU path when a TPU was requested and
# discarded a successfully created TPU strategy otherwise.
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:  # `except _:` is not a valid exception clause
            print("failed to initialize TPU")
            print(repr(err))
            # Fall back, otherwise `strategy` would be undefined below.
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

## Callbacks

In [None]:
# Stop once val_loss has not improved for 5 epochs and restore the best weights seen.
EarlyStopping_cb = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Checkpoints: keep only the best model per monitored quantity.
# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint
Checkpoint_cb_AUC = ModelCheckpoint(
    "best_AUC_model.h5",
    monitor='val_AUC',
    mode='max',
    save_best_only=True,
)
Checkpoint_cb_loss = ModelCheckpoint(
    "best_loss_model.h5",
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Multiply the learning rate by 0.2 when val_loss stagnates for 5 epochs.
# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ReduceLROnPlateau
Reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.2,
    patience=5,
    min_lr=1e-15,
    verbose=1,
)

# Per-epoch metrics are written to 'training.log'.
Csv_logger = tf.keras.callbacks.CSVLogger('training.log')

# NOTE(review): EarlyStopping_cb, Reduce_lr and Csv_logger are not passed to any
# `fit` call in the cells visible here.

# One optimizer instance per algorithm, all created with the same learning rate.
_optimizers = tf.keras.optimizers
AdamOptimizer = _optimizers.Adam(learning_rate=LR)
RMSpropOptimizer = _optimizers.RMSprop(learning_rate=LR)
SGDOptimizer = _optimizers.SGD(learning_rate=LR)
AdagradOptimizer = _optimizers.Adagrad(learning_rate=LR)

In [None]:
def My_model_1():
    """Build and compile a bidirectional-LSTM classifier with 6 sigmoid outputs.

    Architecture: trainable Embedding(vocab_size, embedding_dim) ->
    Bidirectional(LSTM(16)) -> Dropout(0.5) -> Dense(12, relu) ->
    Dense(6, sigmoid). Compiled with binary cross-entropy and the binary
    accuracy / AUC metrics.

    Returns
    -------
    tf.keras.Sequential
        The compiled model.

    Notes
    -----
    A new Adam optimizer (learning rate ``LR``) is created on every call.
    Compiling with the shared module-level ``AdamOptimizer`` instance meant a
    second call (e.g. re-running the cell) reused an optimizer that had
    already been attached to the previous model.
    """
    METRICS = [tf.keras.metrics.BinaryAccuracy(name='accuracy'),tf.keras.metrics.AUC(name='AUC')]
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(6, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=METRICS)
    return model

def My_model_2():
    """Build and compile a 1-D convolutional classifier with 6 sigmoid outputs.

    Layer stack: Embedding -> Conv1D(128, 5, relu) -> GlobalAveragePooling1D ->
    Dense(64, relu) -> Dropout(0.5) -> Dense(6, sigmoid). Compiled with binary
    cross-entropy, the 'adam' optimizer and binary accuracy / AUC metrics.

    Returns
    -------
    tf.keras.Sequential
        The compiled model.
    """
    layers = tf.keras.layers
    net = tf.keras.Sequential()
    net.add(layers.Embedding(vocab_size, embedding_dim))
    net.add(layers.Conv1D(128, 5, activation='relu'))
    net.add(layers.GlobalAveragePooling1D())
    net.add(layers.Dense(64, activation='relu'))
    net.add(layers.Dropout(0.5))
    net.add(layers.Dense(6, activation='sigmoid'))

    tracked_metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='AUC'),
    ]
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=tracked_metrics)
    return net

def My_model_3(embedding_matrix):
    """Build and compile a BiLSTM + Conv1D classifier on frozen pretrained embeddings.

    Parameters
    ----------
    embedding_matrix : 2-D array
        Pretrained vectors, one row per token index. Its shape sets the
        Embedding layer's input and output dimensions.

    Returns
    -------
    tf.keras.Model
        Compiled model taking ``(max_length,)`` integer sequences and
        producing 6 sigmoid outputs.

    Notes
    -----
    The Embedding size comes from ``embedding_matrix.shape`` instead of the
    ``vocab_size`` / ``embedding_dim`` globals. The matrix is built with
    ``min(vocab_size, len(word_index) + 1)`` rows, so a vocabulary smaller
    than ``vocab_size`` previously caused a weight-shape mismatch.
    """
    METRICS = [tf.keras.metrics.BinaryAccuracy(name='accuracy'),tf.keras.metrics.AUC(name='AUC')]
    num_embeddings, vector_dim = embedding_matrix.shape

    sequence_input = tf.keras.layers.Input(shape=(max_length, ))
    # Pretrained vectors are loaded as fixed (non-trainable) weights.
    x = tf.keras.layers.Embedding(num_embeddings, vector_dim, weights=[embedding_matrix], trainable=False)(sequence_input)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = tf.keras.layers.Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
    # Summarise the convolved sequence with both its mean and its max over time.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(x)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = tf.keras.layers.concatenate([avg_pool, max_pool])
    preds = tf.keras.layers.Dense(6, activation="sigmoid")(x)

    model = tf.keras.models.Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)
    return model

In [None]:
model = My_model_1()

# Fold 0 is held out for validation; every other fold is used for fitting.
LABEL_COLUMNS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
train_df = df[df.kfold.ne(0)].reset_index(drop=True)
test_df = df[df.kfold.eq(0)].reset_index(drop=True)

training_sentences = train_df["comment_text"].values
testing_sentences = test_df["comment_text"].values
training_labels = train_df[LABEL_COLUMNS].values
testing_labels = test_df[LABEL_COLUMNS].values

# The tokenizer vocabulary is fitted on the training split only.
tokenizer = Tokenizer(
    num_words=vocab_size,
    filters='"#$%&+-/:;<=>@[\\]^_`{|}~\n',
    oov_token=oov_tok,
    lower=True,
)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Texts -> integer sequences -> fixed-length arrays.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
training_padded = np.array(pad_sequences(training_sequences, **pad_options))
testing_padded = np.array(pad_sequences(testing_sequences, **pad_options))
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

# Wrap the arrays in tf.data pipelines; only the training pipeline is shuffled.
train_dataset = tf.data.Dataset.from_tensor_slices((training_padded, training_labels)).shuffle(BUFFER_SIZE)
train_dataset = train_dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(train_dataset))
test_dataset = tf.data.Dataset.from_tensor_slices((testing_padded, testing_labels))
test_dataset = test_dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(test_dataset))

history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=test_dataset,
    verbose=1,
    callbacks=[Checkpoint_cb_AUC, Checkpoint_cb_loss],
)
dashboard_training(history)
    

In [None]:
# Encode the comments of the `test` frame with the already-fitted tokenizer and
# wrap them in a batched, label-free dataset for prediction.
testing_sentences = test["comment_text"].values
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = np.array(
    pad_sequences(
        testing_sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
)
test_dataset = tf.data.Dataset.from_tensor_slices(testing_padded)
test_dataset = test_dataset.padded_batch(
    BATCH_SIZE,
    tf.compat.v1.data.get_output_shapes(test_dataset),
)
Xtest = test_dataset

In [None]:
# Predict with the checkpoint that reached the lowest validation loss.
# Previously the checkpoint was loaded but `model` (last-epoch weights) was
# used for the prediction, so the loaded model was never used.
# NOTE(review): several experiments in this notebook write to this same path;
# confirm the file on disk belongs to the model trained just above.
best_model = tf.keras.models.load_model("./best_loss_model.h5")
# `Xtest` is an already-batched tf.data.Dataset, so no `batch_size` is passed.
y_test = best_model.predict(Xtest, verbose=1)
sub[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]] = y_test
sub.to_csv('submission.csv', index=False)

## GloVe 

In [None]:
# Parse the embedding file into a {token: float32 vector} lookup.
# Each line is split on single spaces: the first field is the token and the
# remaining fields are its coefficients.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as glove_file:
    for raw_line in glove_file:
        token, *vector_fields = raw_line.rstrip().split(' ')
        embeddings_index[token] = np.asarray(vector_fields, dtype='float32')

In [None]:
# Rebuild the train / validation pipelines. `test_dataset` was re-pointed at the
# unlabeled submission data in an earlier cell, so it is recreated here with labels.
label_names = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
in_validation_fold = df.kfold == 0

test_df = df[in_validation_fold].reset_index(drop=True)
train_df = df[df.kfold != 0].reset_index(drop=True)

training_sentences, training_labels = train_df.comment_text.values, train_df[label_names].values
testing_sentences, testing_labels = test_df.comment_text.values, test_df[label_names].values

# Vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=vocab_size,
                      filters='"#$%&+-/:;<=>@[\\]^_`{|}~\n',
                      oov_token=oov_tok,
                      lower=True)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = np.array(
    pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = np.array(
    pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
)
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

# Shuffle the training examples, then batch both pipelines.
shuffled_train = tf.data.Dataset.from_tensor_slices((training_padded, training_labels)).shuffle(BUFFER_SIZE)
train_dataset = shuffled_train.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(shuffled_train))

unbatched_valid = tf.data.Dataset.from_tensor_slices((testing_padded, testing_labels))
test_dataset = unbatched_valid.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(unbatched_valid))

In [None]:
# Build the embedding matrix: row i holds the pretrained vector of the token
# whose tokenizer index is i. Rows of tokens missing from `embeddings_index`
# (and row 0) stay all-zero.
word_index = tokenizer.word_index
num_words = min(vocab_size, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for token, row in word_index.items():
    if row < num_words:  # indices at or beyond num_words have no row
        pretrained = embeddings_index.get(token)
        if pretrained is not None:
            embedding_matrix[row] = pretrained

In [None]:
model = My_model_3(embedding_matrix)

# Use fresh checkpoint callbacks for this experiment. The module-level
# Checkpoint_cb_* objects were already used by an earlier `fit`; a
# ModelCheckpoint keeps its best score on the object, so reusing it would only
# overwrite the .h5 files when this model beats the previous experiment.
# NOTE(review): confirm this `best`-tracking behaviour for the installed Keras version.
checkpoint_callbacks = [
    ModelCheckpoint("best_AUC_model.h5", save_best_only=True, monitor='val_AUC', mode='max'),
    ModelCheckpoint("best_loss_model.h5", save_best_only=True, monitor='val_loss', mode='min'),
]

history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=test_dataset,
    verbose=1,
    callbacks=checkpoint_callbacks,
)
dashboard_training(history)

In [None]:
# Predict with the checkpoint that reached the lowest validation loss.
# Previously the checkpoint was loaded but `model` (last-epoch weights) was
# used for the prediction, so the loaded model was never used.
# NOTE(review): several experiments in this notebook write to this same path;
# confirm the file on disk belongs to the model trained just above.
best_model = tf.keras.models.load_model("./best_loss_model.h5")
# `Xtest` is an already-batched tf.data.Dataset, so no `batch_size` is passed.
y_test = best_model.predict(Xtest, verbose=1)
sub[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]] = y_test
sub.to_csv('submission.csv', index=False)

## fastText

In [None]:
# Path of the fastText vector file that is parsed into `embeddings_index_2`.
EMBEDDING_FILE_FASTTEXT = "../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec"

In [None]:
# Parse the fastText file into a {token: float32 vector} lookup.
embeddings_index_2 = {}
with open(EMBEDDING_FILE_FASTTEXT, encoding='utf8') as f:
    for line in f:
        values_2 = line.rstrip().split(' ')
        # Keep only lines with exactly one token plus `embedding_dim` numbers.
        # This skips the "<count> <dim>" header that `.vec` files start with,
        # which was previously stored as a word with a one-element vector.
        if len(values_2) != embedding_dim + 1:
            continue
        word_2 = values_2[0]
        coefs_2 = np.asarray(values_2[1:], dtype='float32')
        embeddings_index_2[word_2] = coefs_2

# Fail here, with a clear message, rather than later with an all-zero matrix.
if not embeddings_index_2:
    raise ValueError(
        f"No {embedding_dim}-dimensional vectors found in {EMBEDDING_FILE_FASTTEXT}"
    )

In [None]:
# Build the fastText embedding matrix: row i holds the pretrained vector of the
# token whose tokenizer index is i; rows without a usable vector stay all-zero.
word_index = tokenizer.word_index
num_words = min(vocab_size, len(word_index) + 1)
embedding_matrix_2 = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index_2.get(word)
    # Skip vectors of the wrong length: NumPy would broadcast a one-element
    # vector (e.g. a parsed file header) across the whole row without an error.
    if embedding_vector is not None and embedding_vector.shape == (embedding_dim,):
        embedding_matrix_2[i] = embedding_vector

In [None]:
model = My_model_3(embedding_matrix_2)

# Use fresh checkpoint callbacks for this experiment. The module-level
# Checkpoint_cb_* objects were already used by earlier `fit` calls; a
# ModelCheckpoint keeps its best score on the object, so reusing it would only
# overwrite the .h5 files when this model beats the previous experiments.
# NOTE(review): confirm this `best`-tracking behaviour for the installed Keras version.
checkpoint_callbacks = [
    ModelCheckpoint("best_AUC_model.h5", save_best_only=True, monitor='val_AUC', mode='max'),
    ModelCheckpoint("best_loss_model.h5", save_best_only=True, monitor='val_loss', mode='min'),
]

history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=test_dataset,
    verbose=1,
    callbacks=checkpoint_callbacks,
)
dashboard_training(history)

## GloVe + fastText

In [None]:
def _frozen_embedding_branch(sequence_input, embedding_matrix):
    """Return pooled features of one frozen-embedding -> BiLSTM -> Conv1D branch."""
    num_embeddings, vector_dim = embedding_matrix.shape
    x = tf.keras.layers.Embedding(num_embeddings, vector_dim, weights=[embedding_matrix], trainable=False)(sequence_input)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = tf.keras.layers.Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(x)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(x)
    return tf.keras.layers.concatenate([avg_pool, max_pool])


def My_model_4(embedding_matrix1,embedding_matrix2):
    """Build and compile a two-branch classifier, one branch per embedding matrix.

    Both branches read the same ``(max_length,)`` integer input. Each applies
    its own frozen Embedding, BiLSTM and Conv1D stack; the pooled features of
    the two branches are concatenated and fed to a 6-unit sigmoid layer.

    Parameters
    ----------
    embedding_matrix1, embedding_matrix2 : 2-D arrays
        Pretrained vectors, one row per token index. Each matrix's shape sets
        the size of its branch's Embedding layer.

    Returns
    -------
    tf.keras.Model
        The compiled model (binary cross-entropy, 'adam', accuracy / AUC).

    Notes
    -----
    Embedding sizes come from the matrices themselves rather than from the
    ``vocab_size`` / ``embedding_dim`` globals, which mismatched the matrix
    shape whenever the vocabulary was smaller than ``vocab_size``.
    """
    METRICS = [tf.keras.metrics.BinaryAccuracy(name='accuracy'),tf.keras.metrics.AUC(name='AUC')]
    sequence_input = tf.keras.layers.Input(shape=(max_length, ))

    x1 = _frozen_embedding_branch(sequence_input, embedding_matrix1)
    x2 = _frozen_embedding_branch(sequence_input, embedding_matrix2)

    x = tf.keras.layers.concatenate([x1, x2])
    preds = tf.keras.layers.Dense(6, activation="sigmoid")(x)
    model = tf.keras.models.Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)
    return model

In [None]:
# `embedding_matrix` is the matrix built in the GloVe section above.
embedding_matrix_1 = embedding_matrix

model = My_model_4(embedding_matrix_1, embedding_matrix_2)

# Render the layer graph to model.png with the public Keras utility. The
# `plot_model` imported at the top of the notebook comes from the private
# `tensorflow.python.keras` package, which is not part of the public API.
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, to_file='model.png')

In [None]:

# Use fresh checkpoint callbacks for this experiment. The module-level
# Checkpoint_cb_* objects were already used by earlier `fit` calls; a
# ModelCheckpoint keeps its best score on the object, so reusing it would only
# overwrite the .h5 files when this model beats the previous experiments.
# NOTE(review): confirm this `best`-tracking behaviour for the installed Keras version.
checkpoint_callbacks = [
    ModelCheckpoint("best_AUC_model.h5", save_best_only=True, monitor='val_AUC', mode='max'),
    ModelCheckpoint("best_loss_model.h5", save_best_only=True, monitor='val_loss', mode='min'),
]

history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=test_dataset,
    verbose=1,
    callbacks=checkpoint_callbacks,
)
dashboard_training(history)

In [None]:
# Predict with the checkpoint that reached the lowest validation loss.
# Previously the checkpoint was loaded but `model` (last-epoch weights) was
# used for the prediction, so the loaded model was never used.
# NOTE(review): several experiments in this notebook write to this same path;
# confirm the file on disk belongs to the model trained just above.
best_model = tf.keras.models.load_model("./best_loss_model.h5")
# `Xtest` is an already-batched tf.data.Dataset, so no `batch_size` is passed.
y_test = best_model.predict(Xtest, verbose=1)
sub[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]] = y_test
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame that was just written to submission.csv.
sub