In [None]:
import os
import pandas as pd
import numpy as np
import time
from IPython.core.display import display, HTML
from tqdm.notebook import tqdm

import tensorflow as tf

from tensorflow.keras.layers import Dense, Input,Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
import pickle
from keras.callbacks import ModelCheckpoint
import os
import pandas as pd
from kaggle_datasets import KaggleDatasets




In [None]:
# With 80 percent of the data dropped, accuracy was 0.4785.
# With 70 percent of the data dropped, accuracy was 0.4560.
# With the RMSprop optimizer and 10 epochs, accuracy was 0.4182. So what does this mean? There was a bug: we were reinitializing the tokenizer for the test set!
# With 10 percent dropped and 10 epochs, using `toxic` as the label instead of an int.
# Also check what happens with an increase in word size, different learning-rate settings, steps, keeping the best model, and `toxic` as an int.

In [None]:
def get_strategy():
    """Detect the available hardware and return a matching distribution strategy.

    Returns:
        A TPU strategy when a TPU cluster can be resolved, otherwise
        TensorFlow's default strategy. The replica count is printed either way.
    """
    try:
        # TPU detection. No arguments are needed when the TPU_NAME environment
        # variable is set, which is always the case on Kaggle.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', resolver.master())
    except ValueError:
        # A ValueError here is treated as "no TPU available".
        resolver = None

    if not resolver:
        # Default distribution strategy in TensorFlow. Works on CPU and single GPU.
        strategy = tf.distribute.get_strategy()
    else:
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)

    print("REPLICAS: ", strategy.num_replicas_in_sync)
    return strategy

# Distribution strategy shared by the rest of the notebook (TPU if available).
strategy = get_strategy()

# Training hyper-parameters.
# NOTE: EPOCHS is reassigned to 10 in the training cell further down.
EPOCHS = 1
# Global batch size: 128 examples per replica.
BATCH_SIZE = 128 * strategy.num_replicas_in_sync
# Used as the Tokenizer's num_words and the Embedding layer's input_dim.
VOCAB_SIZE = 100000
# Output dimension of the Embedding layer.
EMBEDDING_DIM=128
# Every sequence is padded/truncated to this many token ids.
MAX_LEN=256

# When True, get_data_using_pandas returns only the first 10000 rows of each frame.
SMALL_DATA = False

In [None]:
def get_file_path():
    """Return the paths of the competition CSV files on Kaggle.

    This only builds path strings; it does not touch the filesystem. (The
    original called ``os.listdir`` on the input directory and threw the result
    away, which added I/O and made the function fail outside Kaggle.)

    Returns:
        tuple[str, str, str, str, str]: ``(train_file, bias_file,
        validation_file, test_file, sub_file)``.
    """
    base_path = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/"
    file_names = (
        "jigsaw-toxic-comment-train.csv",   # train_file
        "jigsaw-unintended-bias-train.csv", # bias_file
        "validation.csv",                   # validation_file
        "test.csv",                         # test_file
        "sample_submission.csv",            # sub_file
    )
    train_file, bias_file, validation_file, test_file, sub_file = (
        base_path + name for name in file_names
    )
    return train_file, bias_file, validation_file, test_file, sub_file

In [None]:
def get_data_using_pandas(train_file, bias_file, validation_file):
    """Load the three CSV files into DataFrames and preview their first row.

    Args:
        train_file: Path of the toxic-comment training CSV.
        bias_file: Path of the unintended-bias training CSV.
        validation_file: Path of the validation CSV.

    Returns:
        ``(train_data, bias_data, validation_data)``. The train and bias frames
        get a constant ``lang`` column set to ``"en"``; missing values in the
        train frame are replaced with -1. When the notebook-level ``SMALL_DATA``
        flag is set, only the first 10000 rows of each frame are returned.
    """
    train_data, bias_data, validation_data = (
        pd.read_csv(path) for path in (train_file, bias_file, validation_file)
    )

    # Both training sources are tagged as English.
    for english_frame in (train_data, bias_data):
        english_frame["lang"] = "en"
    train_data = train_data.fillna(-1)

    # Show the first row of each frame as an HTML table.
    for frame in (train_data, bias_data, validation_data):
        display(HTML(frame.head(1).to_html()))

    if not SMALL_DATA:
        return train_data, bias_data, validation_data
    return train_data.head(10000), bias_data.head(10000), validation_data.head(10000)

def describe_data(train_data, bias_data, validation_data):
    """Render the transposed ``describe()`` summary of each frame as HTML."""
    for frame in (train_data, bias_data, validation_data):
        summary = frame.describe().T
        display(HTML(summary.to_html()))
    
    
def get_statistics(train_data, bias_data, validation_data):
    """Print the row counts of the training frames and the validation language mix.

    Args:
        train_data: Training DataFrame (only its row count is used).
        bias_data: Bias DataFrame (only its row count is used).
        validation_data: Validation DataFrame with a ``lang`` column.
    """
    n_train = len(train_data)
    n_bias = len(bias_data)
    rows_per_language = validation_data["lang"].value_counts()

    print("Train Data:------------------------\n", n_train)
    print("Bias Data:------------------------\n", n_bias)
    print("Valdation Data:--------------------\n", rows_per_language)
   

In [None]:
def get_tokenized_data(data, chunk_size=None):
    """Tokenize ``data`` chunk by chunk and return the concatenated result.

    Each chunk is copied before being handed to ``tokenize_data``, so the
    input frame itself is not modified.

    Args:
        data: DataFrame to tokenize (must satisfy ``tokenize_data``'s needs).
        chunk_size: Rows per chunk. Defaults to the notebook-level
            ``CHUNK_SIZE`` when omitted.

    Returns:
        A DataFrame made of all tokenized chunks, in the original row order.

    Raises:
        ValueError: From ``pd.concat`` when ``data`` has no rows, because no
            chunk is produced.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE

    total_rows = data.shape[0]
    chunks = []
    processed_rows = 0
    for start in tqdm(range(0, total_rows, chunk_size)):
        # Clamp so the reported end index never runs past the data.
        stop = min(start + chunk_size, total_rows)
        chunk = tokenize_data(data.iloc[start:stop].copy())
        chunks.append(chunk)
        # Count rows, not chunks: the message reports "records".
        processed_rows += len(chunk)
        print("Processed from {} to {}. Now we have {} records".format(start, stop, processed_rows))
    return pd.concat(chunks)

In [None]:
# Notebook-level tokenizer; it is None until the preprocessing cell assigns a
# fitted Tokenizer to it.
tokenizer = None

def tokenize_data(data):
    """Add token-id columns to ``data`` using the notebook-level ``tokenizer``.

    The frame is modified in place and also returned.

    Args:
        data: DataFrame with a ``comment_text`` column.

    Returns:
        The same DataFrame with two new columns: ``sequences`` (token ids per
        comment) and ``embedding_input`` (ids padded/truncated at the end to
        ``MAX_LEN``, stored as Python lists).
    """
    comments = data["comment_text"].values
    data["sequences"] = tokenizer.texts_to_sequences(comments)

    padded = pad_sequences(
        data["sequences"].values,
        maxlen=MAX_LEN,
        padding="post",
        truncating="post",
    )
    data["embedding_input"] = padded.tolist()
    return data

In [None]:
def build_model():
    """Build and compile the classifier: embedding -> average pooling -> dense head.

    Returns:
        A compiled ``tf.keras.Sequential`` model with a single sigmoid output,
        using the Adam optimizer, binary cross-entropy loss and accuracy metric.
    """
    model = tf.keras.Sequential()
    # NOTE(review): the layer name "embedding" presumably has to line up with the
    # "embedding_input" feature key of the datasets — confirm before renaming.
    model.add(
        Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=EMBEDDING_DIM,
            input_length=MAX_LEN,
            name="embedding",
        )
    )
    model.add(GlobalAveragePooling1D())
    model.add(Dense(24, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [None]:
def fix_data(data, label):
    """Parse the stringified ``embedding_input`` feature into an int32 tensor.

    Args:
        data: Feature dict holding an ``embedding_input`` string tensor.
        label: Label tensor, passed through untouched.

    Returns:
        ``(data, label)`` where ``data["embedding_input"]`` has been replaced
        by an int32 tensor of shape ``[MAX_LEN]``.
    """
    raw = tf.strings.strip(data["embedding_input"])
    # Drop the first and last character (the delimiters around the list).
    inner = tf.strings.substr(raw, 1, tf.strings.length(raw) - 2)
    pieces = tf.strings.split(inner, ',')
    token_ids = tf.strings.to_number(pieces, tf.int32)
    # Force a static shape; needed for XLA compilation (TPU).
    data["embedding_input"] = tf.reshape(token_ids, [MAX_LEN])
    return data, label

In [None]:
# Pipeline switch read by the guarded cells below.
#   True  -> skip exploration/preprocessing, load the pre-tokenized "toxic-ds"
#            files, then train and predict.
#   False -> run exploration and rebuild the tokenized files from the raw CSVs.
bln_skip = True

In [None]:
GCS_PATH = KaggleDatasets().get_gcs_path("toxic-ds") # you can list the bucket with "!gsutil ls $GCS_PATH"
    
train_dataset_file  = GCS_PATH + "/train_dataset.csv"
validation_dataset_file  = GCS_PATH + "/validation_dataset.csv"
#test_dataset_file = GCS_PATH + "/x_test.pkl"
!gsutil ls $GCS_PATH


In [None]:
# Paths of the competition CSVs (train, bias, validation, test, sample submission).
train_file, bias_file, validation_file, test_file, sub_file = get_file_path()

In [None]:

# Load the three raw frames; this also displays the first row of each.
train_data, bias_data, validation_data = get_data_using_pandas(train_file, bias_file, validation_file)

In [None]:
# Exploration only: summary statistics of each frame (skipped when bln_skip is True).
if not bln_skip:
    describe_data(train_data, bias_data, validation_data)

In [None]:
# Exploration only: row counts and validation language distribution.
if not bln_skip:
    get_statistics(train_data, bias_data, validation_data)

In [None]:
if not bln_skip:
    # Merge all three sources into one training frame.
    # ignore_index=True gives the merged frame a fresh, unique index. Each
    # source frame has its own index, so without this the labels repeat across
    # frames and the later label-based `train_data.drop(drop_data.index.values)`
    # would also remove unrelated rows that happen to share an index label.
    train_data = pd.concat([train_data, bias_data, validation_data], ignore_index=True)

In [None]:
if not bln_skip:
    # Drop the notebook-level names of the two extra source frames.
    del bias_data, validation_data

In [None]:
# Display the current value of the pipeline switch as the cell output.
bln_skip

In [None]:
if not bln_skip:
    # ---- Preprocessing path: rebuild the tokenized artefacts from the raw data ----
    # This branch writes train_dataset.csv, validation_dataset.csv and x_test.pkl.
    # It does not define train_size, valid_size, train_dataset or
    # validation_dataset; the training cells below only run when bln_skip is True.

    # Fit tokenizer
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_data["comment_text"].values.tolist())
    CHUNK_SIZE = 50000  # rows per chunk, read by get_tokenized_data

    # Binary label (toxic score >= 0.5) and a combined label+language key
    # used to stratify the train/validation split.
    train_data["label"] = train_data["toxic"].map(lambda x: 1 if x >=0.5 else 0)
    train_data["lang_label"] = train_data["label"].astype(str) + train_data["lang"]

    # Length bucket: word count // 10, with everything of 500+ words in bucket 50.
    train_data["len"] = train_data["comment_text"].map(lambda x: len(x.split(" ")))
    train_data["len"] = train_data["len"].map(lambda x: x//10 if x < 500 else 50)

    # Down-sample: remove 10% of the English non-toxic rows, stratified by length bucket.
    # NOTE(review): rows are dropped by index label, so this assumes train_data
    # has a unique index at this point.
    en_notoxic_data = train_data[(train_data["lang"]=="en") & (train_data["label"]==0)]
    drop_data, _ = train_test_split(en_notoxic_data, train_size=0.1, stratify=en_notoxic_data["len"])
    train_data.drop(drop_data.index.values, inplace=True)

    # 80/20 split, stratified on label+language.
    train_nlp_data, validation_nlp_data = train_test_split(train_data, train_size=0.8, stratify=train_data["lang_label"], shuffle=True)

    tokenized_train_data = get_tokenized_data(train_nlp_data)
    tokenized_validation_data = get_tokenized_data(validation_nlp_data)

    # The raw `toxic` column (not the binarised `label`) is exported as the target.
    tokenized_train_data[["embedding_input","toxic"]].to_csv("./train_dataset.csv", index=False)
    tokenized_validation_data[["embedding_input","toxic"]].to_csv("./validation_dataset.csv", index=False)

    del train_data, train_nlp_data, validation_nlp_data

    # Tokenize the test set with the SAME fitted tokenizer (tokenize_data adds
    # the columns to test_data in place).
    test_data = pd.read_csv(test_file)
    test_data = test_data.rename(columns={"content":"comment_text"})
    print(test_data.head(1))
    tokenize_data(test_data)
    x_test = test_data["embedding_input"].values.tolist()
    # Context manager so the file is flushed and closed deterministically.
    with open("x_test.pkl", "wb") as pickle_file:
        pickle.dump(x_test, pickle_file)

else:
    # ---- Fast path: load the pre-tokenized artefacts from the "toxic-ds" dataset ----
    GCS_PATH = KaggleDatasets().get_gcs_path("toxic-ds") # you can list the bucket with "!gsutil ls $GCS_PATH"

    train_dataset_file  = GCS_PATH + "/train_dataset.csv"
    validation_dataset_file  = GCS_PATH + "/validation_dataset.csv"
    test_dataset_file = "../input/toxic-ds" + "/x_test.pkl"
    # SECURITY: pickle.load can execute arbitrary code; only load files you produced yourself.
    with open(test_dataset_file, 'rb') as pickle_file:
        x_test = pickle.load(pickle_file)

    # The CSVs are read with pandas only to obtain the row counts, which the
    # training cell uses to derive the number of steps per epoch.
    tokenized_train_data = pd.read_csv(train_dataset_file)
    tokenized_validation_data = pd.read_csv(validation_dataset_file)

    train_size = tokenized_train_data.shape[0]
    print("Train data size:", train_size)

    valid_size = tokenized_validation_data.shape[0]
    print("Valid data size:", valid_size)
    del tokenized_train_data, tokenized_validation_data

    # Stream the CSVs record by record (batch_size=1 followed by unbatch), parse
    # the stringified token lists with fix_data, then batch for training.
    train_dataset = tf.data.experimental.make_csv_dataset(train_dataset_file, label_name="toxic", batch_size=1)
    validation_dataset = tf.data.experimental.make_csv_dataset(validation_dataset_file, label_name="toxic", batch_size=1)

    train_dataset = (
        train_dataset
        .unbatch()
        .map(fix_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    validation_dataset = (
        validation_dataset
        .unbatch()
        .map(fix_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )






In [None]:
# Sanity check: print the feature dict of the first training batch.
if bln_skip:
    for data,label in train_dataset.take(1):
        print(data)

In [None]:
if bln_skip:
    from tensorflow.keras.models import model_from_json

    # Development toggle (previously a hard-coded `if 1==1`): set to True to
    # restore a previously exported model instead of building a fresh one.
    LOAD_SAVED_MODEL = False

    # Everything that creates variables (the model, the optimizer and
    # compile()) is done inside the strategy scope so that it is placed
    # correctly when running on a TPU.
    with strategy.scope():
        if not LOAD_SAVED_MODEL:
            model = build_model()
        else:
            prefix = "../input/from-first-principles-toxic-or-not/"
            # Context manager so the JSON file is always closed.
            with open(prefix + 'model.json', 'r') as json_file:
                loaded_model_json = json_file.read()
            model = model_from_json(loaded_model_json)
            # load weights into new model
            model.load_weights(prefix + "model.h5")

        # Re-compile with RMSprop; this replaces the Adam setup done in build_model.
        rmsprop = RMSprop()
        model.compile(optimizer=rmsprop, loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()

In [None]:
if False:  # disabled: manual cleanup of large arrays, kept for reference
    del x_train, y_train, x_valid, y_valid, x_test

In [None]:
if bln_skip:
    EPOCHS = 10
    filepath = "toxic_model.hdf5"

    # Use the tf.keras callback classes explicitly: the notebook also imports
    # ModelCheckpoint from standalone `keras`, which shadows the tf.keras one,
    # and `model` is a tf.keras model.
    # Keeps only the weights of the epoch with the best validation accuracy.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max'
    )
    # No trailing comma here: the original one turned this into a 1-tuple.
    # NOTE(review): patience=30 is larger than EPOCHS, so this cannot trigger
    # with the current settings.
    earlystopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=30, mode='min', min_delta=0.0001
    )
    callbacks_list = [checkpoint, earlystopping]

    # Step counts are derived from the row counts measured when the CSVs were loaded.
    n_steps = train_size // BATCH_SIZE
    print("Steps:", n_steps)
    train_history = model.fit(
        train_dataset,
        steps_per_epoch=n_steps,
        validation_data=validation_dataset,
        validation_steps=valid_size//BATCH_SIZE,
        epochs=EPOCHS,
        callbacks=callbacks_list,  # previously built but never passed to fit()
    )

if False:  # disabled: export the architecture as JSON and the weights as HDF5
    with open("./model.json", "w") as json_file:
        json_file.write(model.to_json())
    # serialize weights to HDF5
    model.save_weights("./model.h5")
    print("Saved model to disk")

In [None]:
if bln_skip:
    # Wrap the padded test sequences in a tf.data pipeline, 8 rows per batch.
    test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(8)

In [None]:
# Predict on the test set and write the submission file.
if bln_skip:
    # Start from the sample submission and overwrite its `toxic` column
    # with the model's predictions.
    sub = pd.read_csv(sub_file)
    sub['toxic'] = model.predict(test_dataset, verbose=1)
    sub.to_csv('submission.csv', index=False)

In [None]:
#from kaggle_datasets import KaggleDataset
#GCS_PATH = KaggleDatasets().get_gcs_path() # you can list the bucket with "!gsutil ls $GCS_PATH"