In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Installing bayesian-optimization package
!pip install bayesian-optimization
# Installing for Stratified Split
!pip install iterative-stratification

In [None]:
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer 
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.merge import concatenate
from keras.optimizers import Adam
from keras.utils.vis_utils import plot_model
from keras import backend as K
# For hyperparameter tuning
from bayes_opt import BayesianOptimization
# For Splitting Data
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
# Ignore FutureWarning messages so they do not clutter the cell outputs
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Choose a tf.distribute strategy that matches the attached hardware
try:
    # The resolver takes no arguments when the TPU_NAME environment variable is
    # set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # Resolver could not be created: fall back to the non-TPU path below
    tpu = None

if not tpu:
    # TensorFlow's default strategy. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Sentinel handed to tf.data's prefetch() further down
AUTO = tf.data.experimental.AUTOTUNE
# Per-replica batch size; the global batch is scaled by the replica count
BATCH_SIZE = 64
TOTAL_BATCH_SIZE = strategy.num_replicas_in_sync * BATCH_SIZE
print("Total Batch Size:", TOTAL_BATCH_SIZE)

# Loading Data

In [None]:
# Read the training split from JSON, report its shape and preview the first rows
train_json = '/kaggle/input/toxic-comment-cnn-cleaned/train.json'
df_train = pd.read_json(train_json)
print('Shape=>', df_train.shape)
df_train.head()

In [None]:
# Read the test split from JSON, report its shape and preview the first rows
test_json = '/kaggle/input/toxic-comment-cnn-cleaned/test.json'
df_test = pd.read_json(test_json)
print('Shape=>', df_test.shape)
df_test.head()

# Tokenizing Text

In [None]:
# Build the word index from the training text only (the test text is not used here)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['cleaned'])

In [None]:
# Number of entries in the tokenizer's word index
print('Vocabulary Size=>', len(tokenizer.word_index))

In [None]:
# Turn each comment into its sequence of word indices, train first and then test
train_seq, test_seq = (
    tokenizer.texts_to_sequences(frame['cleaned'])
    for frame in (df_train, df_test)
)

# Padding

In [None]:
# Bring every sequence to a common length, padding with zero at the end
MAX_LEN = 100
train_seq = pad_sequences(train_seq, maxlen=MAX_LEN, padding='post')
test_seq = pad_sequences(test_seq, maxlen=MAX_LEN, padding='post')

In [None]:
# Size passed to the Embedding layer: every indexed word plus one extra slot
# (presumably for index 0, which the padding step uses — confirm)
vocabulary = 1 + len(tokenizer.word_index)
print('Vocabulary Size=>', vocabulary)

In [None]:
# Shapes of the padded arrays
print('Shape of train_sequence=>', train_seq.shape)
print('Shape of test_sequence=>', test_seq.shape)

In [None]:
# Target matrix: one column per label, in the same order used for the submission
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_train = df_train[label_columns].values
print(y_train.shape)

# Preparing Data for TPU

In [None]:
# Training pipeline: cache -> shuffle -> batch -> prefetch.
# * cache() comes first so the raw examples are cached, not one fixed shuffled/batched
#   order (caching after shuffle() would replay the same order every epoch).
# * The shuffle buffer spans the whole training set; the original `shuffle(42)` set a
#   42-element buffer, which barely perturbs the order. 42 is kept as the seed, which
#   is presumably what was intended.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_seq, y_train))
    .cache()
    .shuffle(buffer_size=len(train_seq), seed=42, reshuffle_each_iteration=True)
    .batch(TOTAL_BATCH_SIZE)
    .prefetch(AUTO)
)
# Inference pipeline: features only, original order preserved (no shuffle)
test_dataset = tf.data.Dataset.from_tensor_slices(test_seq)
test_dataset = test_dataset.batch(TOTAL_BATCH_SIZE).cache().prefetch(AUTO)

In [None]:
# Show the dataset descriptions, one per line
print(train_dataset, test_dataset, sep='\n')

# Modeling

In [None]:
def generate_model(filters, dropout, seq_len=100, embedding_dim=100, kernel_size=7):
    """Build and compile the single-branch 1-D CNN classifier.

    Architecture: Embedding -> Conv1D -> Dropout -> GlobalMaxPooling1D ->
    Dense(128, relu) -> Dense(6, sigmoid).

    Parameters
    ----------
    filters : int or float
        Number of Conv1D filters. Rounded to the nearest integer, so a float
        coming from a continuous search space is accepted.
    dropout : float
        Dropout rate applied to the convolution output.
    seq_len : int, optional
        Length of each input sequence (default 100, as previously hard-coded).
    embedding_dim : int, optional
        Size of each embedding vector (default 100, as previously hard-coded).
    kernel_size : int, optional
        Conv1D window width (default 7, as previously hard-coded).

    Returns
    -------
    A compiled Keras ``Model`` (Adam with learning rate 1e-3, binary cross-entropy loss).

    Notes
    -----
    Reads the module-level ``vocabulary`` for the embedding input size.
    """
    n_filters = int(round(filters))

    tokens = Input(shape=(seq_len,))
    embedded = Embedding(vocabulary, embedding_dim)(tokens)
    conv = Conv1D(filters=n_filters, kernel_size=kernel_size, padding="same")(embedded)
    conv = Dropout(dropout)(conv)
    pooled = GlobalMaxPooling1D()(conv)

    hidden = Dense(128, activation='relu')(pooled)
    output = Dense(6, activation='sigmoid')(hidden)

    model = Model(inputs=[tokens], outputs=output)
    # The bare string "accuracy" is resolved by Keras from the target/output shapes and,
    # for a 6-wide output, can become categorical (argmax) accuracy, which is not
    # meaningful for independent sigmoid labels. Request per-label binary accuracy
    # explicitly, keeping the logged name "accuracy".
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy")],
    )

    return model

In [None]:
# NOTE: the cross-validated objective below (meant to be passed to BayesianOptimization)
# is currently commented out, so nothing in this cell executes.
# NOTE(review): the final training cell hard-codes filters=352, dropout=0.06675,
# presumably the outcome of this search; confirm.
# NOTE(review): model.fit below passes batch_size=512 together with an already-batched
# tf.data.Dataset; the Keras docs say not to specify batch_size for dataset inputs,
# so drop that argument before re-enabling this cell.
# def evaluate_network(filters,dropout):
#     boot=MultilabelStratifiedKFold(n_splits=5,shuffle=True,random_state=42)
#     score_list=[]
    
#     # Loop through samples
#     for train, test in boot.split(train_seq,y_train):
#         # Creating Train Set
#         x_train_split,y_train_split=train_seq[train],y_train[train]
#         train_split=(
#             tf.data.Dataset
#             .from_tensor_slices((x_train_split, y_train_split))
#             .batch(TOTAL_BATCH_SIZE)
#             .cache()
#             .prefetch(AUTO)
#         )
#         # Creating Test Set
#         x_test_split,y_test_split=train_seq[test],y_train[test]
#         test_split=(
#             tf.data.Dataset
#             .from_tensor_slices((x_test_split,y_test_split))
#             .batch(TOTAL_BATCH_SIZE)
#             .cache()
#             .prefetch(AUTO)
#         )
        
#         with strategy.scope():
#             model=generate_model(filters,dropout)
#         es=EarlyStopping(monitor='val_loss', mode='min', verbose=0,patience=5,min_delta=1e-5)
#         model.fit(train_split, batch_size=512,
#                   epochs=100, verbose=0, validation_data=test_split, callbacks=[es])
        
#         # Validating on test split
#         pred=model.predict(test_split)
#         #Evaluating on ROC-AUC
#         score=roc_auc_score(y_test_split,pred)
#         score_list.append(score)
#         #Clearing session
#         K.clear_session()
        
#     return np.mean(score_list)

In [None]:
# # Bounding region for parameter space
# param_space={
#     "filters":(320,370),
#     "dropout":(0,0.1)
# }

In [None]:
# %%time

# # Running Bayesian Optimizer
# optimizer=BayesianOptimization(f=evaluate_network,pbounds=param_space,random_state=42,verbose=2)
# optimizer.maximize(init_points=2, n_iter=20)
# max_param=optimizer.max
# print(max_param)

In [None]:
# Build and compile the model inside the distribution strategy's scope
with strategy.scope():
    model = generate_model(352, 0.06675)

# Stop once val_loss has not improved by at least 1e-5 for 5 consecutive epochs.
# restore_best_weights=True rolls the in-memory model back to its best-val_loss epoch;
# without it, the predictions made after training would come from the weights of the
# last epoch (5 epochs past the best one) while the best model sat only on disk.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5,
                   min_delta=1e-5, restore_best_weights=True)
# Also persist the best-val_loss model to the working directory
mc = ModelCheckpoint("/kaggle/working/model.hdf5", monitor='val_loss', verbose=1,
                     save_best_only=True, mode='min')
model.fit(train_seq, y_train, batch_size=512,
          epochs=100, verbose=1, validation_split=0.2, callbacks=[es, mc])

In [None]:
# In-sample evaluation: ROC-AUC of the model's predictions on the full training array
train_pred = model.predict(train_seq)
train_auc = roc_auc_score(y_train, train_pred)
print('In-sample Evaluation ROC-AUC Score:\n', train_auc)

In [None]:
# Predict label probabilities for the batched test pipeline
final_pred = model.predict(test_dataset)

In [None]:
# DataFrame of final probabilities: id first, then one column per label
label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
prob = pd.DataFrame(index=df_test.index, columns=['id'] + label_names)
prob['id'] = df_test['id']
# Column k of the prediction array belongs to the k-th label
for column_idx, label in enumerate(label_names):
    prob[label] = final_pred[:, column_idx]

In [None]:
# Display the assembled submission table
prob

In [None]:
# Write the submission CSV (relative path, no index column)
prob.to_csv('submission-CNN-single-2-100-100-7-opt-opt.csv', index=False)