In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Setup

In [None]:
# Install extra packages into the session and download tokenization.py into the working directory.
# NOTE(review): tensorflow-text is not version-pinned, unlike the two packages below — confirm the
# version pip resolves is compatible with tf-models-official==2.4.0.
!pip install tensorflow-text
!pip install bert-tensorflow==1.0.1
!pip install -q tf-models-official==2.4.0
# NOTE(review): fetched from the `master` branch, so the downloaded file can change (or move) over time.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import re

import nltk
import tensorflow as tf
import tensorflow_text  # NOTE(review): not referenced by name below; presumably imported for its side effects — confirm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Fetch the NLTK corpora the objects below read: the stop-word list, and the WordNet data
# that WordNetLemmatizer looks up when lemmatize() is called.
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lem = WordNetLemmatizer()
ps = PorterStemmer()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Input,GlobalMaxPooling1D
from keras.layers import Conv1D,MaxPooling1D,Embedding,Bidirectional
from keras.layers import LSTM,Dropout
from keras import regularizers
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import ReduceLROnPlateau, TensorBoard
from sklearn.metrics import roc_auc_score

## Configuration

In [None]:
# some configuration
max_sequence_length = 100  # comments are padded/truncated to this many tokens by pad_sequences
max_vocab_size = 20000  # passed to the Keras Tokenizer as num_words and caps the embedding matrix rows
embedding_dim = 300  # selects the glove.6B.<dim>d.txt file and the width of the embedding matrix
validation_split = 0.2  # validation fraction
batch_size = 128  # batch size passed to the fit() calls
epoch = 5  # number of epochs for the Hybrid/CNN/LSTM fits (the BERT section defines its own `epochs`)

# Hidden-unit presets; `size` selects the entry the models read via sizes[size].
sizes ={"tiny":16,"mini":32,"small":64,"medium":128,"large":256,"grand":512}
size = "tiny"

In [None]:
# Read the competition train/test CSVs straight from the zipped files.
df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
df2 = df.copy()  # independent copy; the cleaning step later reads comment_text from this frame
df_test = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

In [None]:
# Preview both frames. Only the last expression of a cell is rendered automatically, so a bare
# `df.head()` followed by another expression is silently dropped; display both explicitly.
from IPython.display import display

display(df.head(), df_test.head())

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Length (in characters) of the longest training comment. `Series.max()` on a string column
# returns the lexicographically greatest comment, not the longest one, so measure the lengths
# first and take the maximum of those.
df.comment_text.str.len().max()

In [None]:
# Summary statistics of the numeric columns of the training frame.
df.describe()

In [None]:
# Count how many comments carry each label.
# Assumes every non-object column of df is a 0/1 label column — TODO confirm.
column_list = [f for f in df.columns if df.dtypes[f] != 'object']
label_hits = df[column_list] == 1
dfP = label_hits.sum().to_frame().T  # one row (index 0), one column per label
# A comment can carry several labels at once, so subtracting the summed label counts from the
# row count under-counts clean comments. Count the rows that have no label set instead.
dfP['non_hate'] = int((~label_hits.any(axis=1)).sum())

In [None]:
# Pie chart of the label counts collected in dfP (one slice per column).
fig, ax = plt.subplots(figsize=[13, 10])
ax.pie(
    x=dfP.values[0],
    autopct="%.2f%%",  # matplotlib formats each slice's percentage with this; two decimals plus a % sign
    explode=[0.05] * len(dfP.values[0]),
    labels=dfP.keys(),
    pctdistance=0.55,
)
ax.set_title("Types of Toxic Comments", fontsize=14);
# dfP / column_list are intentionally left defined so that this cell can be re-run on its own.

## Preprocessing

In [None]:
def clean_text(text):
    """Normalise one raw comment into lower-case, punctuation-free, lemmatized text.

    Steps, in order: lower-case; strip URLs, ``#hashtags``, ``@mentions``, HTML-like
    tags and digits; strip punctuation; lemmatize every whitespace-separated token
    with the module-level ``lem`` and re-join the tokens with single spaces.

    Args:
        text: the raw comment. Anything that is not a ``str`` (e.g. ``None`` or a
            NaN coming from pandas) is treated as an empty comment.

    Returns:
        The cleaned comment as a single string; runs of whitespace are collapsed
        to one space and leading/trailing whitespace is removed.
    """
    if not isinstance(text, str):
        # Missing values carry no text to clean; avoid crashing on .lower().
        return ''

    text = text.lower()
    text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', text) # clean url
    text = re.sub(r'#(\w+)', '', text)   # clean hashes
    text = re.sub(r'@(\w+)', '', text)   # clean @
    text = re.sub(r'<[^>]+>', '', text)  # clean tags
    text = re.sub(r'\d+', '', text)      # clean digits
    # The original class contained `+-:`, which the regex engine reads as the character RANGE
    # '+'..':' (it only removed '-' and '/' by accident). The hyphen is escaped and '/' listed
    # explicitly so the same characters are removed on purpose.
    text = re.sub(r'[,!@\'\"?\.$%_&#*+\-/:;]', '', text)   # clean punctuation
    # The lemmatizer works on single words; handing it the whole comment leaves the text
    # essentially untouched, so lemmatize token by token.
    return ' '.join(lem.lemmatize(token) for token in text.split())


## Creating embedding layer

In [None]:
# load in pre-trained vectors
# loading word vectors by using pre trained glove.6B.txt file
print('Loading word vectors...')
word2vec = {}
glove_path = '../input/glove6b/glove.6B.%sd.txt' % embedding_dim
# Be explicit about the encoding so loading does not depend on the platform default.
with open(glove_path, encoding='utf8') as f:
    # each line: word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        # float32 is ample for embedding weights; 'float128' is not available on every
        # platform and takes four times the memory per value.
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

In [None]:
# Number of missing values per column of the training frame.
df.isnull().sum()

In [None]:
# Run every training comment through clean_text; `sentences` is the name the later cells use.
print('loading in comments...')
sentences = clean_sentences = df2['comment_text'].apply(clean_text)
sentences[0]

In [None]:
print('loading in test comments...')
# Clean the test comments exactly like the training comments: the tokenizer below is fitted on
# cleaned training text, so raw test text would be encoded in a different form than the models saw.
test_sentences = df_test['comment_text'].apply(clean_text).values

In [None]:
# The six target columns; train_labels is the (rows x labels) array taken from the training frame.
possible_labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
train_labels = df[possible_labels].values

In [None]:
# convert the sentences into tokens/integers
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(sentences)  # the vocabulary is built from the cleaned training comments only
sequences = tokenizer.texts_to_sequences(sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)  # test text is encoded with the training vocabulary

In [None]:
# word -> integer index mapping learned by the tokenizer
word2idx = tokenizer.word_index
print(f'Found {len(word2idx)} unique tokens.')

In [None]:
# pad sequences so that we get a NxT matrix (T = max_sequence_length)
train_data = pad_sequences(sequences,maxlen=max_sequence_length)

In [None]:
# Same padding for the test sequences; the bare name shows the resulting array as the cell output.
test_data = pad_sequences(test_sequences,maxlen=max_sequence_length)
test_data

In [None]:
# Build the (num_words x embedding_dim) weight matrix: row i holds the pre-trained vector
# of the word whose tokenizer index is i.
print('Filling pre-trained embeddings...')
num_words = min(max_vocab_size, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for token, idx in word2idx.items():
    if idx >= max_vocab_size:
        # index falls outside the vocabulary cap, so it has no row in the matrix
        continue
    vector = word2vec.get(token)
    if vector is None:
        # words not found in embedding index will be all zeros.
        continue
    embedding_matrix[idx] = vector

In [None]:
import datetime

In [None]:
# CyclicLR is imported from a `cyclic_lr` module that is not defined in this notebook.
from cyclic_lr import CyclicLR

# One cyclical learning-rate callback per model; each is passed to the matching fit() call below.
clr_cnn = CyclicLR(base_lr=8e-5,max_lr=4e-4,step_size=4000,mode='triangular2')

clr_lstm = CyclicLR(base_lr=1e-6,max_lr=1e-4,step_size=4000,mode='triangular2')

clr_hybrid = CyclicLR(base_lr=1e-6,max_lr=1e-4,step_size=2000,mode='triangular2')

In [None]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False).
# The same layer object is added to the Hybrid, CNN and LSTM models below.
embedding_layer = Embedding(num_words,
                           embedding_dim,
                           weights=[embedding_matrix],
                           input_length= max_sequence_length,
                           trainable = False,name='Embedding')


In [None]:
# Create the train/validation sets. The held-out fraction comes from the `validation_split`
# setting in the configuration cell (0.2) instead of a second hard-coded copy of the value.
# Note: X_test / y_test are the validation split of the training data, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=validation_split, random_state=42)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop training once val_loss has not improved by at least 0.005 for 2 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss',patience=2,mode='min',min_delta=0.005)



# NOTE(review): reduce_lr and lr_finder are created here but are not passed to any of the
# fit() calls visible in this notebook (those use the clr_* callbacks and early_stop).
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3,
                                                     patience=2, min_lr=0.000001)
# LRFinder is imported from an `lrfinder` module that is not defined in this notebook.
from lrfinder import LRFinder
# NOTE(review): steps_per_epoch=998 is hard-coded; presumably ceil(len(X_train) / batch_size) — confirm.
lr_finder = LRFinder(min_lr=1e-7, 
                                 max_lr=1e-2, 
                                 steps_per_epoch=998, 
                                 epochs=5)


In [None]:
def print_auc_loss(history):
    """Plot training/validation loss and AUC per epoch from a Keras ``History``.

    Prints the recorded metric names, then draws two stacked panels in one
    figure: loss on top, AUC below.

    Args:
        history: object whose ``history`` attribute is a dict mapping metric
            names to per-epoch value lists. It must contain ``loss``,
            ``val_loss`` and an AUC metric with its ``val_`` counterpart.

    Raises:
        KeyError: if no AUC (or validation AUC) entry is present.
    """
    history_dict = history.history
    print(history_dict.keys())

    def _metric_key(base):
        # Keras gives each new instance of the same metric a unique name ('auc', 'auc_1',
        # 'auc_2', ...), so only the first model compiled in a session records plain 'auc'.
        for key in history_dict:
            if key == base or key.startswith(base + '_'):
                return key
        raise KeyError(base)

    acc = history_dict[_metric_key('auc')]
    val_acc = history_dict[_metric_key('val_auc')]
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']

    epochs = range(1, len(acc) + 1)
    # Two stacked panels in a single figure: loss on top, AUC underneath.
    fig, (ax_loss, ax_auc) = plt.subplots(2, 1, figsize=(10, 6))

    # 'r' = solid red line (training), 'b' = solid blue line (validation)
    ax_loss.plot(epochs, loss, 'r', label='Training loss')
    ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
    ax_loss.set_title('Training and validation loss')
    ax_loss.set_xlabel('Epochs')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()

    ax_auc.plot(epochs, acc, 'r', label='Training AUC')
    ax_auc.plot(epochs, val_acc, 'b', label='Validation AUC')
    ax_auc.set_title('Training and validation AUC')
    ax_auc.set_xlabel('Epochs')
    ax_auc.set_ylabel('AUC')
    ax_auc.legend(loc='lower right')

    fig.tight_layout()  # keep the two panels' titles and axis labels from overlapping
    plt.show()

## Hybrid Model

In [None]:
# Hybrid model: shared frozen embedding -> bidirectional LSTM (full sequence output) ->
# Dense + Dropout -> Conv1D -> global max pooling -> Dense + Dropout -> sigmoid classifier.
model_hybrid = Sequential(name='Hybrid')
model_hybrid.add(embedding_layer)
model_hybrid.add(Bidirectional(LSTM(sizes[size],return_sequences=True),name='Bidirectional'))
model_hybrid.add(Dense(sizes[size],name='Dense1'))
model_hybrid.add(Dropout(0.2,name='Dropout1'))
model_hybrid.add(Conv1D(sizes[size],3))
model_hybrid.add(GlobalMaxPooling1D(name='Pooling'))
model_hybrid.add(Dense(sizes[size],name='Dense2'))
model_hybrid.add(Dropout(0.2,name='Dropout2'))
# One sigmoid output per label, derived from possible_labels like the CNN and LSTM models
# rather than a hard-coded 6.
model_hybrid.add(Dense(len(possible_labels),activation='sigmoid',name='Classifier'))
model_hybrid.summary()
model_hybrid.compile(loss = 'binary_crossentropy', optimizer = Adam(), metrics = ['AUC'])
history_hybrid = model_hybrid.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=epoch,
    batch_size=batch_size,
    callbacks=[clr_hybrid, early_stop],
)

In [None]:
# Loss and AUC curves of the hybrid model's training run.
print_auc_loss(history_hybrid)

In [None]:
#Prints the learning rate finder loss vs lr
#lr_finder.plot_loss()
#lr_finder.plot_lr()

## CNN Model

In [None]:
print('Training model')

# CNN model: shared frozen embedding -> Conv1D -> max pooling -> Conv1D -> global max pooling ->
# Dense + Dropout -> sigmoid classifier (one output per label).
# The stand-alone `Input(...)` tensor the cell used to create was never connected to the
# Sequential model, so it has been removed.
model_cnn = Sequential(name='CNN')
model_cnn.add(embedding_layer)
model_cnn.add(Conv1D(sizes[size],3,activation='relu',name='Convolutional1'))
model_cnn.add(MaxPooling1D(3))
model_cnn.add(Conv1D(sizes[size],3,activation='relu',name='Convolutional2'))
model_cnn.add(GlobalMaxPooling1D(name='Pooling'))
model_cnn.add(Dense(sizes[size],name='Dense'))
model_cnn.add(Dropout(0.2))
model_cnn.add(Dense(len(possible_labels),activation='sigmoid',name='Classifier'))

model_cnn.compile(loss='binary_crossentropy',
             optimizer=Adam(),
             metrics=['AUC'])


history_cnn = model_cnn.fit(X_train,y_train,batch_size=batch_size, epochs=epoch,validation_data=(X_test,y_test),callbacks=[clr_cnn,early_stop])

In [None]:
# Loss and AUC curves of the CNN model's training run.
# (disabled) model_losses = pd.DataFrame(history_cnn.history)
print_auc_loss(history_cnn)

In [None]:
#Plots the learning rate when using lr_finder
#lr_finder.plot_loss()
#lr_finder.plot_lr()

## LSTM Model

In [None]:
print('Training model')

# LSTM model: shared frozen embedding -> bidirectional LSTM -> two Dense layers -> sigmoid
# classifier (one output per label). The Dropout layers are currently disabled.
model_lstm = Sequential(name='LSTM')
model_lstm.add(embedding_layer)
model_lstm.add(Bidirectional(LSTM(sizes[size]),name='BidirectionalLSTM'))
model_lstm.add(Dense(sizes[size],name='Dense'))
# (disabled) model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(sizes[size]))
# (disabled) model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(len(possible_labels),activation='sigmoid',name='Classifier'))

# Unlike the other two models, this one also tracks accuracy alongside AUC.
model_lstm.compile(loss='binary_crossentropy',
             optimizer=Adam(),
             metrics=['AUC','accuracy'])


history_lstm = model_lstm.fit(X_train,y_train,batch_size=batch_size, epochs=epoch,validation_data=(X_test,y_test),callbacks=[clr_lstm,early_stop])



In [None]:
# Loss and AUC curves of the LSTM model's training run.
print_auc_loss(history_lstm)

## Finding the mean AUC score of the models trained.

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [None]:
# Accumulates one "auc <model>:<score>" line per model; the cells below append to it, so
# re-run this cell first to avoid duplicated lines when re-running them.
total_string = ""

In [None]:
#LSTM
# Mean of the per-label ROC-AUC scores on the validation split.
pL = model_lstm.predict(X_test)
# One score per label column; the label count is read from y_test instead of a hard-coded 6.
aucsL = [roc_auc_score(y_test[:, j], pL[:, j]) for j in range(y_test.shape[1])]
mean_auc_lstm = np.mean(aucsL)  # computed once, reused for both the printout and the summary
print("auc lstm:" + str(mean_auc_lstm))
total_string += "auc lstm:" + str(mean_auc_lstm) + "\n"

In [None]:
#CNN
# Mean of the per-label ROC-AUC scores on the validation split.
pC = model_cnn.predict(X_test)
# One score per label column; the label count is read from y_test instead of a hard-coded 6.
aucsC = [roc_auc_score(y_test[:, j], pC[:, j]) for j in range(y_test.shape[1])]
mean_auc_cnn = np.mean(aucsC)  # computed once, reused for both the printout and the summary
print("auc cnn:" + str(mean_auc_cnn))
total_string += "auc cnn:" + str(mean_auc_cnn) + "\n"

In [None]:
#Hybrid
# Mean of the per-label ROC-AUC scores on the validation split.
pH = model_hybrid.predict(X_test)
# One score per label column; the label count is read from y_test instead of a hard-coded 6.
aucsH = [roc_auc_score(y_test[:, j], pH[:, j]) for j in range(y_test.shape[1])]
mean_auc_hybrid = np.mean(aucsH)  # computed once, reused for both the printout and the summary
print("auc hybrid:" + str(mean_auc_hybrid))
total_string += "auc hybrid:" + str(mean_auc_hybrid) + "\n"


In [None]:
# Run summary: chosen size preset, embedding width, then the per-model AUC lines collected so far.
print(f"size is: {size}\nembedding dimensions: {embedding_dim}\n{total_string}")

Write each model's test-set predictions to a CSV file in the competition's submission format.

In [None]:
#p = model_cnn.predict(test_data)
#predict = np.hstack((df_test.id[:, np.newaxis], p))
#subm = pd.DataFrame(predict, columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
#subm.to_csv('subm_CNN.csv', index = False)

In [None]:
#p = model_lstm.predict(test_data)
#predict = np.hstack((df_test.id[:, np.newaxis], p))
#subm = pd.DataFrame(predict, columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
#subm.to_csv('subm_LSTM2.csv', index = False)

In [None]:
#p = model_hybrid.predict(test_data)
#predict = np.hstack((df_test.id[:, np.newaxis], p))
#subm = pd.DataFrame(predict, columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
#subm.to_csv('subm_HYBRID2.csv', index = False)

#### Prints the model overview to file.

In [None]:
""""tf.keras.utils.plot_model( model_hybrid, to_file='model_hybrid.png', show_shapes=False, show_dtype=False,
    show_layer_names=True, rankdir='TB', expand_nested=False, dpi=48)"""

In [None]:
#first exp
# Hand-recorded scores from earlier runs, one value per hidden-unit size in x.
# NOTE(review): c / l / h are taken to be the CNN / LSTM / Hybrid mean AUCs, going by the
# variable initials and the three models trained above — confirm before relying on the legend.
x = [16,32,64,128,256]
c = [0.924,0.934,0.942,0.952,0.957]
l =[0.948,0.952,0.959,0.964,0.971]
h =[0.953,0.951,0.959,0.966,0.974]

fig, ax = plt.subplots()
ax.plot(x, c, "-o", label="CNN")
ax.plot(x, l, "-o", label="LSTM")
ax.plot(x, h, "-o", label="Hybrid")
ax.set_xlabel("Hidden units (size)")
ax.set_ylabel("Mean AUC")
ax.set_title("First experiment")
ax.legend()

plt.show()

In [None]:
#second exp
# Hand-recorded scores from earlier runs, one value per hidden-unit size in x.
# NOTE(review): c / l / h are taken to be the CNN / LSTM / Hybrid mean AUCs, going by the
# variable initials and the three models trained above — confirm before relying on the legend.
x = [16,32,64,128,256]
c = [0.928,0.933,0.944,0.951,0.962]
l =[0.950,0.954,0.955,0.971,0.975]
h =[0.945,0.951,0.958,0.963,0.973]

fig, ax = plt.subplots()
ax.plot(x, c, "-o", label="CNN")
ax.plot(x, l, "-o", label="LSTM")
ax.plot(x, h, "-o", label="Hybrid")
ax.set_xlabel("Hidden units (size)")
ax.set_ylabel("Mean AUC")
ax.set_title("Second experiment")
ax.legend()

plt.show()

## BERT MODEL IMPLEMENTATION
Following this tutorial
[Bert google colab](https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb#scrollTo=US_EAnICvP7f)


In [None]:
from official import nlp
from transformers import AutoTokenizer,TFAutoModel
from bert_tokenizer_v2 import FullTokenizer

import tensorflow_hub as hub

In [None]:
from official.nlp import bert
# Load the required submodules
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks
from official.nlp import optimization

In [None]:
# Learning-rate schedule and optimizer for the BERT classifier.
epochs = 4
# np.ceil returns a float; convert straight away so every step count handed to the
# optimizer schedule is an integer.
steps_per_epoch = int(np.ceil(len(X_train) / batch_size))
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)  # 10% of the training steps are used for warm-up

init_lr = 3e-5
optimizerr = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
# TF-Hub handles of the BERT encoder and its matching text-preprocessing model.
bert_model_name = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'
#bert_model_name = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1'
bert_preprocess_name = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# Aliases read by build_classifier_model.
tfhub_handle_encoder = bert_model_name
tfhub_handle_preprocess = bert_preprocess_name

print(f'BERT model selected           : {tfhub_handle_encoder}') 
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

In [None]:
# Stand-alone preprocessing layer.
# NOTE(review): build_classifier_model creates its own preprocessing layer and does not use this object.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
def build_classifier_model(num_labels=6, dropout_rate=0.25):
    """Build the BERT-based multi-label comment classifier.

    Raw strings pass through the TF-Hub preprocessing layer and then the
    trainable TF-Hub encoder (handles are read from the module-level
    ``tfhub_handle_preprocess`` / ``tfhub_handle_encoder``). The encoder's
    ``pooled_output`` goes through dropout into a sigmoid ``Dense`` head.

    Args:
        num_labels: number of independent sigmoid outputs. Defaults to 6.
        dropout_rate: dropout applied to the pooled encoder output before
            the classifier. Defaults to 0.25.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping a string input named
        ``text`` to ``num_labels`` sigmoid scores.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    # trainable=True: the encoder weights are fine-tuned together with the classifier head.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate, name='Dropout')(net)
    net = tf.keras.layers.Dense(num_labels, activation='sigmoid', name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [None]:
# Build the BERT classifier and compile it with the same loss and metric as the other models,
# using the optimizer created earlier in the BERT section.
bert_model = build_classifier_model()
bert_model.compile(loss='binary_crossentropy',
             optimizer=optimizerr,
             metrics=['AUC'])

bert_model.summary()

In [None]:
# The BERT training call is left commented out because it takes a long time to run.

print(f'Training model with {tfhub_handle_encoder}')

# The BERT model takes raw text, so split the cleaned sentences instead of the padded token ids.
# Separate names keep the integer-encoded X_train / X_test / y_train / y_test used by the
# models above intact, so the earlier cells can still be re-run after this one.
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(sentences, train_labels, test_size=0.2, random_state=42)

#history_bert = bert_model.fit(X_train_text,y_train_text,batch_size=batch_size,
        #                 epochs=epochs,validation_data=(X_test_text,y_test_text))

In [None]:
#print_auc_loss(history_bert)

In [None]:
#test_s = df_test['comment_text']
#test_s.apply(clean_text)

In [None]:
"""p = bert_model.predict(test_s)
aucs = []
for j in range(6):
    auc = roc_auc_score(y_test[:,j],p[:,j])
    aucs.append(auc)
print("auc bert:" +str(np.mean(aucs)))"""