# Import all packages ###

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from tqdm import tqdm
import pickle
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
import itertools
from sklearn.utils import shuffle
from tensorflow.keras import regularizers
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel,DistilBertTokenizer,DistilBertConfig

#  Preprocessing and cleaning functions  ###

In [2]:
def unicode_to_ascii(s):
    """Return ``s`` with combining marks removed.

    The string is decomposed with Unicode NFD normalization, then every
    character whose Unicode category is ``Mn`` is dropped. All other
    characters are kept in their original order.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

# Cache for the stopword set so the NLTK corpus is read once, not once per row.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use only."""
    # NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be
    # available; no nltk.download call is visible in this notebook.
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_stopwords_shortwords(w):
    """Drop English stopwords and words of one or two characters from ``w``.

    Args:
        w: Text to filter; it is split on whitespace.

    Returns:
        The surviving words joined by single spaces (an empty string when
        nothing survives).
    """
    stopword_set = _english_stopwords()
    # Set membership is O(1) per word; the original scanned a list for every word.
    clean_words = [word for word in w.split() if word not in stopword_set and len(word) > 2]
    return " ".join(clean_words)

def preprocess_sentence(w):
    """Normalize one raw text sample for tokenization.

    Steps, in order:
      1. lower-case, strip surrounding whitespace and remove accents
         (``unicode_to_ascii``);
      2. remove ``@mention`` tokens;
      3. replace every run of non-letter characters with a single space;
      4. drop stopwords and words shorter than three characters
         (``clean_stopwords_shortwords``).

    Args:
        w: Raw text string.

    Returns:
        The cleaned text as space-separated lower-case words.
    """
    w = unicode_to_ascii(w.lower().strip())
    # Mentions must be removed before the non-letter cleanup: previously this
    # substitution ran last, after '@' had already been replaced by a space,
    # so it could never match and user names leaked into the text.
    w = re.sub(r'@\w+', ' ', w)
    # Punctuation, digits, quotes and repeated spaces all collapse to one space.
    w = re.sub(r"[^a-zA-Z]+", " ", w)
    return clean_stopwords_shortwords(w)

#  Reading and Cleaning the Dataset  ###

In [21]:
# data_file='./data/spam.csv'
# data=pd.read_csv(data_file,encoding='ISO-8859-1')
# data.head()

# import pandas as pd
# data = pd.read_csv("/kaggle/input/cyberbull/dataset.csv")
# data


# Source CSV on Kaggle: semicolon-separated and Latin-1 encoded.
CYBER_BULLYING_CSV = "/kaggle/input/cyber-bulling-large-dataset/Cyber_Bullying.csv"

data = pd.read_csv(CYBER_BULLYING_CSV, sep=';', encoding='latin1')
data

Unnamed: 0,Text,Label
0,`- This is not ``creative``. Those are the di...,0.0
1,` :: the term ``standard model`` is itself le...,0.0
2,"True or false, the situation as of March 200...",0.0
3,"Next, maybe you could work on being less cond...",0.0
4,This page will need disambiguation.,0.0
...,...,...
448883,` These sources don't exactly exude a sense ...,0.0
448884,The Institute for Historical Review is a pee...,0.0
448885,:The way you're trying to describe it in this...,0.0
448886,== Warning == There is clearly a protection...,0.0


# Removing null values ###

In [23]:
# Show how many values are missing in each column. The original computed this
# (and isnull().any()) but discarded the result, so nothing was displayed.
print(data.isnull().sum())

# Keep only fully populated rows. Rebinding instead of inplace=True avoids
# mutating the frame that earlier cells displayed.
data = data.dropna()

# Change column type ###

In [24]:
# Cast both columns in one pass: the text column to str, the label column to int.
data = data.astype({'Text': str, 'Label': int})

# Shuffling the dataset ###

In [25]:
n_rows, n_cols = data.shape
print('File has {} rows and {} columns'.format(n_rows, n_cols))

# A fixed random_state makes the row order, and everything derived from it,
# reproducible across kernel restarts.
data = shuffle(data, random_state=42)

data.head()

File has 448873 rows and 2 columns


Unnamed: 0,Text,Label
266337,"Start Page This is a start page, so if anyone ...",0
375966,I'm not defending anything to you . Your bia...,1
324222,@GemmaNoon @OaklandElle scrubbing bubbles work...,0
362469,Unblock me please. I will do my job as a edi...,0
338343,To Sean (repeated from my Talk Page): It's O...,0


# Renaming the columns and applying the preprocessing function to the dataset

In [None]:
# Lower-case column names are what the later cells refer to.
data = data.rename(columns={'Text': 'text', 'Label': 'label'})

# The distinct label values; cleaning the text below does not touch them.
label_values = data['label'].unique()
print('Available labels: ', label_values)

# Clean every row's text with preprocess_sentence.
data['text'] = data['text'].map(preprocess_sentence)

# Number of output classes for the classifier head.
num_classes = len(label_values)

data.head()

Available labels:  [0 1]


# Save the preprocessed data ###

In [None]:
# Write the cleaned frame to the Kaggle working directory, without the index column.
PREPROCESSED_CSV = '/kaggle/working/mycsvfile.csv'
data.to_csv(PREPROCESSED_CSV, index=False)

In [None]:
# Show the dtype of both columns; only the last expression of a cell is
# displayed, so the original two-line form hid the 'text' dtype.
data[["text", "label"]].dtypes

#  Loading DistilBERT Tokenizer and the DistilBERT model  ###

In [None]:
# Name of the pretrained checkpoint the tokenizer is loaded from.
DBERT_CHECKPOINT = 'distilbert-base-uncased'
dbert_tokenizer = DistilBertTokenizer.from_pretrained(DBERT_CHECKPOINT)

In [None]:
# Name of the pretrained checkpoint the TensorFlow DistilBERT model is loaded from.
DBERT_CHECKPOINT = 'distilbert-base-uncased'
dbert_model = TFDistilBertModel.from_pretrained(DBERT_CHECKPOINT)

#  Preparing input for the model  ###

In [None]:
max_len = 32  # sequence length passed to the tokenizer and to the model's Input layers
sentences = data['text']
labels = data['label']

# Print the sizes explicitly: as a bare expression in the middle of the cell
# the tuple was computed and thrown away.
print(len(sentences), len(labels))

# Preview a few rows rather than rendering the whole frame.
data.head()

#  Create a basic NN model using DistilBERT embeddings to get the predictions  ###

In [None]:
def create_model():
    """Build the classifier: DistilBERT followed by a small dense head.

    Reads the module-level ``max_len``, ``num_classes`` and ``dbert_model``.

    The model takes two int64 inputs of shape ``(max_len,)`` — token ids and
    attention mask — and outputs ``num_classes`` softmax probabilities.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    inps = Input(shape = (max_len,), dtype='int64')
    masks= Input(shape = (max_len,), dtype='int64')
    # [0] selects the first element of the DistilBERT output and [:, 0, :]
    # keeps position 0 of every sequence — presumably the [CLS] token's hidden
    # state, since the tokenizer is called with add_special_tokens=True.
    dbert_layer = dbert_model(inps, attention_mask=masks)[0][:,0,:]
    dense = Dense(512,activation='relu',kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout= Dropout(0.5)(dense)
    pred = Dense(num_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
    model = tf.keras.Model(inputs=[inps,masks], outputs=pred)
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line.
    model.summary()
    return model
model=create_model()

# Prepare the model input 

In [None]:
input_ids = []
attention_masks = []

# Encode every sentence to exactly max_len token ids plus its attention mask.
for sent in sentences:
    dbert_inps = dbert_tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    input_ids.append(dbert_inps['input_ids'])
    attention_masks.append(dbert_inps['attention_mask'])

# Every row has the same length, so the lists convert to rectangular arrays.
input_ids = np.array(input_ids)
attention_masks = np.array(attention_masks)
labels = np.array(labels)

# Train Test split and setting up the loss function, accuracy and optimizer for the model. 

In [None]:
# Hold out 20% of the samples for validation; random_state fixes the split so
# results are reproducible across runs.
train_inp, val_inp, train_label, val_label, train_mask, val_mask = train_test_split(
    input_ids, labels, attention_masks, test_size=0.2, random_state=42
)

# Token ids and attention masks are integers, and the model's Input layers are
# declared with dtype int64, so convert to int64 rather than float32.
train_inp = tf.convert_to_tensor(train_inp, dtype=tf.int64)
val_inp = tf.convert_to_tensor(val_inp, dtype=tf.int64)

train_mask = tf.convert_to_tensor(train_mask, dtype=tf.int64)
val_mask = tf.convert_to_tensor(val_mask, dtype=tf.int64)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))


log_dir = 'dbert_model'                # TensorBoard log directory
model_save_path = './dbert_model.h5'   # checkpoint file for the best weights

callbacks = [
    # Save weights only, and only when validation loss reaches a new minimum.
    tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),
    keras.callbacks.TensorBoard(log_dir=log_dir),
]

# The model's last Dense layer applies softmax, so the loss receives
# probabilities. from_logits=True would apply softmax a second time and distort
# both the loss value and the gradients.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(loss=loss,optimizer=optimizer, metrics=[metric])

In [None]:
# Rebuilds the checkpoint and TensorBoard callbacks and compiles the model
# again with the loss, optimizer and metric defined in the previous cell.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_save_path,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=log_dir)
callbacks = [checkpoint_cb, tensorboard_cb]

model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

# Training

In [None]:
# Train for 20 epochs in mini-batches of 16, evaluating on the validation
# split each epoch; the callbacks handle checkpointing and TensorBoard logging.
history = model.fit(
    [train_inp, train_mask],
    train_label,
    batch_size=16,
    epochs=20,
    validation_data=([val_inp, val_mask], val_label),
    callbacks=callbacks,
)

In [None]:
# Plot training and validation loss per epoch.
plt.figure(dpi=100)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='val')
plt.legend(loc='best')
plt.xlabel('epochs')
# The model is compiled with sparse categorical cross-entropy, not MSE, so the
# axis is labelled with what is actually plotted.
plt.ylabel('loss')
plt.title('Model loss')
plt.show()

In [None]:
# Plot training and validation accuracy per epoch.
# `plt` comes from the imports cell at the top of the notebook. The in-cell
# `import keras` was removed because it rebound the name `keras` (bound to
# tensorflow.keras at the top) to the standalone package.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()