### I first trained DistilBERT separately and then used the trained model to predict the toxicity.

### The notebook for DistilBert is [here](https://www.kaggle.com/devkhant24/distilbert-for-jigsaw-comment) and the trained model can be found [here](https://www.kaggle.com/devkhant24/distilbert-jigsaw-comments).

In [None]:
# Importing libraries

import math
import os
import random
import numpy as np
import pandas as pd
import re
import unidecode
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.under_sampling import RandomUnderSampler


import tensorflow as tf
from transformers import DistilBertTokenizerFast, TFDistilBertModel
from transformers import Trainer, TrainingArguments
from tokenizers import BertWordPieceTokenizer

In [None]:
# Defining constants

# Number of token ids every comment is truncated/padded to.
Max_length = 512
# Directory of the pretrained checkpoint handed to `from_pretrained`.
model_name = "../input/distilbertbaseuncased"
# Examples per batch in the tf.data pipelines.
Batch_size = 8
# Lets tf.data pick the prefetch buffer size at runtime.
AUTO = tf.data.experimental.AUTOTUNE

# Labelled comments from the previous competition (training data).
train_prev_comp = "../input/toxic-comment/jigsaw-toxic-comment-train.csv"
# Comments to score for the current competition (test data).
test_cur_comp = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"


def seed_everything(seed=123):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and TensorFlow. TensorFlow has to be
    seeded as well, otherwise weight initialisation and ``tf.data``
    shuffling stay non-deterministic.

    Args:
        seed: Integer seed. Defaults to 123, the value that used to be
            hard-coded, so ``seed_everything()`` behaves as before.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # only influences processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Level 2 filters TensorFlow's INFO and WARNING log lines.
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'

# Apply the seeds once, before any sampling, splitting or model building below.
seed_everything()

In [None]:
def build_model(transformer, max_len=512):
    """Build and compile a binary classifier on top of a transformer encoder.

    Args:
        transformer: Keras-callable encoder (the notebook passes a
            ``TFDistilBertModel``) whose first output holds the per-token
            hidden states.
        max_len: Fixed number of token ids per example.

    Returns:
        A compiled ``tf.keras.Model`` that maps a ``(batch, max_len)`` int32
        tensor of token ids to one sigmoid probability per example.
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # The [CLS] token is the first position of every encoded sequence, so its
    # hidden state lives at index 0. Index 1 would pick the first word piece
    # of the comment instead of the sequence-level summary.
    cls_token = sequence_output[:, 0, :]
    out = tf.keras.layers.Dense(1, activation='sigmoid')(cls_token)

    model = tf.keras.Model(inputs=input_word_ids, outputs=out)
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode texts into a fixed-width array of token ids.

    The tokenizer is configured in place to truncate and pad every sequence
    to ``maxlen``; the texts are then encoded ``chunk_size`` at a time with a
    progress bar.

    Args:
        texts: Sliceable sequence of strings.
        tokenizer: Tokenizer exposing ``enable_truncation``,
            ``enable_padding`` and ``encode_batch``.
        chunk_size: Number of texts passed to each ``encode_batch`` call.
        maxlen: Length every encoded sequence is truncated/padded to.

    Returns:
        ``np.ndarray`` with one row of token ids per input text.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    encoded_rows = []
    chunk_starts = range(0, len(texts), chunk_size)
    for start in tqdm(chunk_starts):
        batch = tokenizer.encode_batch(texts[start:start + chunk_size])
        encoded_rows += [encoding.ids for encoding in batch]

    return np.array(encoded_rows)

In [None]:
# Function for cleaning comments

_ENGLISH_STOPWORDS = None  # populated lazily by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_data(sent):
    """Normalise a raw comment into lower-case, stop-word-free plain text.

    Strips escaped/real newlines, tabs and backslashes, HTML markup, URLs and
    bare ``.com`` mentions, transliterates to ASCII, lower-cases, replaces
    punctuation with spaces and finally drops English stop words.

    Args:
        sent: Raw comment text. Non-string values (e.g. the NaN pandas uses
            for a missing comment) are treated as an empty comment.

    Returns:
        The cleaned comment as a single space-separated string.
    """
    if not isinstance(sent, str):
        return ""

    sent = sent.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')
    # Drop HTML tags but keep the text they wrap.
    soup = BeautifulSoup(sent, "html.parser")
    sent = soup.get_text(separator=" ")
    remove_https = re.sub(r'http\S+', '', sent)
    # Remove bare domain mentions such as " example.com".
    sent = re.sub(r"\ [A-Za-z]*\.com", " ", remove_https)
    sent = unidecode.unidecode(sent)
    sent = sent.lower()
    # Inside the character class, "$-," is a range ('$' through ','). The
    # second pattern removes exactly the punctuation the first one keeps, so
    # after both substitutions only letters, digits and spaces remain.
    sent = re.sub(r"[^a-zA-Z0-9:$-,()%.?!]+", ' ', sent)
    sent = re.sub(r"[:$-,()%.?!]+", ' ',sent)
    # The stop-word set is cached: loading it for every comment re-reads the
    # corpus each time and a list makes every membership test linear.
    stoplist = _english_stopwords()
    sent = [word for word in word_tokenize(sent) if word not in stoplist]
    sent = " ".join(sent)

    return sent

In [None]:
# Reading train file from previous competition

df = pd.read_csv(train_prev_comp)

# A comment is labelled toxic (y = 1) as soon as any of the six label columns
# is set; the id and the individual labels are not needed afterwards.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
any_label_set = df[label_columns].sum(axis=1) > 0
df["y"] = any_label_set.astype(int)
df.drop(columns=["id"] + label_columns, inplace=True)
df.head()

In [None]:
# Inspect the class counts of the binary target: the dataset is imbalanced,
# which is why the next cell undersamples it.

df["y"].value_counts()

In [None]:
# Balancing the dataset

# The sampler is given a 2-D feature matrix, hence the single-column reshape.
X = np.array(df["comment_text"].values).reshape(-1, 1)
y = np.array(df["y"].values)

rus = RandomUnderSampler(random_state=0)
x, y = rus.fit_resample(X, y)

# Back to a flat array of comments, paired with the resampled labels.
x = x.flatten()
df = pd.DataFrame({"text": x, "target": y})


# The classes are now balanced

df["target"].value_counts()

In [None]:
# Cleaning the comments in place (the "text" column is overwritten)

df["text"] = df["text"].map(clean_data)

# Hold out 20% of the cleaned comments for validation
x = df["text"].tolist()
y = df["target"].tolist()
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

In [None]:
# Initializing Tokenizer

# Load the pretrained tokenizer and save its files to the working directory;
# the next statement expects this to produce 'vocab.txt'.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
tokenizer.save_pretrained('.')

# Rebuild a `tokenizers` WordPiece tokenizer from that vocabulary; fast_encode
# relies on its encode_batch API.
# NOTE(review): lowercase=False with an "uncased" checkpoint presumably relies
# on clean_data lower-casing every text first — confirm.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

In [None]:
# Creating encoding for train and validation
# Each call returns an array with one row of token ids per comment,
# truncated/padded to Max_length.
train_encodings = fast_encode(xtrain, fast_tokenizer, maxlen = Max_length)
val_encodings = fast_encode(xtest, fast_tokenizer, maxlen = Max_length)

In [None]:
# Training pipeline: shuffle first, then repeat, so that each pass over the
# data is shuffled on its own instead of mixing examples across passes. The
# dataset is infinite; model.fit bounds an epoch through steps_per_epoch.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_encodings, ytrain))
    .shuffle(1024)
    .repeat()
    .batch(Batch_size)
    .prefetch(AUTO)
)

# Validation pipeline: one finite, ordered pass. Repeating it would make the
# dataset infinite, and validation would then never terminate unless
# validation_steps is passed to model.fit; shuffling does not change metrics.
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_encodings, ytest))
    .batch(Batch_size)
    .prefetch(AUTO)
)

In [None]:
# Load the pretrained encoder from the local checkpoint directory.
transformer_layer = TFDistilBertModel.from_pretrained(model_name)

# Add the sigmoid classification head, compile, and print the layer summary.
model = build_model(transformer_layer, max_len = Max_length)
model.summary()

In [None]:
# steps_per_epoch counts batches, not examples: one epoch should be a single
# pass over the training data. Using len(xtrain) directly would run
# Batch_size passes per epoch.
n_steps = math.ceil(len(xtrain) / Batch_size)
# Bound validation explicitly so it terminates even when the validation
# dataset repeats indefinitely.
n_val_steps = math.ceil(len(xtest) / Batch_size)

predictor = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=val_dataset,
    validation_steps=n_val_steps,
    epochs=3
)

In [None]:
# Inference reuses the Keras model fine-tuned above together with the
# WordPiece tokenizer it was trained with. Loading a Roberta
# sequence-classification model here would fail (those classes are not
# imported) and would discard the trained weights.
# Truncation/padding are (re-)enabled so single-text encoding yields exactly
# Max_length ids, matching the model's input shape.
fast_tokenizer.enable_truncation(max_length=Max_length)
fast_tokenizer.enable_padding(length=Max_length)


# Function to get predictions
def get_prediction(text):
    """Return the predicted toxicity probability for one raw comment.

    Args:
        text: Raw comment; it is cleaned with ``clean_data`` exactly like the
            training data before being tokenized.

    Returns:
        The model's sigmoid output for the comment as a Python float.
    """
    text = clean_data(text)
    # Shape (1, Max_length): a batch holding the single encoded comment.
    ids = np.array([fast_tokenizer.encode(text).ids], dtype=np.int32)
    prob = model(ids, training=False)
    # The head is a single sigmoid unit, so the output already is the
    # probability of the toxic class; no softmax is needed.
    return float(prob.numpy()[0, 0])

In [None]:
# Reading given test dataset
# Storing predicted values in score column

# `test_cur_comp` is the path constant defined at the top of the notebook;
# the name `file_path` used here before is never defined.
test = pd.read_csv(test_cur_comp)

test["score"] = test["text"].map(get_prediction)

In [None]:
# Making submission file

# Only the comment id and its predicted score are written out.
final = pd.DataFrame({
    "comment_id": test["comment_id"],
    "score": test["score"],
})
final.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
final.head()