## The XLNet Kaggle dataset (the pretrained tokenizer and TensorFlow model files loaded below) can be found [here](https://www.kaggle.com/devkhant24/tensorlfow-xlnet).

In [None]:
# Importing libraries

import math
import os
import random
import numpy as np
import pandas as pd
import re
import unidecode
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.under_sampling import RandomUnderSampler


import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

In [None]:
# Defining constants

# tf.data AUTOTUNE constant; not referenced by the cells shown here
AUTO = tf.data.experimental.AUTOTUNE

# Directory handed to the `from_pretrained` loaders for both tokenizer and model
model_name = "../input/tensorlfow-xlnet"
# Fixed length (in tokens) of every encoded sequence and of the model inputs
Max_len = 512
# Mini-batch size passed to `model.fit`
Batch_size = 8


# Labelled training comments from the previous competition
data_prev_comp = "../input/toxic-comment/jigsaw-toxic-comment-train.csv"
# Unlabelled comments that are scored for the submission file
data_cur_comp = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"


def seed_everything(seed=123):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed shared by all generators. Defaults to 123, the
            value that used to be hard-coded.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Keras weight initialisation and dropout draw from TensorFlow's own
    # generator, which the stdlib and NumPy seeds above do not touch.
    tf.random.set_seed(seed)
    # Hash randomisation is configured at interpreter start-up, so this
    # mainly affects processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Ask TensorFlow's C++ backend to suppress INFO and WARNING log lines.
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'

# Seed once up front, before any sampling or splitting is done
seed_everything()

In [None]:
# Function for cleaning comments

# English stop words, loaded from NLTK on first use and reused afterwards.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def clean_data(sent):
    """Normalise one raw comment into lower-case, stop-word-free text.

    Steps, in order: flatten newlines, tabs and backslashes; strip HTML
    markup; drop URLs and " name.com" fragments; transliterate to ASCII;
    lower-case; replace everything except letters and digits with spaces;
    tokenise and remove English stop words.

    Args:
        sent: Raw comment text (``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    sent = (
        sent.replace('\\n', ' ')
        .replace('\n', ' ')
        .replace('\t', ' ')
        .replace('\\', ' ')
        .replace('. com', '.com')
    )
    sent = BeautifulSoup(sent, "html.parser").get_text(separator=" ")
    sent = re.sub(r'http\S+', '', sent)
    sent = re.sub(r"\ [A-Za-z]*\.com", " ", sent)
    sent = unidecode.unidecode(sent).lower()
    # In both character classes `$-,` is a range ('$' through ','). The first
    # pass blanks everything outside letters, digits and that punctuation,
    # the second pass blanks the punctuation itself, so only letters, digits
    # and spaces are left.
    sent = re.sub(r"[^a-zA-Z0-9:$-,()%.?!]+", ' ', sent)
    sent = re.sub(r"[:$-,()%.?!]+", ' ', sent)
    # Set membership is O(1) per token; the original scanned a list that was
    # re-read from the corpus on every call.
    stoplist = _english_stopwords()
    return " ".join(word for word in word_tokenize(sent) if word not in stoplist)

In [None]:
# Load the labelled comments of the previous competition and derive one target

df = pd.read_csv(data_prev_comp)
y_features = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# "severe_toxic" is doubled so that it weighs more in the combined score
df["severe_toxic"] = 2 * df["severe_toxic"]
label_total = df[y_features].sum(axis=1).astype(int)
# Dividing by the largest total makes the maximum target equal to 1
df["y"] = label_total / label_total.max()
df.drop(columns=y_features, inplace=True)
df.head()

In [None]:
# Drop 180000 randomly chosen rows whose target is 0 to reduce the imbalance.
# The fixed random_state keeps the selection identical when the cell is re-run.

zero_rows = df[df["y"] == 0]
df.drop(zero_rows.sample(180000, random_state=123).index, inplace=True)

In [None]:
# Count the rows per target value to see how imbalanced the dataset is

df["y"].value_counts()

In [None]:
# Clean every comment in place (the "comment_text" column is overwritten),
# then hold out 20% of the rows for validation.

df["comment_text"] = df["comment_text"].map(clean_data)

# Fixed random_state so the same split is produced every time the cell runs
xtrain, xtest, ytrain, ytest = train_test_split(
    df["comment_text"], df["y"], test_size=0.2, random_state=123
)

In [None]:
# Function for creating word encodings
def roberta_encode(texts, tokenizer, max_len=None):
    """Encode texts into fixed-length id and attention-mask arrays.

    Special tokens, the padding id and the padding side are all taken from
    ``tokenizer`` instead of being hard-coded, so the encoding matches
    whichever pretrained model the tokenizer belongs to (the RoBERTa ids
    ``0`` / ``2`` / pad ``1`` are not valid for other vocabularies).

    Args:
        texts: Sized iterable of strings (list, pandas Series, ...).
        tokenizer: Hugging Face tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids``, ``build_inputs_with_special_tokens``,
            ``num_special_tokens_to_add`` and ``pad_token_id``.
        max_len: Length of every encoded row. Defaults to the notebook-wide
            ``Max_len`` constant.

    Returns:
        Dict with ``'input_words_ids'`` and ``'input_mask'``, both int32
        arrays of shape ``(len(texts), max_len)``.
    """
    if max_len is None:
        max_len = Max_len

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Tokenizer without a pad token: any id works because padded
        # positions are masked out by the attention mask.
        pad_id = 0
    n_special = tokenizer.num_special_tokens_to_add()
    pad_left = getattr(tokenizer, "padding_side", "right") == "left"

    ct = len(texts)
    input_ids = np.full((ct, max_len), pad_id, dtype='int32')
    attention_mask = np.zeros((ct, max_len), dtype='int32')

    for k, text in enumerate(texts):
        tokens = tokenizer.tokenize(text)

        # Truncate so the special tokens still fit, then convert to ids
        body_ids = tokenizer.convert_tokens_to_ids(tokens[:max(max_len - n_special, 0)])

        # Let the tokenizer place its own special tokens
        ids = tokenizer.build_inputs_with_special_tokens(body_ids)[:max_len]
        n_ids = len(ids)

        # Real tokens sit at the end of the row for left-padding tokenizers
        # and at the start otherwise
        span = slice(max_len - n_ids, max_len) if pad_left else slice(0, n_ids)
        input_ids[k, span] = ids

        # Set to 1s in the attention input
        attention_mask[k, span] = 1

    return {
        'input_words_ids': input_ids,
        'input_mask': attention_mask,
    }

In [None]:
# Initializing Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Creating encodings for train and validation data
X_train = roberta_encode(xtrain, tokenizer)
X_validation = roberta_encode(xtest, tokenizer)

# The targets are fractions (scaled so the maximum is 1). Casting them to an
# integer type would truncate every value below 1 to 0, so keep them as floats.
Y_train = np.asarray(ytrain, dtype="float32")
Y_validation = np.asarray(ytest, dtype="float32")

In [None]:
# Function for building the scoring model on top of the pretrained transformer
def build_model():
    """Build and compile the Keras model used for scoring comments.

    The pretrained network stored at ``model_name`` receives the token ids
    and the attention mask. The first element of its output goes through
    dropout, is flattened, and feeds a 256-unit ReLU layer followed by a
    single sigmoid unit.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 1e-5), the
        "BinaryCrossentropy" loss and the "accuracy" metric.
    """
    ids_in = tf.keras.Input(shape=(Max_len,), dtype=tf.int32, name="input_words_ids")
    mask_in = tf.keras.Input(shape=(Max_len,), dtype=tf.int32, name="input_mask")

    backbone = TFAutoModel.from_pretrained(model_name)
    backbone_out = backbone(ids_in, attention_mask=mask_in)

    # backbone_out[0] is presumably the per-token hidden states -- TODO confirm
    hidden = tf.keras.layers.Dropout(0.2)(backbone_out[0])
    flat = tf.keras.layers.Flatten()(hidden)
    dense = tf.keras.layers.Dense(256, activation='relu')(flat)
    score = tf.keras.layers.Dense(1, activation="sigmoid")(dense)

    model = tf.keras.Model(inputs=[ids_in, mask_in], outputs=score)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    model.compile(optimizer=optimizer, loss="BinaryCrossentropy", metrics=["accuracy"])
    return model

In [None]:
# Build the network and fit it for one epoch, validating on the held-out split

model = build_model()

fit_options = dict(
    epochs=1,
    batch_size=Batch_size,
    validation_data=(X_validation, Y_validation),
)
history = model.fit(X_train, Y_train, **fit_options)

In [None]:
# Load the comments to score, then clean and encode them like the training data

test = pd.read_csv(data_cur_comp)

cleaned_text = test["text"].map(clean_data)
test["text"] = cleaned_text
X_test = roberta_encode(cleaned_text, tokenizer)

In [None]:
# Score the test comments and write the submission file

pred = model.predict(X_test)

# Start the table from the comment ids, then attach the predicted scores
final = pd.DataFrame({"comment_id": test["comment_id"]})
final["score"] = pred
final.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission table
final.head()