# Package imports 

In [None]:
# Data wrangling 
import pandas as pd 

# Fasttext embeddings
import fasttext

# Fasttext utilities
import fasttext.util

# Importing regex 
import re 

# Array math 
import numpy as np

# OS traversal
import os 

# Zip files
import zipfile

# Plotting 
import matplotlib.pyplot as plt

# Iteration tracking
from tqdm import tqdm

# Machine learning 
import xgboost as xgb

# Time tracking 
import time

# Data scalers
from sklearn.preprocessing import MinMaxScaler

# Vectorization of text 
from sklearn.feature_extraction.text import CountVectorizer

# Hp parameter search 
from sklearn.model_selection import ParameterGrid

# Text cleaning function 

The preprocessing of text is key in many NLP tasks. The function that cleans the data is defined below. 

In [None]:
def clean_text(text: str) -> str:
    """
    Normalise a raw comment before it is vectorised.

    The text is lower-cased and every run of characters that is not an
    ASCII letter or digit (punctuation, new lines, any other whitespace,
    non-English symbols) is replaced by a single space.

    Parameters
    ----------
    text : str
        Raw comment. A missing value (``None`` or a float ``NaN``, which is
        how pandas represents an empty text cell) is treated as an empty
        comment; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The cleaned text, made only of ``a-z``, ``0-9`` and single spaces.
        A separator run at the very start or end of the input is kept as
        one space (the result is not stripped).
    """
    # Missing values would otherwise crash on `.lower()`
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if not isinstance(text, str):
        text = str(text)

    # Lowering
    text = text.lower()

    # One pass is enough: new lines, punctuation, special characters and
    # repeated whitespace are all "not a letter or digit", so collapsing
    # every such run into one space gives the same result as handling
    # them one after another
    return re.sub(r'[^A-Za-z0-9]+', ' ', text)

# Data reading 

In order to create a good classifier to evaluate the toxicity of a comment, we need to gather as many previously labeled observations as possible. Luckily, there are numerous data sources that provide a label for toxicity in one way or another. In this notebook, I shall use data from: 

* https://www.kaggle.com/julian3833/jigsaw-toxic-comment-classification-challenge
* https://www.kaggle.com/julian3833/jigsaw-unintended-bias-in-toxicity-classification
* This competition's data

The goal is to create a classifier $f$:

$$f: \mathbb{X} \rightarrow \mathbb{Y}$$

Where 

$\mathbb{X}$ - comment (text)

$\mathbb{Y}$ - toxicity score ($\in$ $\mathbb{R}$)

The output score is a real number, thus we need to convert the data in all the datasets to be applicable for a **regression** ML algorithm.

## Main data

In [None]:
# Folder that holds the competition files
_input_dir = '/kaggle/input/jigsaw-toxic-severity-rating/'

# Full paths of the file to be scored and of the validation pairs
_input_file, _val_file = (
    os.path.join(_input_dir, file_name)
    for file_name in ('comments_to_score.csv', 'validation_data.csv')
)

# Loading both files: comments to score first, validation data second
d, dval = pd.read_csv(_input_file), pd.read_csv(_val_file)

In [None]:
# Number of rows and the column names of the file that has to be scored
print(f"Number of comments:\n{d.shape[0]}\nColumns:\n{d.columns.tolist()}")

In [None]:
# Eyeballing one randomly drawn raw comment; no random_state is passed
# to sample(), so a different comment may be shown on every run
print(d.sample(1)['text'].tolist())

## Inspecting the validation data set 

In [None]:
# Size of the validation set, followed by 10 randomly drawn rows of it
# (no random_state is passed, so the rows may differ between runs)
print(f"Number of observations in validation set:\n{dval.shape[0]}")
print(f"Sample of data:\n{dval.sample(10)}")

In [None]:
# Cleaning the main data: each raw text column gets a cleaned counterpart
for _frame, _raw_col, _clean_col, _label in (
    (d, 'text', 'clean_text', 'Cleaning the main submission file text'),
    (dval, 'less_toxic', 'clean_less_toxic', 'Cleaning the less toxic text in validation'),
    (dval, 'more_toxic', 'clean_more_toxic', 'Cleaning the more toxic text in validation'),
):
    # The progress bar wraps the raw column so every cleaned row is tracked
    _progress = tqdm(_frame[_raw_col], desc=_label, total=len(_frame))
    _frame[_clean_col] = [clean_text(x) for x in _progress]

## Additional data 

### Jigsaw toxicity classification challenge

In [None]:
# Path to data: folder of the Jigsaw toxic comment classification
# challenge dataset and the training file inside it
aux_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/'
aux_file_path = os.path.join(aux_path, 'train.csv')

# Reading the data into the first auxiliary frame
daux = pd.read_csv(aux_file_path)

In [None]:
# Shape (rows, columns) and the column names of the first auxiliary data set
print(f"Shape of data:\n{daux.shape}\nColumns:\n{daux.columns.tolist()}")

In [None]:
# First 10 rows of the first auxiliary data set; left as the cell's last
# expression so the notebook renders it as a table instead of plain text
daux.head(10)

## Creating input for model 

In this competition, it is important to measure the scale of toxicity, i.e. to distinguish between low toxicity and high toxicity. 

To capture that logic, from the columns 
* toxic
* severe_toxic
* obscene
* threat
* insult
* identity_hate

I will create a column $Y$ that is the sum of all of the above columns. The higher the sum - the higher the toxicity. 

At the end, we will scale the Y variable to be in the range of [0, 1]. 

In [None]:
# Cleaning every comment of the auxiliary data while showing a progress bar
comment_progress = tqdm(daux['comment_text'], desc='Preprocesing the comments', total=len(daux))
daux['comment_text_clean'] = [clean_text(comment) for comment in comment_progress]

In [None]:
# Defining the toxicity level columns
toxicity_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Summing the label columns of every comment. Summing across the columns
# in one vectorised call gives the same values as a row-by-row apply,
# without running a Python function once per row
daux['Y'] = daux[toxicity_columns].sum(axis=1)

# Inspecting the distribution of the Y: number of comments per value of Y
# and the share of the whole data set that each value represents
agg = daux.groupby("Y", as_index=False).size()
agg['share_in_data'] = agg['size'] / agg['size'].sum()

plt.bar(x=agg['Y'], height=agg['share_in_data'])
plt.xlabel("Y (sum of the toxicity label columns)")
plt.ylabel("Share of comments")
plt.title(f"Total data points: {agg['size'].sum()}")
plt.show()

In [None]:
# Rescaling Y with a min-max scaler; the scaler expects a 2-D input,
# hence the reshape of the values into a single column
scaler = MinMaxScaler()
_y_as_column = daux['Y'].values.reshape(-1, 1)
daux['Y'] = scaler.fit_transform(_y_as_column)

# Keeping just the cleaned text and the target
daux = daux.loc[:, ['comment_text_clean', "Y"]].copy()

## Jigsaw unintended bias competition 

In [None]:
# Path to data: folder of the Jigsaw unintended bias dataset and the
# training file inside it (aux_path / aux_file_path are reused from the
# previous data source and overwritten here)
aux_path = '/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/'
aux_file_path = os.path.join(aux_path, 'train.csv')

# Reading the data; only the comment text and the target column are loaded
daux2 = pd.read_csv(aux_file_path, usecols=['comment_text', 'target'])

In [None]:
# Shape (rows, columns) and the column names of the second auxiliary data set
print(f"Shape of data:\n{daux2.shape}\nColumns:\n{daux2.columns.tolist()}")

In [None]:
# First 10 rows of the second auxiliary data set; left as the cell's last
# expression so the notebook renders it as a table instead of plain text
daux2.head(10)

### Creating model digestable input 

The target in this dataset fits perfectly with the target from the previous auxiliary data set. The only thing needed is to do some text cleaning. 

In [None]:
# Cleaning every comment of the second auxiliary source behind a progress bar
extra_progress = tqdm(daux2['comment_text'], desc='Cleaning additional data source text', total=len(daux2))
daux2['comment_text_clean'] = [clean_text(comment) for comment in extra_progress]

In [None]:
# Exposing the target under the common name Y, then keeping only the
# cleaned text and that target, in this order
daux2 = daux2.rename(columns={'target': 'Y'})[['comment_text_clean', 'Y']]

## Merging all the datasources data 

In [None]:
# Concatenating the dataframes
# NOTE(review): the concatenation below is commented out, so only `daux`
# is used for training; `daux2` is loaded and cleaned in the cells above
# but never reaches the model - confirm this is intentional
#train = pd.concat([daux, daux2]).copy()
train = daux.copy()

# Deleting the big objects from memory; since the names are removed, this
# cell cannot be re-run without first re-running the cells that create them
del daux, daux2

In [None]:
# Size of the training set and the percentage of rows whose target Y is
# exactly 0, rounded to 3 decimals
print(f"Number of training observations: {train.shape[0]}")
print(f"Share of 'neutral' comments in the dataset: {round(np.sum(train['Y'] == 0) * 100 / train.shape[0], 3)}%")

# Creating sparse matrix for training 

In order to convert text to numbers for any ML algorithm to work with, we will use the bag-of-words representation of the texts. 

Let's say we have $n$ texts.

For that representation, we must define the maximum number of words $k$ that we shall use. 

The input matrix $\mathbb{X}$ will then have the following shape: 

$$\mathbb{X}_{n \times k}$$

Each entry in the matrix is the number of times the corresponding term (a single word or a sequence of up to 4 consecutive words) appears in the text. 

In [None]:
# Settings of the bag-of-words representation
vect_dict = dict(
    max_features=40000,
    ngram_range=(1, 4),
    binary=False,
    stop_words='english',
)

# Building the count vectorizer from these settings
vectorizer = CountVectorizer(**vect_dict)

# Learning the vocabulary from the cleaned training comments
vectorizer.fit(train['comment_text_clean'])

In [None]:
# Creating the sparse X matrix: the fitted vectorizer turns every cleaned
# training comment into one row of term counts
X = vectorizer.transform(train['comment_text_clean'])

# Extracting the Y variable as an array, taken from the same frame (and
# therefore in the same row order) as X
Y = train['Y'].values

## Training the xgboost regression model 

We will treat this as a regression problem. 

The ML algorithm of choice is **xgboost.** 

To search for the optimal parameters, we will use a simple grid search. 

To evaluate which parameters are the best, we will use the added validation set. 

* Get the score for less toxic 
* Get the score for more toxic 
* Calculate the number of correct predictions

In [None]:
# Bag-of-words matrices for both sides of every validation pair,
# built with the vectorizer fitted on the training comments
bow_less_toxic, bow_more_toxic = (
    vectorizer.transform(dval[column])
    for column in ('clean_less_toxic', 'clean_more_toxic')
)

In [None]:
# Creating the BOW matrix for the final submission: the cleaned comments
# that have to be scored, encoded with the same fitted vectorizer
bow_submission = vectorizer.transform(d['clean_text'])

In [None]:
# Defining a list of hyperparameters
# NOTE(review): 'gpu_hist' needs a GPU runtime, and recent XGBoost releases
# deprecate it in favour of tree_method='hist' with device='cuda' -
# confirm against the installed XGBoost version
hp_dict = {
    "objective": ['reg:squarederror'],
    "tree_method": ['gpu_hist'],
    "max_depth": [4, 6, 8],
    'n_estimators': [200, 400, 600, 800]
}

# Creating the hp grid (every combination of the values above)
hp_grid = ParameterGrid(hp_dict)

# Max score tracker. Starting below any attainable score guarantees that
# the first fitted model is accepted, so `best_hp` and `score` are always
# defined once the loop has finished
max_score = -np.inf

# Best hp dictionary
best_hp = {}

for hp in hp_grid:
    # Initiating the empty model
    reg = xgb.XGBRegressor(**hp)

    # Fitting on data
    reg.fit(X, Y)

    # Predicting the toxicity of both comments of every validation pair
    less_toxic_hat = reg.predict(bow_less_toxic)
    more_toxic_hat = reg.predict(bow_more_toxic)

    # Share of validation pairs in which the comment labelled as more
    # toxic receives the strictly higher predicted score
    current_score = np.mean(less_toxic_hat < more_toxic_hat)

    # Checking if this is the highest pairwise accuracy so far
    if current_score > max_score:
        max_score = current_score
        best_hp = hp

        print(f"New best hp parameters found:\n{best_hp}\nBest score: {round(max_score, 3)}")

        # Scoring the submission comments with the best model found so far;
        # done here because `reg` is replaced by the next candidate
        score = reg.predict(bow_submission)

# Creating the final submission file 

In [None]:
# Attaching the predictions of the best model to the comments and ordering
# the comments from the lowest to the highest predicted score
d = d.assign(score=score).sort_values(by='score')

In [None]:
# Most "light" comments: d is sorted in ascending order of score above,
# so the first 10 rows carry the lowest predicted scores
d.head(10)

In [None]:
# Most "severe" comments: d is sorted in ascending order of score above,
# so the last 10 rows carry the highest predicted scores
d.tail(10)

In [None]:
# Writing the comment ids with their predicted scores to the submission
# file, without the frame's index
d.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)