# Package imports 

In [None]:
# Data wrangling 
import pandas as pd 

# Fasttext embeddings
import fasttext

# Importing regex 
import re 

# Array math 
import numpy as np

# OS traversal
import os 

# Zip files
import zipfile

# Plotting 
import matplotlib.pyplot as plt

# Iteration tracking
from tqdm import tqdm

# Machine learning 
import xgboost as xgb

# Time tracking 
import time

# Data scalers
from sklearn.preprocessing import MinMaxScaler

# Hp parameter search 
from sklearn.model_selection import ParameterGrid

# Data reading 

In order to create a good classifier to evaluate the toxicity of a comment, we need to gather as many previously labeled observations as possible. Luckily, there are numerous data sources that provide a label for toxicity in one way or another. In this notebook, I shall use data from: 

* https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data 
* This competition's data

The goal is to create a classifier $f$:

$$f: \mathbb{X} \rightarrow \mathbb{Y}$$

Where 

$\mathbb{X}$ - comment (text)

$\mathbb{Y}$ - toxicity score ($\in$ $\mathbb{R}$)

In [None]:
# Competition data lives in a single Kaggle input folder
_input_dir = '/kaggle/input/jigsaw-toxic-severity-rating/'

# Comments that have to be scored for the submission
_input_file = os.path.join(_input_dir, 'comments_to_score.csv')
d = pd.read_csv(filepath_or_buffer=_input_file)

# Validation data used later to compare less toxic and more toxic comments
_val_file = os.path.join(_input_dir, 'validation_data.csv')
dval = pd.read_csv(filepath_or_buffer=_val_file)

In [None]:
# Reporting the number of comments to score and the available columns
_overview = f"Number of comments:\n{d.shape[0]}\nColumns:\n{d.columns.tolist()}"
print(_overview)

In [None]:
# Drawing one random row to get a feel for the raw comment text
_random_row = d.sample(n=1)
print(_random_row['text'].tolist())

## Inspecting the validation data set 

In [None]:
# Size of the validation set followed by ten randomly drawn rows
print(
    f"Number of observations in validation set:\n{dval.shape[0]}",
    f"Sample of data:\n{dval.sample(10)}",
    sep="\n",
)

## Additional data 

### Jigsaw toxicity classification challenge

In [None]:
# Folder and file of the toxic comment classification challenge data
_aux_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/'
_aux_file_path = os.path.join(_aux_path, 'train.csv')

# Loading the labeled training comments into memory
_d_jigsaw = pd.read_csv(filepath_or_buffer=_aux_file_path)

In [None]:
# Reporting the dimensions and the column names of the auxiliary data
_jigsaw_overview = f"Shape of data:\n{_d_jigsaw.shape}\nColumns:\n{_d_jigsaw.columns.tolist()}"
print(_jigsaw_overview)

In [None]:
# First ten rows of the auxiliary data
print(_d_jigsaw.iloc[:10])

# Loading the fasttext embeddings 

The embeddings can be downloaded using kaggle's **add data** feature. The link to embeddings is: https://www.kaggle.com/kambarakun/fasttext-pretrained-word-vectors-english

In [None]:
# Location of the pretrained fastText binary inside the Kaggle input folder
_embeddings_path = '/kaggle/input/fasttext-pretrained-word-vectors-english/wiki.en.bin'

# Loading the model that is used to turn text into vectors
embeddings = fasttext.load_model(_embeddings_path)

# Text cleaning

The text cleaning function will be used throughout all the data sources for consistency. 

In [None]:
# Matches every maximal run of characters that are not ASCII letters or digits
_NON_ALNUM_RUN = re.compile(r'[^A-Za-z0-9]+')


def clean_text(text: str) -> str:
    """
    Function to clean the text for embedding creation

    The text is lower-cased and every run of characters other than ASCII
    letters and digits (punctuation, underscores, newlines and any other
    whitespace, non-English letters) is replaced by a single space.

    Leading and trailing separators are not stripped: each of them
    collapses to one space at the respective end of the result.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        Lower-cased text in which tokens made of a-z and 0-9 are separated
        by single spaces.
    """
    # A single substitution is enough: newlines, punctuation and repeated
    # whitespace are all non-alphanumeric, so every such run collapses to
    # one space in the same pass
    return _NON_ALNUM_RUN.sub(' ', text.lower())

# Running every raw text column through the shared cleaning function and
# storing the result in a new column of the same dataframe
for _frame, _raw_column, _clean_column in (
    (d, 'text', 'clean_text'),
    (dval, 'less_toxic', 'clean_less_toxic'),
    (dval, 'more_toxic', 'clean_more_toxic'),
):
    _frame[_clean_column] = [clean_text(_raw) for _raw in _frame[_raw_column]]

# Model for jigsaw classification challenge 

## Creating the Y variable 

In this competition, it is important to measure the scale of toxicity. It is important to distinguish between low toxicity and high toxicity. 

To encompass that logic, from the columns 
* toxic
* severe_toxic
* obscene
* threat
* insult
* identity_hate

I will create a column $Y$ that is the sum of all of the above columns. The higher the sum - the higher the toxicity. 

In [None]:
# Label columns that are added up to form the toxicity score
toxicity_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Y is the row-wise sum of the label columns (presumably 0/1 flags, so Y
# would count the labels set for a comment — verify against the data).
# The vectorized sum gives the same values as a row-by-row apply without
# running a Python-level function for every row
_d_jigsaw['Y'] = _d_jigsaw[toxicity_columns].sum(axis=1)

In [None]:
# Counting the comments per Y value and turning the counts into shares
agg = _d_jigsaw.groupby("Y", as_index=False).size()
_n_comments_total = agg['size'].sum()
agg['share_in_data'] = agg['size'] / _n_comments_total

# Bar chart with the share of comments at every value of Y
plt.bar(agg['Y'], agg['share_in_data'])
plt.title(f"Total data points: {agg['size'].sum()}")
plt.show()

In [None]:
# Target values as a plain Python list
Y = _d_jigsaw['Y'].to_numpy().tolist()

## Creating the X matrix for training

The term embedding means to convert text into a number vector. In our case, the vector will have 300 coordinates: 

$$f(text) \rightarrow \mathbb{R}^{300} = [x_{1}, x_{2}, ..., x_{300}]$$

The unit of text is a comment. Thus, no matter the number of words in the comment, the strategy is to somehow represent the text as a vector of 300 coordinates.

The basic strategy is the following:

* Get the embeddings of each of the words in the comment.
* Average the obtained word embeddings coordinate by coordinate. 

For example, if a comment has $k$ words, then the initial matrix of all word embeddings is $\mathbb{M}_{k \times 300}$. To get the final embedding of the comment, we average the matrix column-wise. Thus, the final embedding $y$ is: 

$$y = [\dfrac{1}{k} \sum_{i=1}^{k}M[i, 1], \dfrac{1}{k} \sum_{i=1}^{k}M[i, 2],..., \dfrac{1}{k} \sum_{i=1}^{k}M[i, 300]]$$

In [None]:
# Normalising the raw comments with the shared cleaning function
_raw_comments = tqdm(_d_jigsaw['comment_text'], desc='Cleaning comments', total=len(_d_jigsaw))
_d_jigsaw['comment_text_clean'] = list(map(clean_text, _raw_comments))

# One sentence vector per cleaned comment, stacked row by row and reshaped
# into the 300-column feature matrix used for training
_clean_comments = tqdm(_d_jigsaw['comment_text_clean'], desc='Creating embeddings', total=len(_d_jigsaw))
X = np.array(list(map(embeddings.get_sentence_vector, _clean_comments))).reshape(-1, 300)

## Grid search for best hyper parameters

We will treat this as a regression problem. 

The ML algorithm of choice is **xgboost.** 

We will test out the model based on the provided validation set. 

The strategy to evaluate the model is the following: 

* Get the score for less toxic 
* Get the score for more toxic 
* Calculate the number of correct predictions

In [None]:
# Embedding both cleaned validation columns; each result is reshaped into a
# 300-column matrix with one row per validation row
less_toxic_embedding = np.array(
    list(map(embeddings.get_sentence_vector, dval['clean_less_toxic']))
).reshape(-1, 300)
more_toxic_embedding = np.array(
    list(map(embeddings.get_sentence_vector, dval['clean_more_toxic']))
).reshape(-1, 300)

In [None]:
# Sentence vectors of the cleaned comments to score, one 300-column row
# per comment
submission_matrix = np.array(
    [embeddings.get_sentence_vector(comment) for comment in d['clean_text']]
).reshape(-1, 300)

In [None]:
# Defining a list of hyperparameters
# NOTE(review): 'gpu_hist' needs a GPU-enabled runtime and is deprecated in
# XGBoost >= 2.0 in favour of tree_method='hist' with device='cuda' —
# confirm against the installed version before changing it
hp_dict = {
    "objective": ['reg:squarederror'],
    "tree_method": ['gpu_hist'],
    "max_depth": [4, 5, 6, 7, 8],
    'n_estimators': [200, 400, 600, 800]
}

# Creating the hp grid
hp_grid = ParameterGrid(hp_dict)

# Max score tracker
max_score = 0

# Best hp dictionary
best_hp = {}

# Predictions for the submission made by the best model found so far
# (stays empty if no configuration scores above 0)
score = []

for hp in hp_grid:

    # Initiating the empty model
    reg = xgb.XGBRegressor(**hp)

    # Fitting on data
    reg.fit(X, Y)

    # Predicting both columns of the validation set
    less_toxic_hat = reg.predict(less_toxic_embedding)
    more_toxic_hat = reg.predict(more_toxic_embedding)

    # Share of validation rows where the "more toxic" comment gets a
    # strictly higher prediction than the "less toxic" one (ties count as
    # misses); the element-wise comparison replaces the index loop
    current_score = np.mean(less_toxic_hat < more_toxic_hat)

    # Checking if this is the highest pairwise accuracy so far
    if current_score > max_score:
        max_score = current_score
        best_hp = hp

        print(f"New best hp parameters found:\n{best_hp}\nBest score: {round(max_score, 3)}")

        # Applying the best found model
        score = reg.predict(submission_matrix)

# Creating the final submission 

In [None]:
# Attaching the predictions kept from the grid search to the comments
d['score'] = score

# Ordering the comments from the lowest to the highest predicted score
d.sort_values(by=['score'], ascending=True, inplace=True)

# Final submission file 

In [None]:
# Most "light" comments: the first ten rows of d
d.iloc[:10]

In [None]:
# Most "severe" comments: the last ten rows of d
d.iloc[-10:]

In [None]:
# Writing only the comment id and its score to the submission file,
# leaving out the dataframe index
d.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)