In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
from xgboost import XGBRegressor 
import optuna
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import sys
import pandas as pd
from datasets import Dataset
from transformers import PreTrainedTokenizerFast
from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
)



In [None]:
# --- Input files -----------------------------------------------------------
RUDDIT_DATA = "/kaggle/input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv"
CLASSIFY_DATA = "/kaggle/input/comment-classify/train.csv"

# --- Run settings ----------------------------------------------------------
SEED = 41        # seed for random operations (e.g. the undersampling in get_data_classify)
DATA = 'ruddit'  # which dataset to load: 'ruddit' or 'classify'
n_trials = 20    # number of trials each Optuna study runs

# Data extraction from file 

In [None]:
def get_ruddit_data():
    """Load the Ruddit comments and train a WordPiece tokenizer on them.

    Reads the CSV at ``RUDDIT_DATA``, keeps the ``txt`` and
    ``offensiveness_score`` columns (renamed to ``text`` and ``y``), min-max
    scales ``y``, and then drops ``[deleted]`` comments, duplicate rows and
    rows with missing values. A lowercasing, BERT-style WordPiece tokenizer
    with a vocabulary capped at 25,000 entries is trained from scratch on the
    remaining comments.

    Returns:
        tuple: ``(comments, labels, tokenizer)`` where ``comments`` is the
        ``text`` Series, ``labels`` is the scaled ``y`` Series and
        ``tokenizer`` is a ``PreTrainedTokenizerFast`` wrapping the trained
        tokenizer.
    """
    # Use the configured absolute path instead of a second, hard-coded
    # relative copy of it, so the loader does not depend on the working dir.
    df_ruddit = pd.read_csv(RUDDIT_DATA)
    df_ruddit = df_ruddit[['txt', 'offensiveness_score']].rename(
        columns={'txt': 'text', 'offensiveness_score': 'y'}
    )

    # Min-max scaling uses the min/max of ALL rows, i.e. it is computed
    # before the filtering below, so the surviving rows need not span the
    # full scaled range.
    df_ruddit['y'] = (df_ruddit['y'] - df_ruddit.y.min()) / (df_ruddit.y.max() - df_ruddit.y.min())
    df_ruddit = df_ruddit[df_ruddit['text'] != '[deleted]']
    df_ruddit = df_ruddit.drop_duplicates()
    df_ruddit = df_ruddit.dropna()

    # Untrained WordPiece model with BERT-style normalisation/pre-tokenisation.
    raw_tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    raw_tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
    raw_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

    dataset = Dataset.from_pandas(df_ruddit[['text']])

    def get_training_corpus():
        # Feed the trainer in batches of 1000 comments.
        for i in range(0, len(dataset), 1000):
            yield dataset[i : i + 1000]["text"]

    raw_tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    # Wrap the trained tokenizer so it can be called with padding options.
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=raw_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    labels = df_ruddit['y']
    comments = df_ruddit['text']

    return comments, labels, tokenizer


In [None]:

def get_data_classify():
    """Load the classification comments, derive a score, and train a tokenizer.

    Reads the CSV at ``CLASSIFY_DATA``, multiplies each label column by a
    fixed weight, and uses the row-wise mean of the weighted columns as the
    target ``y``. Rows with ``y == 0`` are undersampled (seeded with ``SEED``)
    to match the number of rows with ``y > 0``. A lowercasing, BERT-style
    WordPiece tokenizer with a vocabulary capped at 25,000 entries is then
    trained from scratch on the selected comments.

    The code expects the CSV to provide ``comment_text`` plus the six label
    columns named in ``cat_mtpl``.

    Returns:
        tuple: ``(comments, labels, tokenizer)`` where ``comments`` is the
        ``comment_text`` Series, ``labels`` is the ``y`` Series and
        ``tokenizer`` is a ``PreTrainedTokenizerFast`` wrapping the trained
        tokenizer.
    """
    # The original read an undefined TRAIN_DATA_PATH; the configured path for
    # this dataset is CLASSIFY_DATA.
    df_train = pd.read_csv(CLASSIFY_DATA)

    # Per-category weights applied to the label columns before averaging.
    cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
                'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}
    for category in cat_mtpl:
        df_train[category] = df_train[category] * cat_mtpl[category]

    # Label slice: averages every column from 'toxic' through 'identity_hate'
    # in file order, so those columns must be contiguous in the CSV.
    df_train['score'] = df_train.loc[:, 'toxic':'identity_hate'].mean(axis=1)
    df_train['y'] = df_train['score']

    df_scored = df_train[df_train['y'] > 0]
    df_unscored = df_train[df_train['y'] == 0]

    # Balance the two groups. Capping at the number of zero-score rows keeps
    # .sample() from raising when scored rows outnumber them.
    n_keep = min(len(df_scored), len(df_unscored))
    df_y0_undersample = df_unscored.sample(n=n_keep, random_state=SEED)
    df_train_final = pd.concat([df_scored, df_y0_undersample])

    # Untrained WordPiece model with BERT-style normalisation/pre-tokenisation.
    raw_tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    raw_tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
    raw_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

    dataset = Dataset.from_pandas(df_train_final[['comment_text']])

    def get_training_corpus():
        # Feed the trainer in batches of 1000 comments.
        for i in range(0, len(dataset), 1000):
            yield dataset[i : i + 1000]["comment_text"]

    raw_tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    # Wrap the trained tokenizer so it can be called with padding options.
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=raw_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    labels = df_train_final['y']
    comments = df_train_final['comment_text']

    return comments, labels, tokenizer

# Choose which dataset to evaluate

In [None]:
# Pick the loader named by DATA. An unrecognised value fails here with a
# clear message instead of leaving `tokenized_comments` undefined and
# surfacing later as a NameError inside an Optuna objective.
if DATA == 'classify':
    comments, labels, tokenizer = get_data_classify()
elif DATA == 'ruddit':
    comments, labels, tokenizer = get_ruddit_data()
else:
    raise ValueError(f"Unknown DATA value {DATA!r}; expected 'classify' or 'ruddit'")

# Token-id rows used as model features; padding=True pads each row to the
# longest comment so all rows have equal length.
tokenized_comments = tokenizer(comments.to_list(), padding=True)['input_ids']


In [None]:

def objective_xgb(trial):
    """Optuna objective: fit an XGBRegressor and return its validation MSE.

    Hyperparameters are sampled from ``trial``; the model is trained on 90%
    of the module-level ``tokenized_comments`` / ``labels`` and scored on the
    remaining 10%.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on the validation split.
    """
    # suggest_float (optionally with log=True) samples the same ranges as the
    # deprecated suggest_uniform / suggest_loguniform it replaces.
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 1000, 20000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'max_depth': trial.suggest_categorical('max_depth', [1, 3, 5, 7, 9, 11, 13, 15, 17, 20]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'tree_method': 'gpu_hist',  # fixed, not tuned
    }
    model = XGBRegressor(**params)

    # Fixed random_state: every trial is scored on the same validation split.
    x_train, x_valid, y_train, y_valid = train_test_split(
        tokenized_comments, labels, test_size=0.1, shuffle=True, random_state=1
    )

    # NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
    # constructor rather than fit() - confirm against the installed version.
    model.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=0
    )
    return mean_squared_error(y_valid, model.predict(x_valid))

def objective_ridge(trial):
    """Optuna objective: fit a Ridge regressor and return its validation MSE.

    Hyperparameters are sampled from ``trial``; the model is trained on 90%
    of the module-level ``tokenized_comments`` / ``labels`` and scored on the
    remaining 10%.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on the validation split.
    """
    # suggest_float (optionally with log=True) samples the same ranges as the
    # deprecated suggest_uniform / suggest_loguniform it replaces.
    params = {
        'max_iter': trial.suggest_int("max_iter", 1000, 20000),
        'tol': trial.suggest_float('tol', 1e-4, 0.1, log=True),
        'alpha': trial.suggest_float('alpha', 0.1, 1),
        'solver': trial.suggest_categorical(
            'solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
        ),
    }

    # Fixed random_state: every trial is scored on the same validation split.
    x_train, x_valid, y_train, y_valid = train_test_split(
        tokenized_comments, labels, test_size=0.1, shuffle=True, random_state=1
    )
    model = Ridge(**params)
    model.fit(x_train, y_train)
    return mean_squared_error(y_valid, model.predict(x_valid))

In [None]:
# Run one Optuna study per model. Each study gets its own TPE sampler seeded
# with SEED so the sampler's random draws are repeatable across runs; the
# studies also get their own variable names instead of sharing `study`.
xgb_study = optuna.create_study(
    direction='minimize',
    study_name='XG_boost',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
xgb_study.optimize(objective_xgb, n_trials=n_trials)

# Despite the "classify" in their names, these hold the XGBoost results.
# The names are kept unchanged in case later cells read them.
best_classify_trial = xgb_study.best_trial.value
best_classify_params = xgb_study.best_trial.params

ridge_study = optuna.create_study(
    direction='minimize',
    study_name='Ridge',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
ridge_study.optimize(objective_ridge, n_trials=n_trials)

# `study` previously ended up bound to the Ridge study; keep that binding.
study = ridge_study

print(f"Best Trial Ridge: {ridge_study.best_trial.value}")
print(f"Best Params Ridge: {ridge_study.best_trial.params}")

print(f"Best Trial XGB: {best_classify_trial}")
print(f"Best Params XGB: {best_classify_params}")
