In [10]:
import sys
import os
import gc

import json
import time
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
import catboost as ctb
from catboost import CatBoostClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    SentencePieceBPETokenizer
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

import optuna
from optuna.samplers import TPESampler, CmaEsSampler
from optuna.pruners import MedianPruner, HyperbandPruner
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate,
)

optuna.logging.set_verbosity(optuna.logging.WARNING)

import zipfile
import warnings
from warnings import simplefilter 
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [11]:
# Requires the KAGGLE_USERNAME and KAGGLE_KEY credentials defined in the kaggle.json file downloaded from your Kaggle account
!kaggle datasets download -d thedrcat/daigt-v2-train-dataset
!kaggle competitions download -c llm-detect-ai-generated-text

In [12]:
class CFG:
    # Central configuration namespace read by the rest of the notebook.

    # --- Paths ---
    # Directory for Optuna artifacts (study database and best-params JSON).
    JOBS_PATH = Path("/", "kaggle", "working", "job_artifacts")
    TRAINING_PATH = Path(".", "daigt-v2-train-dataset", "train_v2_drcat_02.csv")
    TEST_PATH = Path(".", "llm-detect-ai-generated-text", "test_essays.csv")
    SUB_PATH = Path(".", "llm-detect-ai-generated-text", "sample_submission.csv")
    # File name (inside JOBS_PATH) where the best Optuna parameters are dumped.
    BEST_PARAMS = "best_params_voting.json"

    # --- Feature extraction ---
    # When True the TF-IDF vocabulary is restricted to n-grams seen in the test set.
    INFERENCE = False
    NGRAM_RANGE = (3, 5)
    # When True a lowercase normalizer is added to the BPE tokenizer.
    LOWERCASE = False
    VOCAB_SIZE = 30522

    # --- Validation / training ---
    # When True predictions are averaged over the cross-validated models;
    # otherwise a single model is fitted on the whole training matrix.
    ENSEMBLE = False
    TEST_SIZE = 0.1
    RANDOM_STATE = 6743
    N_FOLDS = 1
    USE_GPU = False
    # Model to train: a callable/class, or the string "voting_classifier"
    # which is resolved to the factory function further down.
    MODEL = "voting_classifier"
    CROSSVAL = StratifiedShuffleSplit(n_splits=N_FOLDS, test_size=TEST_SIZE, random_state=RANDOM_STATE)

    # --- Hyperparameter search (Optuna) ---
    # Seed the sampler so that repeated searches propose the same trials.
    SAMPLER = TPESampler(seed=RANDOM_STATE)
    PRUNER = HyperbandPruner()
    USE_BEST = False
    # When True the Optuna search cell is executed.
    HYPERTUNE = False
    # When True an existing study is loaded instead of being recreated.
    PRELOAD_STUDY = False
    # Early-stopping patience forwarded to the estimators' fit call during tuning.
    EARLY_STOP = 50
    N_TRIALS = 100
    N_JOBS = 1

In [13]:
# Unpack each downloaded archive (named after the data directory) into that directory.
for data_file in (CFG.TRAINING_PATH, CFG.TEST_PATH):
    target_dir = data_file.parent
    with zipfile.ZipFile(target_dir.stem + ".zip", "r") as zip_ref:
        zip_ref.extractall(target_dir)

In [14]:
def xgb_param_space(trial):
    """Return the XGBoost search space for an Optuna trial.

    No hyperparameters are tuned yet, so the trial is not consulted and an
    empty dict is returned.
    """
    return {}

def xgb_callbacks(trial):
    """Return the fit callbacks used while tuning XGBoost with Optuna.

    The only callback is Optuna's XGBoost pruning callback. With the
    scikit-learn API the observation key has to carry the index of the
    evaluation set, hence "validation_0-auc" for the first (and only) one.
    """
    optuna_prune = optuna.integration.XGBoostPruningCallback(trial, "validation_0-auc")

    return [optuna_prune]

def xgb_best_params():
    """Return the fixed XGBoost hyperparameters (no search involved).

    The device follows CFG.USE_GPU; every other value is hard-coded.
    """
    return dict(
        device="gpu" if CFG.USE_GPU else "cpu",
        verbosity=0,
        objective="binary:logistic",
        eval_metric="auc",
        n_estimators=3000,  # upper bound on the number of boosting rounds
        learning_rate=0.00581909898961407,
        colsample_bytree=0.78,
        colsample_bynode=0.8,
        random_state=6743,
    )

def lgbm_param_space(trial):
    """Build the LightGBM search space for one Optuna trial.

    The tree depth is sampled first because the upper bound of num_leaves is
    derived from it (75% of 2**max_depth), keeping the leaf count below the
    capacity of a tree of that depth.
    """
    depth = trial.suggest_int("max_depth", 5, 12)
    leaf_cap = int((2**depth) * 0.75)
    leaves = trial.suggest_int("num_leaves", 8, leaf_cap)

    # Settings that are not part of the search.
    fixed = {
        "device": "gpu" if CFG.USE_GPU else "cpu",
        "verbose": -1,
        "n_jobs": 1,
        "objective": "cross_entropy",
        "metric": "auc",
        "importance_type": "gain",
        "boosting_type": "gbdt",
        "n_estimators": 3000,  # upper bound on the number of boosting rounds
    }

    # Settings sampled by the trial.
    tuned = {
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 0, 3),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1.0, log=True),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    return {**fixed, **tuned, "num_leaves": leaves, "max_depth": depth}

def lgbm_callbacks(trial):
    """Return the fit callbacks used while tuning LightGBM with Optuna.

    Two callbacks are returned, in this order: the Optuna pruning callback
    watching the "auc" metric, and LightGBM's own early stopping. The
    early-stopping patience is read from CFG.EARLY_STOP so it stays in sync
    with the value the tuning cell forwards to fit.
    """
    optuna_prune = optuna.integration.LightGBMPruningCallback(trial, "auc")
    inner_early_stop = lgb.early_stopping(CFG.EARLY_STOP, verbose=False)

    return [optuna_prune, inner_early_stop]


def lgbm_best_params():
    """Return the fixed LightGBM hyperparameters (no search involved).

    The device follows CFG.USE_GPU; every other value is hard-coded.
    """
    return dict(
        device="gpu" if CFG.USE_GPU else "cpu",
        verbose=-1,
        objective="binary",
        metric="auc",
        n_estimators=1500,  # upper bound on the number of boosting rounds
        learning_rate=0.05073909898961407,
        colsample_bytree=0.726023996436955,
        colsample_bynode=0.5803681307354022,
        lambda_l1=8.562963348932286,
        lambda_l2=4.893256185259296,
        min_data_in_leaf=115,
        max_depth=23,
        max_bin=898,
    )


def ctb_param_space(trial):
    """Build the CatBoost search space for one Optuna trial.

    Only parameters that are actually handed to the model are sampled.
    A "num_leaves" value used to be suggested here without ever being passed
    on, which added a meaningless dimension to the search and to the saved
    best parameters.
    """
    max_depth = trial.suggest_int("max_depth", 5, 12)

    param_space = {
        "verbose": 0,
        "loss_function": "CrossEntropy",
        "eval_metric": "AUC",
        "use_best_model": True,
        "allow_const_label": True,
        "bootstrap_type": "Bayesian",
        "n_estimators": 3000,  # upper bound on the number of boosting rounds
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-6, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_depth": max_depth
    }

    return param_space


def ctb_callbacks(trial):
    """Return the fit callbacks used while tuning CatBoost with Optuna.

    A single Optuna pruning callback watching the "AUC" metric is returned.
    """
    return [optuna.integration.CatBoostPruningCallback(trial, "AUC")]


def ctb_best_params():
    """Return the fixed CatBoost hyperparameters (no search involved).

    The task type follows CFG.USE_GPU; every other value is hard-coded.
    """
    return dict(
        iterations=1000,
        verbose=0,
        task_type="GPU" if CFG.USE_GPU else "CPU",
        random_seed=1234,
        l2_leaf_reg=6.6591278779517808,
        learning_rate=0.005689066836106983/2,
        allow_const_label=True,
        loss_function="CrossEntropy",
    )


def mnb_param_space(trial):
    """Build the MultinomialNB search space: a log-uniform smoothing alpha."""
    smoothing = trial.suggest_float("alpha", 1e-5, 1, log=True)
    return dict(alpha=smoothing)

def mnb_callbacks(trial):
    """Return the fit callbacks for MultinomialNB: none, so a new empty list."""
    return list()


def mnb_best_params():
    """Return the fixed MultinomialNB hyperparameters (smoothing alpha only)."""
    return dict(alpha=0.02)


def sgd_param_space(trial):
    """Build the SGDClassifier search space for one Optuna trial.

    The iteration budget and loss are categorical choices, the elastic-net
    mixing ratio is uniform on [0, 1] and alpha is log-uniform; the tolerance
    and the penalty type are fixed.
    """
    # 2000, 3000, ..., 10000
    iteration_choices = list(range(2000, 10001, 1000))

    max_iter = trial.suggest_categorical("max_iter", iteration_choices)
    loss = trial.suggest_categorical("loss", ["log_loss", "modified_huber"])
    l1_ratio = trial.suggest_float("l1_ratio", 0., 1.)
    alpha = trial.suggest_float("alpha", 1e-5, 1, log=True)

    return {
        "max_iter": max_iter,
        "tol": 1e-4,
        "penalty": "elasticnet",
        "loss": loss,
        "l1_ratio": l1_ratio,
        "alpha": alpha,
    }


def sgd_callbacks(trial):
    """Return the fit callbacks for SGDClassifier: none, so a new empty list."""
    return list()


def sgd_best_params():
    """Return the fixed SGDClassifier hyperparameters (no random_state is set)."""
    return dict(max_iter=8000, tol=1e-4, loss="modified_huber")


def vot_param_space(trial):
    """Build the voting-ensemble search space: one weight in [0, 1] per member."""
    weight_names = ("w_mnb", "w_sgd", "w_ctb", "w_lgb")
    return {name: trial.suggest_float(name, 0., 1.) for name in weight_names}


def vot_callbacks(trial):
    """Return the fit callbacks for the voting ensemble: none, so a new empty list."""
    return list()


def vot_best_params():
    """Return the fixed voting weights; XGBoost has no weight and is left out."""
    return dict(w_mnb=0.05, w_sgd=0.35, w_ctb=0.85, w_lgb=0.05)


In [15]:
# Registry keyed by model name. Each entry exposes three callables:
#   "param_space"(trial) -> hyperparameters sampled for an Optuna trial
#   "callbacks"(trial)   -> fit callbacks to use during tuning
#   "best_params"()      -> fixed hyperparameters used without tuning
MODEL_CFG = {
    model_name: {
        "param_space": space_func,
        "callbacks": callbacks_func,
        "best_params": best_func,
    }
    for model_name, space_func, callbacks_func, best_func in (
        (LGBMClassifier.__name__, lgbm_param_space, lgbm_callbacks, lgbm_best_params),
        (XGBClassifier.__name__, xgb_param_space, xgb_callbacks, xgb_best_params),
        (CatBoostClassifier.__name__, ctb_param_space, ctb_callbacks, ctb_best_params),
        (MultinomialNB.__name__, mnb_param_space, mnb_callbacks, mnb_best_params),
        (SGDClassifier.__name__, sgd_param_space, sgd_callbacks, sgd_best_params),
        ("voting_classifier", vot_param_space, vot_callbacks, vot_best_params),
    )
}

In [16]:
def voting_classifier(*args, **kwargs):
    """Build a soft-voting ensemble from the weights given as keyword arguments.

    Recognised keywords are w_mnb, w_sgd, w_xgb, w_lgb and w_ctb. Every weight
    that is present adds the matching estimator, configured with that model's
    "best_params" from MODEL_CFG. Weights and estimators are collected in the
    same fixed order (mnb, sgd, xgb, lgb, ctb) so they line up positionally.
    Positional arguments are accepted but not used.

    Returns an unfitted VotingClassifier.
    """
    members = (
        ("mnb", MultinomialNB),
        ("sgd", SGDClassifier),
        ("xgb", XGBClassifier),
        ("lgb", LGBMClassifier),
        ("ctb", CatBoostClassifier),
    )

    # First pass: gather the weights and attach each member's fixed parameters.
    weights = []
    for tag, estimator_cls in members:
        weight_key = f"w_{tag}"
        if weight_key in kwargs:
            weights.append(kwargs[weight_key])
            kwargs[f"params_{tag}"] = MODEL_CFG[estimator_cls.__name__]["best_params"]()

    # Second pass: instantiate every member that has parameters available.
    estimators = [
        (tag, estimator_cls(**kwargs[f"params_{tag}"]))
        for tag, estimator_cls in members
        if f"params_{tag}" in kwargs
    ]

    return VotingClassifier(estimators=estimators, voting='soft', weights=weights, verbose=True)


# CFG.MODEL may be given as a name; swap it for the object of that name defined
# in this notebook (e.g. "voting_classifier" or "LGBMClassifier"). Later cells
# read CFG.MODEL.__name__, which a plain string does not have. An unknown name
# raises KeyError here instead of failing later.
if isinstance(CFG.MODEL, str):
    CFG.MODEL = globals()[CFG.MODEL]

In [17]:
# Make sure the artifact directory (and any missing parent) exists.
os.makedirs(CFG.JOBS_PATH, exist_ok=True)

In [18]:
# Load the training essays, the test essays and the sample submission.
train = pd.read_csv(CFG.TRAINING_PATH, sep=",")
test, sub = (pd.read_csv(csv_path) for csv_path in (CFG.TEST_PATH, CFG.SUB_PATH))

In [19]:
# Drop essays written for the excluded prompts, remove duplicated texts and
# renumber the remaining rows from zero.
excluded_prompt_name_list = ['Distance learning','Grades for extracurricular activities', 'Summer projects']
keep_mask = ~train["prompt_name"].isin(excluded_prompt_name_list)
train = (
    train.loc[keep_mask]
    .drop_duplicates(subset=["text"])
    .reset_index(drop=True)
)

In [20]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# NFC normalization is always applied; lowercasing is added only when configured.
# The parentheses matter: without them the conditional expression swallows the
# whole concatenation and the normalizer list is empty when LOWERCASE is False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if CFG.LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=CFG.VOCAB_SIZE, special_tokens=special_tokens)

# The BPE vocabulary is learned from the test essays, fed in batches of 1000 texts.
dataset = Dataset.from_pandas(test[['text']])
def train_corp_iter():
    """Yield the test texts in consecutive batches of up to 1000."""
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer so it exposes the transformers tokenizer interface.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize both corpora with the same tokenizer (test first, then train).
tokenized_texts_test = [tokenizer.tokenize(text) for text in tqdm(test['text'].tolist())]
tokenized_texts_train = [tokenizer.tokenize(text) for text in tqdm(train['text'].tolist())]

100%|██████████| 3/3 [00:00<?, ?it/s]
100%|██████████| 34497/34497 [00:53<00:00, 644.04it/s]


In [21]:
# The texts are already tokenized, so the vectorizer must not tokenize or
# preprocess them again: this identity function is plugged in for both steps.
def dummy(text):
    """Return the input unchanged."""
    return text

# Options shared by both vectorizers below.
tfidf_options = dict(
    ngram_range=CFG.NGRAM_RANGE,
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# First pass: fit on the test tokens only to collect their n-gram vocabulary.
vectorizer = TfidfVectorizer(**tfidf_options)
vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

# Second pass: the test vocabulary is imposed only in inference mode; otherwise
# the vocabulary is learned from the training tokens.
vectorizer = TfidfVectorizer(vocabulary=vocab if CFG.INFERENCE else None, **tfidf_options)

tf_train = vectorizer.fit_transform(tokenized_texts_train).astype(np.float32)
tf_test = vectorizer.transform(tokenized_texts_test).astype(np.float32)
y_train = train['label'].values

# Release everything that is no longer needed once the matrices are built.
del vectorizer, tokenized_texts_test, tokenized_texts_train, dataset, raw_tokenizer, tokenizer, train, test, vocab, tfidf_options
gc.collect()

In [None]:
def cross_validate(model_type, model_params, x, y, cv=StratifiedKFold(), scorer=roc_auc_score, *args, **kwargs):
    """Fit one model per split of ``cv`` and score it on the held-out part.

    Parameters
    ----------
    model_type : callable
        Called as ``model_type(**model_params)`` to create a fresh model for
        every split.
    model_params : dict
        Keyword arguments for ``model_type``.
    x, y :
        Features and labels; both are indexed with the index arrays produced
        by ``cv.split(x, y)``.
    cv :
        Splitter exposing ``n_splits`` and ``split``.
    scorer : callable
        Called as ``scorer(y_true, y_score)`` with the positive-class
        probabilities of the validation rows.
    *args, **kwargs :
        Forwarded to ``model.fit`` together with ``eval_set``.

    Returns
    -------
    (scores, models) : (numpy.ndarray, list)
        One score per split (left at 0 for a split with an empty validation
        part) and the fitted models in split order.

    Notes
    -----
    ``fit`` is first called with ``eval_set`` and the extra arguments. If that
    call fails, the model is refitted with only ``(x_train, y_train)``; this is
    the path taken by estimators whose ``fit`` rejects those arguments.
    ``optuna.TrialPruned`` is re-raised so pruning still ends the trial, and
    ``KeyboardInterrupt`` is no longer intercepted.
    """
    scores = np.zeros(cv.n_splits)

    fitted_models = []
    print("Starting evaluation...")
    print("=" * 30)

    for i, (train_index, val_index) in enumerate(cv.split(x, y)):

        x_train, x_val = x[train_index], x[val_index]
        y_train, y_val = y[train_index], y[val_index]

        model = model_type(**model_params)

        eval_set = None
        if x_val.shape[0] > 0:
            eval_set = [(x_val, y_val)]

        start = time.time()
        try:
            model.fit(x_train, y_train, *args, eval_set=eval_set, **kwargs)
        except optuna.TrialPruned:
            # A pruning callback asked to stop this trial: let Optuna handle it.
            raise
        except Exception as exc:
            # Best-effort fallback for estimators that reject the extra arguments.
            print(f"fit with eval_set failed ({type(exc).__name__}); retrying with a plain fit")
            model.fit(x_train, y_train)
        end = time.time()

        fitted_models.append(model)

        if eval_set is not None:
            y_pred = model.predict_proba(x_val)[:, 1]
            scores[i] = scorer(y_val, y_pred)

            print(f"Fold {i + 1}: {scores[i]:.4f} (took {end - start:.2f}s)")

        gc.collect()

    # Label the summary with the scorer actually used instead of a fixed "MAE".
    scorer_name = getattr(scorer, "__name__", "score")
    print("-" * 30)
    print(f"Average {scorer_name} = {scores.mean():.4f} ± {scores.std():.4f}")
    print("=" * 30)

    return scores, fitted_models

In [None]:
# Optimization runner
def run_optimization(objective, n_trials=100, sampler=TPESampler(), pruner=None, n_jobs=1, best_trial=None):
    """Run the given objective with Optuna and return the study results.

    The study is named "optuna-study" and stored in a SQLite database under
    CFG.JOBS_PATH. With CFG.PRELOAD_STUDY the existing study is loaded;
    otherwise any study of that name is deleted and a new maximising study is
    created.

    Parameters
    ----------
    objective : callable
        Optuna objective taking a trial and returning the value to maximise.
    n_trials : int
        Number of trials to run.
    sampler, pruner :
        Optuna sampler and pruner handed to the study.
    n_jobs : int
        Number of parallel jobs for ``study.optimize``.
    best_trial : dict, optional
        Parameters enqueued as the first trial when given.

    Returns
    -------
    optuna.study.Study
        The study after optimisation. After every trial the current best
        parameters are written to CFG.JOBS_PATH / CFG.BEST_PARAMS.
    """

    study_name = "optuna-study"
    storage = "sqlite:///" + str(CFG.JOBS_PATH.joinpath(study_name)) + ".db"
    print("Creating study...")

    if CFG.PRELOAD_STUDY:
        study = optuna.load_study(
            study_name=study_name,
            sampler=sampler,
            pruner=pruner,
            storage=storage
        )

    else:
        try:
            optuna.study.delete_study(
                study_name=study_name,
                storage=storage
            )
        except KeyError:
            # Raised when no study with this name is stored yet; other storage
            # errors are real problems and are allowed to propagate.
            print("Study does not exist...")

        study = optuna.create_study(
            study_name=study_name,
            direction="maximize",
            sampler=sampler,
            pruner=pruner,
            storage=storage
        )

    def save_best_params(study, trial):
        """Dump the current best parameters to disk (callback run after each trial)."""
        try:
            best_params = study.best_params
        except ValueError:
            # No trial has completed yet (e.g. the first ones were pruned).
            return
        with open(CFG.JOBS_PATH.joinpath(CFG.BEST_PARAMS), "w") as f:
            json.dump(best_params, f, indent=4)

    if best_trial is not None:
        print("Enqueuing previous best trial ...")
        study.enqueue_trial(best_trial)

    print("Running hyperparameter search...")
    study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs, show_progress_bar=True, callbacks=[save_best_params])

    print(f"Number of finished trials: {len(study.trials)}")
    print(f"Best AUC: {study.best_value:.4f}")

    print("Parameters: ")
    print("=" * 10)
    print(study.best_params)

    print("Saving best parameters...")
    save_best_params(study, None)

    return study

In [None]:
def get_objective_function(model_cls, param_func, x, y, cv=StratifiedKFold(), callbacks_func=None, *args, **kwargs):
    """Create the Optuna objective for one model family.

    The returned callable takes a trial, asks ``param_func`` for the model
    parameters and ``callbacks_func`` (when given) for the fit callbacks, runs
    ``cross_validate`` with them and returns the mean of the split scores.
    Extra positional and keyword arguments are passed on to ``cross_validate``.
    """

    def objective_function(trial):
        """Score one trial: mean cross-validation score of the sampled model."""
        sampled_params = param_func(trial)

        if callbacks_func is None:
            fit_callbacks = None
        else:
            fit_callbacks = callbacks_func(trial)

        fold_scores, _fitted = cross_validate(
            model_cls, sampled_params, x, y, cv=cv, callbacks=fit_callbacks, *args, **kwargs
        )
        return fold_scores.mean()

    return objective_function

In [None]:
# Look up the selected model and its tuning helpers in the registry.
model_cls = CFG.MODEL
model_cfg = MODEL_CFG[model_cls.__name__]
param_space_func, callbacks_func = model_cfg["param_space"], model_cfg["callbacks"]

# The search only runs when explicitly enabled in the configuration.
if CFG.HYPERTUNE:
    objective = get_objective_function(
        model_cls, param_space_func, tf_train, y_train,
        cv=CFG.CROSSVAL, callbacks_func=callbacks_func, early_stopping_rounds=CFG.EARLY_STOP,
    )
    study = run_optimization(
        objective, n_trials=CFG.N_TRIALS, sampler=CFG.SAMPLER, pruner=CFG.PRUNER, n_jobs=CFG.N_JOBS,
    )

In [None]:
# Train with the fixed ("best") parameters of the selected model.
model_cls = CFG.MODEL
model_cfg = MODEL_CFG[model_cls.__name__]
model_params = model_cfg["best_params"]()

if not CFG.ENSEMBLE:
    # Single model fitted on the whole training matrix.
    model = model_cls(**model_params)
    print(model)
    model.fit(tf_train, y_train)
    gc.collect()

    predictions = model.predict_proba(tf_test)[:, 1]

else:
    # One model per cross-validation split; test probabilities are averaged.
    scores, trained_models = cross_validate(model_cls, model_params, tf_train, y_train, cv=CFG.CROSSVAL)
    gc.collect()

    predictions = 0.
    for model in trained_models:
        fold_probabilities = model.predict_proba(tf_test)[:, 1]
        predictions = predictions + fold_probabilities / len(trained_models)

In [None]:
# Store the predicted probabilities in the submission frame, write it to disk
# and display it as the cell output.
sub = sub.assign(generated=predictions)
sub.to_csv("submission.csv", index=False)
sub