In [1]:
import os
import json
import pathlib
from datetime import datetime
import optuna
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import torch
from typing import List, Dict, Union, Tuple, NamedTuple
from tqdm import tqdm
import scml
# Start a notebook-wide timer; the last cell stops it and prints the elapsed time.
tim = scml.Timer()
tim.start()

# Environment switch read by the HuggingFace `tokenizers` library (not used directly in the visible cells).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Quantile grid; not referenced by any other cell visible in this notebook.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# pandas behaviour/display options, applied in the same order as before.
# NOTE(review): "use_inf_as_na" makes pandas treat +/-inf as missing values; the option is
# deprecated in recent pandas releases -- confirm the pinned pandas version still accepts it.
for _option_name, _option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(_option_name, _option_value)

# Registers the `progress_apply` family on pandas objects.
tqdm.pandas()

# presumably seeds the RNGs of the imported libraries with scml's default seed -- TODO confirm in scml
scml.seed_everything()

In [2]:
# --- Run identity -----------------------------------------------------------
# Each run gets its own timestamp-named output directory under models/xgb/.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = f"models/xgb/{ts}"
os.makedirs(job_dir, exist_ok=True)  # creates missing parents; no error if it already exists

# --- Task definition --------------------------------------------------------
label = "generated"                   # name of the target column
objective: str = "binary:logistic"    # XGBoost objective passed to the training cell

# --- Tuning knobs -----------------------------------------------------------
# NOTE: in the cells visible in this notebook only `objective` and `label` are read;
# the training cell hard-codes its own round count and tree parameters.
num_boost_round: int = 100
n_trials: int = 1
lr: Tuple[float, float] = (1e-3, 1e-3)              # (low, high) pair -- presumably a search range
feature_fraction: Tuple[float, float] = (1, 1)      # (low, high) pair -- presumably a search range
min_data_in_leaf: Tuple[int, int] = (20, 20)        # (low, high) pair -- presumably a search range

In [3]:
# Load the precomputed feature table (path is relative to the notebook's working directory).
df = pd.read_parquet(path="input/features.parquet")
# Print the column / dtype / non-null summary as a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39120 entries, 0 to 39119
Data columns (total 38 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   essay_id                         39120 non-null  int32  
 1   generated                        39120 non-null  int8   
 2   source                           39120 non-null  object 
 3   prompt                           39120 non-null  object 
 4   text                             39120 non-null  object 
 5   text_bsc                         39120 non-null  object 
 6   text_bow                         39120 non-null  object 
 7   text_bow_len                     39120 non-null  int16  
 8   prompt_bsc                       39120 non-null  object 
 9   prompt_bow                       39120 non-null  object 
 10  prompt_bow_len                   39120 non-null  int16  
 11  white_sim                        39120 non-null  float32
 12  ch_len            

In [4]:
# Model inputs are the columns whose name starts with one of these prefixes.
prefixes = ["ch_", "ts_"]

# str.startswith accepts a tuple of prefixes, so every column is tested exactly once.
# The previous nested loop had no `break`, so a column matching two overlapping
# prefixes would have been appended (and fed to the model) twice.
_prefix_tuple = tuple(prefixes)
features = sorted(col for col in df.columns if col.startswith(_prefix_tuple))

print(f"{len(features)} features\n{features}")

26 features
['ch_digit_frac', 'ch_len', 'ch_letter_frac', 'ch_punc_frac', 'ch_repeat_char_frac', 'ch_space_frac', 'ch_upper_frac', 'ts_automated_readability_index', 'ts_coleman_liau_index', 'ts_dale_chall_readability_score', 'ts_difficult_words', 'ts_flesch_kincaid_grade', 'ts_flesch_reading_ease', 'ts_gunning_fog', 'ts_lexicon_count', 'ts_linsear_write_formula', 'ts_mcalpine_eflaw', 'ts_monosyllable_frac', 'ts_polysyllable_frac', 'ts_sentence_count', 'ts_smog_index', 'ts_spache_readability', 'ts_syllable_count', 'ts_syllables_per_sent', 'ts_syllables_per_word', 'ts_words_per_sent']


In [5]:
# Deterministic hold-out split on the `white_sim` column: rows at or above the
# threshold are used for training, rows below it for validation / early stopping.
# The threshold is named once so the two comparisons cannot drift apart.
WHITE_SIM_THRESHOLD = 0.45

tra = df[df["white_sim"] >= WHITE_SIM_THRESHOLD]
val = df[df["white_sim"] < WHITE_SIM_THRESHOLD]
t = len(tra)
v = len(val)
n = t+v

# A missing `white_sim` compares False on both sides, so such rows land in neither
# split; surface that instead of dropping them silently.
if n != len(df):
    print(f"WARNING: {len(df)-n:,} rows have no usable white_sim and are in neither split")

# Fail with a clear message instead of a ZeroDivisionError below or an opaque
# error later in training / AUC computation.
if t == 0 or v == 0:
    raise ValueError(
        f"empty split: len(tra)={t:,}, len(val)={v:,} at white_sim threshold {WHITE_SIM_THRESHOLD}"
    )

print(f"val%={v/n:.4f}, len(tra)={t:,}, len(val)={v:,}")

# All selected features are numeric, so categorical support stays disabled.
dtrain = xgb.DMatrix(tra[features], tra[label], enable_categorical=False)
dval = xgb.DMatrix(val[features], val[label], enable_categorical=False)

val%=0.0293, len(tra)=37,974, len(val)=1,146


In [6]:
%%time
model = xgb.train(
   params={
       "objective": objective,
       "learning_rate": 5e-2,
       "min_child_weight": 20,
       "colsample_bytree": 0.5,
       "max_depth": 6,
   },
   dtrain=dtrain,
   num_boost_round=1000,
   evals=[(dtrain, "train"), (dval, "val")],
   verbose_eval=40,
   early_stopping_rounds=40,
)
print(f"best score {model.best_score:.5f} at iteration {model.best_iteration}")
model.save_model(f"{job_dir}/model.json")

[0]	train-logloss:0.51106	val-logloss:1.07027
[40]	train-logloss:0.18531	val-logloss:0.43078
[80]	train-logloss:0.13724	val-logloss:0.35905
[120]	train-logloss:0.12196	val-logloss:0.34477
[160]	train-logloss:0.11445	val-logloss:0.34112
[200]	train-logloss:0.10861	val-logloss:0.34042
[232]	train-logloss:0.10438	val-logloss:0.34144
best score 0.34019 at iteration 193
CPU times: user 2.56 s, sys: 2.22 s, total: 4.77 s
Wall time: 359 ms


In [7]:
%%time
y_true = val[label].tolist()
y_pred = model.predict(data=dval, iteration_range=(0, model.best_iteration+1))
auc = roc_auc_score(y_true, y_pred, average="macro")
print(f"auc={auc:.4f}")
print(f"y_pred={y_pred.shape}\n{y_pred[:5]}")

auc=0.9718
y_pred=(1146,)
[0.66876435 0.9581846  0.76490575 0.40556785 0.9887474 ]
CPU times: user 7.92 ms, sys: 9.31 ms, total: 17.2 ms
Wall time: 1.76 ms


In [8]:
%%time
scores = model.get_score(importance_type="gain")
assert len(scores)!=0
rows = []
for feature, score in scores.items():
    rows.append({'importance': score, 'feature': feature})
idf = pd.DataFrame.from_records(rows)
idf = idf.sort_values(["importance"], ascending=False, ignore_index=True)
fp = f"{job_dir}/importance.csv"
idf.to_csv(fp, index=True)
print(f"Saved {fp}")
idf.T.head()

Saved models/xgb/20240119_023535/importance.csv
CPU times: user 7.69 ms, sys: 15.7 ms, total: 23.4 ms
Wall time: 2.3 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
importance,248.988647,211.463318,128.60704,86.614632,63.013161,62.729,61.17429,60.815582,49.650307,33.610146,31.162924,30.303158,29.758476,28.55234,26.219093,24.848303,23.787834,23.266651,21.641712,19.54439,14.838866,14.37348,12.513229,12.010085,9.378815,8.070851
feature,ts_polysyllable_frac,ts_syllables_per_word,ch_space_frac,ch_punc_frac,ch_digit_frac,ts_sentence_count,ch_letter_frac,ts_lexicon_count,ts_smog_index,ts_words_per_sent,ts_coleman_liau_index,ts_monosyllable_frac,ch_upper_frac,ch_len,ts_syllables_per_sent,ts_syllable_count,ts_difficult_words,ts_dale_chall_readability_score,ts_flesch_reading_ease,ts_spache_readability,ts_gunning_fog,ts_flesch_kincaid_grade,ts_mcalpine_eflaw,ch_repeat_char_frac,ts_linsear_write_formula,ts_automated_readability_index


In [9]:
# Stop the timer started in the first cell and report the notebook's total run time.
tim.stop()
print("Total time taken " + str(tim.elapsed))

Total time taken 0:00:00.595526
