In [1]:
import os
import json
import pathlib
from datetime import datetime
import optuna
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import torch
from typing import List, Dict, Union, Tuple, NamedTuple
from tqdm import tqdm
import scml
# Start the wall-clock timer first so the reported total covers the whole notebook run.
tim = scml.Timer()
tim.start()

# Presumably silences the HuggingFace tokenizers fork-parallelism warning;
# no tokenizer is used in the cells visible here -- TODO confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentile grid (not referenced by the cells visible in this notebook).
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# pandas options, applied in one call as (pattern, value) pairs.
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases -- confirm
# against the pinned pandas version before upgrading.
pd.set_option(
    "use_inf_as_na", True,
    "max_info_columns", 9999,
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    "max_colwidth", 9999,
)

# Register tqdm's progress-bar hooks on pandas objects.
tqdm.pandas()

# Seed the random number generators through the project helper.
scml.seed_everything()

In [2]:
# --- Task definition -------------------------------------------------------
# Name of the target column and the XGBoost learning objective.
label = "generated"
objective: str = "binary:logistic"

# --- Search / training settings ---------------------------------------------
# NOTE(review): apart from `objective`, the training cell visible in this
# notebook passes literal values rather than reading these settings -- confirm
# which of the two is authoritative.
n_trials: int = 1
num_boost_round: int = 100
lr: Tuple[float, float] = (1e-3, 1e-3)
feature_fraction: Tuple[float, float] = (1, 1)
min_data_in_leaf: Tuple[int, int] = (20, 20)

# --- Output location --------------------------------------------------------
# Every run writes into its own timestamped directory so artefacts from
# earlier runs are not overwritten.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = f"models/xgb/{ts}"
pathlib.Path(job_dir).mkdir(exist_ok=True, parents=True)

In [3]:
# Load the precomputed feature table; later cells take both the model inputs
# and the label column from this frame.
tra = pd.read_parquet("input/features.parquet")
# Print column dtypes and non-null counts as a quick schema check.
tra.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39515 entries, 0 to 39514
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   essay_id                  39515 non-null  int64  
 1   generated                 39515 non-null  int8   
 2   source                    39515 non-null  object 
 3   prompt                    39515 non-null  object 
 4   text                      39515 non-null  object 
 5   text_bsc                  39515 non-null  object 
 6   text_bow                  39515 non-null  object 
 7   text_bow_len              39515 non-null  int32  
 8   prompt_bsc                39515 non-null  object 
 9   prompt_bow                39515 non-null  object 
 10  prompt_bow_len            39515 non-null  int32  
 11  ch_len                    39515 non-null  int32  
 12  ch_digit_frac             39515 non-null  float32
 13  ch_letter_frac            39515 non-null  float32
 14  ch_spa

In [4]:
# Model inputs are all columns whose name starts with the "ch_" prefix,
# kept in alphabetical order so the feature order is stable between runs.
features = sorted(name for name in tra.columns if name.startswith("ch_"))
print(f"{len(features)} features\n{features}")

8 features
['ch_digit_frac', 'ch_len', 'ch_letter_frac', 'ch_punc_frac', 'ch_repeat_char_frac', 'ch_repeat_substring_frac', 'ch_space_frac', 'ch_upper_frac']


In [5]:
# Hold out 20% of the rows for validation. No explicit random_state or
# stratification is passed, so the split relies on whatever global seeding
# was done earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    tra[features],
    tra[label],
    test_size=0.2,
)

# Wrap both partitions in XGBoost's native container; the held-out part is
# used as the validation set during training.
dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=False)
dval = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=False)

In [6]:
%%time
model = xgb.train(
   params={
       "objective": objective,
       "learning_rate": 5e-2,
       "min_child_weight": 20,
       "colsample_bytree": 0.5,
       "max_depth": 6,
   },
   dtrain=dtrain,
   num_boost_round=1000,
   evals=[(dtrain, "train"), (dval, "validation")],
   verbose_eval=40,
   early_stopping_rounds=40,
)
print(f"best score {model.best_score:.5f} at iteration {model.best_iteration}")
model.save_model(f"{job_dir}/model.json")

[0]	train-logloss:0.54567	validation-logloss:0.54251
[40]	train-logloss:0.24947	validation-logloss:0.24876
[80]	train-logloss:0.19754	validation-logloss:0.20065
[120]	train-logloss:0.18014	validation-logloss:0.18655
[160]	train-logloss:0.17224	validation-logloss:0.18052
[200]	train-logloss:0.16781	validation-logloss:0.17782
[240]	train-logloss:0.16458	validation-logloss:0.17621
[280]	train-logloss:0.16201	validation-logloss:0.17501
[320]	train-logloss:0.15986	validation-logloss:0.17412
[360]	train-logloss:0.15775	validation-logloss:0.17342
[400]	train-logloss:0.15558	validation-logloss:0.17304
[440]	train-logloss:0.15360	validation-logloss:0.17259
[480]	train-logloss:0.15169	validation-logloss:0.17237
[520]	train-logloss:0.14982	validation-logloss:0.17185
[560]	train-logloss:0.14797	validation-logloss:0.17167
[600]	train-logloss:0.14634	validation-logloss:0.17152
[640]	train-logloss:0.14527	validation-logloss:0.17170
[645]	train-logloss:0.14504	validation-logloss:0.17168
best score 0.1

In [7]:
%%time
scores = model.get_score(importance_type="gain")
assert len(scores)!=0
rows = []
for feature, score in scores.items():
    rows.append({'importance': score, 'feature': feature})
df = pd.DataFrame.from_records(rows)
df = df.sort_values(["importance"], ascending=False, ignore_index=True)
_path = f"{job_dir}/importance.csv"
df.to_csv(_path, index=True)
print(f"Saved {_path}")
df.T.head()

Saved models/xgb/20240118_071558/features.csv
CPU times: user 6.07 ms, sys: 16.3 ms, total: 22.3 ms
Wall time: 2.02 ms


Unnamed: 0,0,1,2,3,4,5,6,7
importance,54.395298,35.851475,35.267052,21.672609,21.451225,9.819571,5.997214,4.037098
feature,ch_space_frac,ch_letter_frac,ch_punc_frac,ch_digit_frac,ch_len,ch_upper_frac,ch_repeat_substring_frac,ch_repeat_char_frac


In [8]:
# Stop the timer that was started in the first cell and report the elapsed
# time for the whole notebook run.
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:00:00.946309
