In [1]:
import os
import json
import pathlib
from datetime import datetime
import optuna
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import torch
from typing import List, Dict, Union, Tuple, NamedTuple
from tqdm import tqdm
import scml
tim = scml.Timer()
tim.start()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()

In [2]:
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = f"models/xgb/{ts}"
pathlib.Path(job_dir).mkdir(parents=True, exist_ok=True)
num_boost_round: int = 100
lr: Tuple[float, float] = (1e-3, 1e-3)
feature_fraction: Tuple[float, float] = (1, 1)
min_data_in_leaf: Tuple[int, int] = (20, 20)
objective: str = "binary:logistic"
n_trials: int = 1
label = "generated"

In [3]:
tra = pd.read_parquet("input/features_tra.parquet")
tra.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        1378 non-null   object 
 1   prompt_id                 1378 non-null   int64  
 2   text                      1378 non-null   object 
 3   generated                 1378 non-null   int8   
 4   text_bsc                  1378 non-null   object 
 5   text_bow                  1378 non-null   object 
 6   text_bow_len              1378 non-null   int16  
 7   ch_len                    1378 non-null   int32  
 8   ch_digit_frac             1378 non-null   float32
 9   ch_letter_frac            1378 non-null   float32
 10  ch_space_frac             1378 non-null   float32
 11  ch_punc_frac              1378 non-null   float32
 12  ch_upper_frac             1378 non-null   float32
 13  ch_repeat_char_frac       1378 non-null   float32
 14  ch_repea

In [4]:
features = ["text_bow_len"]
for col in tra.columns:
    if col.startswith("ch_"):
        features.append(col)
features.sort()
print(f"{len(features)} features\n{features}")

9 features
['ch_digit_frac', 'ch_len', 'ch_letter_frac', 'ch_punc_frac', 'ch_repeat_char_frac', 'ch_repeat_substring_frac', 'ch_space_frac', 'ch_upper_frac', 'text_bow_len']


In [5]:
X_train, X_test, y_train, y_test = train_test_split(tra[features], tra[label], test_size=0.2)
dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=False)
dval = xgb.DMatrix(X_test, y_test, enable_categorical=False)

In [6]:
%%time
model = xgb.train(
   params={
       "objective": objective,
       "learning_rate": 1e-1,
       "min_child_weight": 4,
       "colsample_bytree": 0.5,
       "max_depth": 6,
   },
   dtrain=dtrain,
   num_boost_round=100,
   evals=[(dtrain, "train"), (dval, "validation")],
   verbose_eval=10,
   early_stopping_rounds=10,
)
print(f"best score {model.best_score:.5f} at iteration {model.best_iteration}")
model.save_model(f"{job_dir}/model.json")

[0]	train-logloss:0.11888	validation-logloss:0.12278
[10]	train-logloss:0.04738	validation-logloss:0.05354
[20]	train-logloss:0.02284	validation-logloss:0.03093
[30]	train-logloss:0.01440	validation-logloss:0.02390
[40]	train-logloss:0.01169	validation-logloss:0.02222
[50]	train-logloss:0.01118	validation-logloss:0.02210
[55]	train-logloss:0.01118	validation-logloss:0.02210
best score 0.02210 at iteration 45
CPU times: user 117 ms, sys: 213 ms, total: 331 ms
Wall time: 30.1 ms


In [7]:
%%time
scores = model.get_score(importance_type="gain")
assert len(scores)!=0
rows = []
for feature, score in scores.items():
    rows.append({'importance': score, 'feature': feature})
df = pd.DataFrame.from_records(rows)
df = df.sort_values(["importance"], ascending=False, ignore_index=True)
_path = f"{job_dir}/features.csv"
df.to_csv(_path, index=True)
print(f"Saved {_path}")
df.T.head()

Saved models/xgb/20240117_020926/features.csv
CPU times: user 7.24 ms, sys: 18 ms, total: 25.3 ms
Wall time: 2.21 ms


Unnamed: 0,0,1,2,3,4,5,6,7
importance,0.486534,0.459423,0.388494,0.377687,0.363913,0.176891,0.137881,0.081103
feature,ch_letter_frac,ch_len,ch_upper_frac,ch_digit_frac,text_bow_len,ch_punc_frac,ch_repeat_substring_frac,ch_repeat_char_frac


In [8]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:00:00.095558
