In [1]:
import os
import json
import pathlib
from datetime import datetime
import optuna
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import torch
from typing import List, Dict, Union, Tuple, NamedTuple
from tqdm import tqdm
import scml
from scml import pandasx as pdx
tim = scml.Timer()
tim.start()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()

In [2]:
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = f"models/xgb/{ts}"
pathlib.Path(job_dir).mkdir(parents=True, exist_ok=True)
num_boost_round: int = 100
lr: Tuple[float, float] = (1e-3, 1e-3)
feature_fraction: Tuple[float, float] = (1, 1)
min_data_in_leaf: Tuple[int, int] = (20, 20)
objective: str = "binary:logistic"
n_trials: int = 1
label = "generated"

In [3]:
df = pd.read_parquet("input/features.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39120 entries, 0 to 39119
Columns: 29780 entries, essay_id to tf_Ġzygomatic
dtypes: float32(29765), int16(2), int32(5), int8(1), object(7)
memory usage: 4.3+ GB


In [4]:
features = []
prefixes = ["ch_", "ts_", "va_", "tf_"]
for col in df.columns:
    for prefix in prefixes:
        if col.startswith(prefix):
            features.append(col)
features.sort()
print(f"{len(features)} features\n{features[:100]}")

29768 features
['ch_digit_frac', 'ch_len', 'ch_letter_frac', 'ch_punc_frac', 'ch_repeat_char_frac', 'ch_space_frac', 'ch_upper_frac', 'tf_0', 'tf_00', 'tf_000', 'tf_03', 'tf_1', 'tf_10', 'tf_11', 'tf_12', 'tf_13', 'tf_14', 'tf_15', 'tf_16', 'tf_17', 'tf_18', 'tf_19', 'tf_199', 'tf_1990', 'tf_2', 'tf_20', 'tf_200', 'tf_2002', 'tf_21', 'tf_23', 'tf_24', 'tf_25', 'tf_27', 'tf_28', 'tf_3', 'tf_30', 'tf_31', 'tf_32', 'tf_33', 'tf_34', 'tf_38', 'tf_39', 'tf_4', 'tf_40', 'tf_41', 'tf_43', 'tf_45', 'tf_5', 'tf_50', 'tf_538', 'tf_58', 'tf_6', 'tf_60', 'tf_62', 'tf_7', 'tf_70', 'tf_74', 'tf_76', 'tf_79', 'tf_8', 'tf_87', 'tf_9', 'tf_a', 'tf_aa', 'tf_aae', 'tf_aage', 'tf_aaion', 'tf_ab', 'tf_aban', 'tf_abe', 'tf_abel', 'tf_aber', 'tf_abet', 'tf_abeth', 'tf_abil', 'tf_abilites', 'tf_abilitie', 'tf_abilities', 'tf_ability', 'tf_abill', 'tf_abilty', 'tf_abitable', 'tf_abital', 'tf_abl', 'tf_able', 'tf_abled', 'tf_ables', 'tf_abling', 'tf_ablish', 'tf_ablished', 'tf_ablities', 'tf_ablity', 'tf_ably',

In [5]:
#X_train, X_test, y_train, y_test = train_test_split(tra[features], tra[label], test_size=0.2)

tra = df[df["white_sim"]>=0.45]
val = df[df["white_sim"]<0.45]
t = len(tra)
v = len(val)
n = t+v
print(f"val%={v/n:.4f}, len(tra)={t:,}, len(val)={v:,}")
dtrain = xgb.DMatrix(tra[features], tra[label], enable_categorical=False)
dval = xgb.DMatrix(val[features], val[label], enable_categorical=False)
pdx.value_counts(val[label])

val%=0.0293, len(tra)=37,974, len(val)=1,146


Unnamed: 0_level_0,count,percent
generated,Unnamed: 1_level_1,Unnamed: 2_level_1
1,907,0.791449
0,239,0.208551


In [6]:
%%time
model = xgb.train(
   params={
       "objective": objective,
       "learning_rate": 5e-2,
       "min_child_weight": 20,
       "colsample_bytree": 0.5,
       "max_depth": 6,
   },
   dtrain=dtrain,
   num_boost_round=2000,
   evals=[(dtrain, "train"), (dval, "val")],
   verbose_eval=40,
   early_stopping_rounds=100,
)
print(f"best score {model.best_score:.5f} at iteration {model.best_iteration}")
model.save_model(f"{job_dir}/model.json")

[0]	train-logloss:0.51076	val-logloss:1.08934
[40]	train-logloss:0.12861	val-logloss:0.38881
[80]	train-logloss:0.06243	val-logloss:0.23493
[120]	train-logloss:0.03940	val-logloss:0.17246
[160]	train-logloss:0.02863	val-logloss:0.13892
[200]	train-logloss:0.02209	val-logloss:0.11792
[240]	train-logloss:0.01776	val-logloss:0.10540
[280]	train-logloss:0.01470	val-logloss:0.09464
[320]	train-logloss:0.01241	val-logloss:0.08805
[360]	train-logloss:0.01070	val-logloss:0.08333
[400]	train-logloss:0.00941	val-logloss:0.07873
[440]	train-logloss:0.00840	val-logloss:0.07597
[480]	train-logloss:0.00755	val-logloss:0.07408
[520]	train-logloss:0.00687	val-logloss:0.07204
[560]	train-logloss:0.00629	val-logloss:0.07023
[600]	train-logloss:0.00585	val-logloss:0.06919
[640]	train-logloss:0.00546	val-logloss:0.06826
[680]	train-logloss:0.00515	val-logloss:0.06820
[720]	train-logloss:0.00488	val-logloss:0.06712
[760]	train-logloss:0.00465	val-logloss:0.06672
[800]	train-logloss:0.00444	val-logloss:0.06

In [7]:
%%time
y_true = val[label].tolist()
y_pred = model.predict(data=dval, iteration_range=(0, model.best_iteration+1))
auc = roc_auc_score(y_true, y_pred, average="macro")
print(f"auc={auc:.4f}")
print(f"y_pred={y_pred.shape}\n{y_pred[:5]}")

auc=0.9979
y_pred=(1146,)
[0.9656257  0.96811175 0.99439245 0.01315708 0.99998796]
CPU times: user 228 ms, sys: 388 ms, total: 616 ms
Wall time: 62.2 ms


In [8]:
%%time
scores = model.get_score(importance_type="gain")
assert len(scores)!=0
rows = []
for feature, score in scores.items():
    rows.append({'importance': score, 'feature': feature})
idf = pd.DataFrame.from_records(rows)
idf = idf.sort_values(["importance"], ascending=False, ignore_index=True)
fp = f"{job_dir}/importance.csv"
idf.to_csv(fp, index=True)
print(f"Saved {fp}")
idf.T.head()

Saved models/xgb/20240119_155859/importance.csv
CPU times: user 15.2 ms, sys: 32.4 ms, total: 47.5 ms
Wall time: 4.85 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318
importance,781.810486,490.538818,485.316528,336.407135,288.690765,288.211456,229.231964,219.743591,209.669327,181.665131,179.183456,169.901947,168.316895,167.481339,151.356659,143.653076,140.939529,110.069252,101.857819,93.974632,90.115738,88.624748,86.975922,85.581299,77.569267,77.131325,75.604225,70.384636,69.825089,69.298241,67.570801,66.250443,63.203785,62.863422,61.277332,61.199062,57.946815,57.901653,56.458355,54.44944,53.377113,52.523922,50.847752,50.176514,46.688572,46.042809,44.76157,43.390869,42.235153,40.835976,40.318829,40.199787,39.128548,36.081169,36.04126,34.270901,33.954178,33.37429,32.947266,31.265629,30.823727,30.135227,29.626429,29.362932,28.551178,27.447458,26.943707,26.039213,25.908745,25.699692,25.50363,24.424545,24.239599,24.071177,23.032135,22.710127,22.611753,22.584135,21.534424,21.493031,21.450483,21.356768,20.933449,20.710167,20.395821,20.098389,19.472715,19.420181,19.391479,18.932508,18.800821,18.507572,18.358309,18.106339,17.710629,17.690317,17.369865,17.249359,17.130758,16.952984,16.912823,16.879992,16.801064,16.675583,16.646465,16.53372,16.395014,16.263538,16.058931,15.312901,14.979809,14.0781,13.79497,13.216605,13.105409,13.085265,12.985317,12.907841,12.708452,12.667187,12.591174,12.368817,12.266745,12.248855,12.22839,12.214986,12.189629,12.023846,11.685711,11.51645,11.498635,11.412957,11.209677,10.870028,10.845504,10.8053,10.681151,10.634077,10.436765,10.416463,10.245605,9.8927,9.564544,9.530143,9.510941,9.464931,9.168464,9.160788,9.05891,9.055192,8.884005,8.883018,8.678984,8.647542,8.55966,8.511078,8.084966,8.036461,7.956953,7.909729,7.55541,7.534116,7.440822,7.436173,7.433044,7.310189,7.203867,7.152648,7.146918,7.11026,7.101671,6.941996,6.941086,6.813098,6.753841,6.731386,6.682386,6.637693,6.609685,6.514729,6.467379,6.006093,5.971101,5.930752,5.877871,5.752261,5.685339,5.64645,5.622975,5.396545,5.314053,5.280446,5.232251,5.213174,5.166775,5.094082,5.069883,5.025868,4.883653,4.853292,4.834744,4.783371,4.732041,4.612059,4.603532,4.580497,4.535528,4.528878,4.481597,4.420849,4.358688,4.354087,4.349886,4.252539,4.079772,4.05571,3.973468,3.923386,3.863719,3.863282,3.832214,3.77792,3.740831,3.656284,3.606041,3.538441,3.515561,3.495106,3.396851,3.372038,3.335905,3.179139,3.097436,3.039652,3.02588,3.02353,3.010557,2.998088,2.99721,2.875458,2.818633,2.782501,2.717593,2.684284,2.676867,2.652729,2.641006,2.593753,2.583044,2.550439,2.528988,2.523669,2.436218,2.430205,2.418217,2.374122,2.238956,2.226804,2.208097,2.202519,2.190012,2.185219,2.170147,2.127258,2.09442,2.03263,2.009032,2.008704,1.952991,1.919455,1.857847,1.850523,1.818397,1.776517,1.7752,1.757103,1.74378,1.728997,1.66828,1.662821,1.654527,1.638698,1.630503,1.629389,1.59688,1.588866,1.558329,1.52522,1.513808,1.512163,1.476213,1.462033,1.418929,1.396214,1.343253,1.288854,1.281238,1.275908,1.16557,1.143188,1.11098,1.036482,1.035149,0.995978,0.984765,0.976348,0.885169,0.848381,0.841537,0.834027,0.820547,0.80508,0.676841,0.629142,0.569557,0.508066,0.478187,0.420549,0.333369
feature,tf_Ġhey,ts_coleman_liau_index,ts_syllables_per_word,ts_polysyllable_frac,tf_Ġessential,ch_space_frac,tf_Ġessay,tf_Ġthank,tf_Ġbecause,tf_Ġsuccess,tf_Ġadditionally,tf_Ġsuper,tf_Ġachieving,tf_Ġgoals,tf_th,tf_Ġconfused,ts_smog_index,tf_Ġnt,tf_Ġgrader,tf_Ġvery,tf_Ġfirstly,tf_Ġwould,tf_Ġensures,tf_Ġvehicle,tf_Ġ8,ch_letter_frac,ch_punc_frac,tf_Ġsustainable,tf_Ġourselves,tf_Ġelectors,tf_Ġ3,tf_Ġand,tf_Ġultimately,tf_Ġconclusion,ts_monosyllable_frac,tf_Ġimportance,tf_Ġlike,tf_Ġaddress,tf_Ġimportant,tf_Ġvenus,tf_Ġfacial,tf_Ġdear,tf_Ġcomputer,tf_Ġperspectives,ts_lexicon_count,tf_Ġnasa,tf_Ġpotential,tf_Ġday,tf_Ġthen,tf_Ġchina,tf_Ġcool,ch_digit_frac,tf_Ġsense,tf_Ġsignificant,tf_Ġprotect,tf_Ġeurope,tf_Ġsystem,tf_Ġearth,tf_Ġcar,tf_Ġhumans,tf_Ġus,tf_Ġargue,tf_Ġalthough,ch_len,tf_Ġplus,tf_Ġparagraph,tf_Ġetc,tf_Ġskills,tf_Ġcars,tf_Ġpercent,tf_Ġreally,tf_Ġinformed,tf_Ġensure,tf_Ġextracurricular,tf_Ġwriting,tf_Ġactivity,tf_Ġhuman,tf_Ġprovide,tf_Ġprincipal,tf_Ġseagoing,tf_Ġdriving,tf_Ġthe,tf_Ġmost,tf_Ġphone,tf_Ġoverall,tf_Ġprobably,ts_words_per_sent,tf_Ġexperiences,tf_Ġlead,tf_Ġreduce,tf_Ġresources,tf_Ġalmost,tf_Ġhand,ts_sentence_count,tf_Ġdo,tf_Ġleast,tf_Ġeveryday,tf_Ġunique,tf_Ġtext,tf_Ġfurthermore,tf_Ġyou,ts_dale_chall_readability_score,tf_Ġschool,tf_Ġstate,tf_Ġmean,tf_Ġfair,tf_Ġconsider,tf_Ġhere,tf_Ġstates,tf_Ġreason,tf_Ġsecondly,tf_Ġservice,tf_Ġmany,tf_Ġsupport,tf_Ġgrade,tf_Ġcommunity,tf_Ġhealth,tf_Ġimpact,tf_Ġoffer,tf_Ġmy,tf_Ġwill,tf_Ġis,ts_mcalpine_eflaw,tf_Ġgo,tf_Ġlearn,tf_Ġmight,va_valence,tf_Ġeven,tf_Ġwhat,tf_Ġfinally,tf_Ġtrue,tf_Ġagree,tf_Ġso,tf_Ġtransportation,tf_Ġif,tf_Ġsincerely,tf_Ġthough,tf_Ġcould,tf_Ġtechnology,tf_Ġstudent,tf_Ġarticle,tf_Ġvehicles,tf_Ġknow,tf_Ġreasons,tf_Ġcreate,ch_upper_frac,tf_Ġstudents,tf_Ġdifficult,tf_Ġpoint,tf_Ġbenefits,tf_Ġafter,tf_Ġlet,tf_Ġbelieve,tf_Ġboth,tf_Ġi,tf_Ġclear,tf_Ġown,va_arousal,tf_Ġlimiting,tf_Ġam,tf_Ġat,ts_difficult_words,tf_Ġplanet,tf_Ġwe,tf_Ġthroughout,tf_Ġmatter,tf_Ġmeans,ts_syllable_count,tf_Ġshould,ts_spache_readability,tf_Ġit,tf_Ġfocus,tf_Ġunited,tf_Ġexplore,tf_Ġno,tf_Ġeverything,tf_Ġour,ts_flesch_kincaid_grade,tf_Ġget,ts_gunning_fog,tf_Ġsaid,tf_Ġkids,tf_Ġthey,ts_flesch_reading_ease,tf_Ġmay,tf_Ġdone,tf_Ġcan,tf_Ġbig,ts_syllables_per_sent,tf_Ġanother,va_dominance,tf_Ġwas,tf_Ġan,tf_Ġlast,tf_Ġoften,tf_Ġwhich,tf_Ġhard,tf_Ġhelping,tf_Ġits,tf_Ġgoing,tf_Ġthink,tf_Ġfriend,tf_Ġabout,tf_Ġpeople,tf_Ġname,tf_Ġme,tf_Ġexample,tf_Ġbut,tf_Ġyour,tf_Ġwhy,ts_automated_readability_index,tf_Ġwere,tf_Ġhome,tf_Ġwhile,tf_Ġfun,tf_Ġmuch,tf_Ġto,tf_Ġsports,tf_Ġevery,tf_Ġwho,tf_Ġfirst,tf_Ġeasier,tf_Ġsure,tf_Ġin,tf_Ġout,tf_Ġover,tf_Ġtake,tf_Ġnot,tf_Ġtwo,tf_Ġlives,tf_Ġthis,tf_Ġschools,tf_Ġmake,tf_Ġonce,tf_Ġworking,tf_Ġhis,tf_Ġthan,tf_Ġjust,tf_Ġwith,tf_Ġall,tf_Ġgood,tf_Ġhelps,tf_Ġnow,tf_Ġhowever,tf_Ġcause,tf_Ġstay,tf_Ġhaving,tf_Ġone,tf_Ġby,tf_Ġwhen,tf_Ġlearning,tf_Ġtime,tf_Ġface,tf_Ġtheir,tf_Ġinstead,tf_Ġbenefit,tf_Ġa,tf_Ġdown,tf_Ġfor,tf_Ġthere,tf_Ġusing,tf_Ġdoing,ts_linsear_write_formula,tf_Ġnumber,tf_Ġkeep,tf_Ġbecome,tf_Ġlife,tf_Ġperson,tf_Ġbeing,tf_Ġor,tf_Ġsame,tf_Ġhelp,tf_Ġare,tf_Ġmore,tf_Ġalso,tf_Ġbe,tf_Ġsome,tf_Ġput,tf_Ġas,tf_Ġthings,tf_Ġthat,tf_Ġhave,tf_Ġof,tf_Ġthem,ch_repeat_char_frac,tf_Ġon,tf_Ġfrom,tf_Ġbetter,tf_Ġduring,tf_Ġothers,tf_Ġonly,tf_Ġalways,tf_Ġdifferent,tf_Ġsee,tf_Ġother,tf_Ġnew,tf_Ġpart,tf_Ġthing,tf_Ġhow,tf_Ġpolicy,tf_Ġwant,tf_Ġidea,tf_Ġneed,tf_Ġway,tf_Ġup,tf_Ġwell,tf_Ġable,tf_Ġwork,tf_Ġaround,tf_Ġs,tf_Ġsuch,tf_Ġthese,tf_Ġfeel,tf_Ġfuture,tf_Ġsay,tf_Ġlot,tf_Ġfriends,tf_Ġsomeone,tf_Ġgreat


In [9]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:16:10.125440
