In [1]:
import os
import json
import pathlib
from datetime import datetime
import optuna
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import torch
from typing import List, Dict, Union, Tuple, NamedTuple
from tqdm import tqdm
import scml
from scml import pandasx as pdx
tim = scml.Timer()
tim.start()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()

In [2]:
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = f"models/xgb/{ts}"
pathlib.Path(job_dir).mkdir(parents=True, exist_ok=True)
num_boost_round: int = 100
lr: Tuple[float, float] = (1e-3, 1e-3)
feature_fraction: Tuple[float, float] = (1, 1)
min_data_in_leaf: Tuple[int, int] = (20, 20)
objective: str = "binary:logistic"
n_trials: int = 1
label = "generated"

In [3]:
df = pd.read_parquet("input/features.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39120 entries, 0 to 39119
Columns: 29787 entries, essay_id to tf_Ġzygomatic
dtypes: float32(29772), int16(2), int32(5), int8(1), object(7)
memory usage: 4.3+ GB


In [4]:
features = []
prefixes = ["ch_", "ws_", "ts_", "va_", "tf_"]
for col in df.columns:
    for prefix in prefixes:
        if col.startswith(prefix):
            features.append(col)
features.sort()
print(f"{len(features)} features\n{features[:100]}")

29775 features
['ch_digit_frac', 'ch_len', 'ch_letter_frac', 'ch_punc_frac', 'ch_repeat_char_frac', 'ch_space_frac', 'ch_upper_frac', 'tf_0', 'tf_00', 'tf_000', 'tf_03', 'tf_1', 'tf_10', 'tf_11', 'tf_12', 'tf_13', 'tf_14', 'tf_15', 'tf_16', 'tf_17', 'tf_18', 'tf_19', 'tf_199', 'tf_1990', 'tf_2', 'tf_20', 'tf_200', 'tf_2002', 'tf_21', 'tf_23', 'tf_24', 'tf_25', 'tf_27', 'tf_28', 'tf_3', 'tf_30', 'tf_31', 'tf_32', 'tf_33', 'tf_34', 'tf_38', 'tf_39', 'tf_4', 'tf_40', 'tf_41', 'tf_43', 'tf_45', 'tf_5', 'tf_50', 'tf_538', 'tf_58', 'tf_6', 'tf_60', 'tf_62', 'tf_7', 'tf_70', 'tf_74', 'tf_76', 'tf_79', 'tf_8', 'tf_87', 'tf_9', 'tf_a', 'tf_aa', 'tf_aae', 'tf_aage', 'tf_aaion', 'tf_ab', 'tf_aban', 'tf_abe', 'tf_abel', 'tf_aber', 'tf_abet', 'tf_abeth', 'tf_abil', 'tf_abilites', 'tf_abilitie', 'tf_abilities', 'tf_ability', 'tf_abill', 'tf_abilty', 'tf_abitable', 'tf_abital', 'tf_abl', 'tf_able', 'tf_abled', 'tf_ables', 'tf_abling', 'tf_ablish', 'tf_ablished', 'tf_ablities', 'tf_ablity', 'tf_ably',

In [5]:
#X_train, X_test, y_train, y_test = train_test_split(tra[features], tra[label], test_size=0.2)

tra = df[df["white_sim"]>=0.45]
val = df[df["white_sim"]<0.45]
t = len(tra)
v = len(val)
n = t+v
print(f"val%={v/n:.4f}, len(tra)={t:,}, len(val)={v:,}")
dtrain = xgb.DMatrix(tra[features], tra[label], enable_categorical=False)
dval = xgb.DMatrix(val[features], val[label], enable_categorical=False)
pdx.value_counts(val[label])

val%=0.0293, len(tra)=37,974, len(val)=1,146


Unnamed: 0_level_0,count,percent
generated,Unnamed: 1_level_1,Unnamed: 2_level_1
1,907,0.791449
0,239,0.208551


In [6]:
%%time
model = xgb.train(
   params={
       "objective": objective,
       "learning_rate": 5e-2,
       "min_child_weight": 20,
       "colsample_bytree": 0.5,
       "max_depth": 6,
   },
   dtrain=dtrain,
   num_boost_round=2000,
   evals=[(dtrain, "train"), (dval, "val")],
   verbose_eval=40,
   early_stopping_rounds=100,
)
print(f"best score {model.best_score:.5f} at iteration {model.best_iteration}")
model.save_model(f"{job_dir}/model.json")

[0]	train-logloss:0.51395	val-logloss:1.07884
[40]	train-logloss:0.12450	val-logloss:0.36778
[80]	train-logloss:0.05951	val-logloss:0.22819
[120]	train-logloss:0.03711	val-logloss:0.17022
[160]	train-logloss:0.02651	val-logloss:0.13479
[200]	train-logloss:0.02038	val-logloss:0.11529
[240]	train-logloss:0.01619	val-logloss:0.10146
[280]	train-logloss:0.01334	val-logloss:0.09335
[320]	train-logloss:0.01129	val-logloss:0.08700
[360]	train-logloss:0.00969	val-logloss:0.08254
[400]	train-logloss:0.00849	val-logloss:0.07864
[440]	train-logloss:0.00752	val-logloss:0.07500
[480]	train-logloss:0.00679	val-logloss:0.07311
[520]	train-logloss:0.00621	val-logloss:0.07109
[560]	train-logloss:0.00575	val-logloss:0.07022
[600]	train-logloss:0.00536	val-logloss:0.06864
[640]	train-logloss:0.00504	val-logloss:0.06831
[680]	train-logloss:0.00476	val-logloss:0.06800
[720]	train-logloss:0.00452	val-logloss:0.06790
[760]	train-logloss:0.00433	val-logloss:0.06766
[800]	train-logloss:0.00415	val-logloss:0.06

In [7]:
%%time
y_true = val[label].tolist()
y_pred = model.predict(data=dval, iteration_range=(0, model.best_iteration+1))
auc = roc_auc_score(y_true, y_pred, average="macro")
print(f"auc={auc:.4f}")
print(f"y_pred={y_pred.shape}\n{y_pred[:5]}")

auc=0.9978
y_pred=(1146,)
[0.9499928  0.97822237 0.99891245 0.00628967 0.99998033]
CPU times: user 312 ms, sys: 400 ms, total: 712 ms
Wall time: 59.1 ms


In [8]:
%%time
scores = model.get_score(importance_type="gain")
assert len(scores)!=0
rows = []
for feature, score in scores.items():
    rows.append({'importance': score, 'feature': feature})
idf = pd.DataFrame.from_records(rows)
idf = idf.sort_values(["importance"], ascending=False, ignore_index=True)
fp = f"{job_dir}/importance.csv"
idf.to_csv(fp, index=True)
print(f"Saved {fp}")
idf.T.head()

Saved models/xgb/20240120_041933/importance.csv
CPU times: user 21.2 ms, sys: 38.7 ms, total: 59.9 ms
Wall time: 5.05 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317
importance,535.333862,491.513977,487.654358,362.805176,333.964478,332.914612,226.789703,219.15686,214.638794,211.520065,179.099335,171.318848,149.939621,144.979034,138.28273,124.150146,121.580658,115.805115,114.466164,108.677856,101.883476,101.435951,98.002876,84.509155,76.646042,76.45359,75.889709,69.939545,66.292404,66.05307,65.250473,63.459717,62.241436,56.666737,54.976513,54.656471,53.885494,53.665058,53.12447,52.420853,52.350555,51.53833,50.223473,49.986206,49.040386,47.444031,46.485104,46.267811,40.464008,40.103317,39.966915,38.075565,37.707947,37.261292,36.566113,36.080936,35.468941,33.705811,33.141094,30.172977,29.95191,29.405884,29.22114,28.309143,27.806349,27.620529,26.203878,26.058992,25.645138,25.579153,25.304052,24.991737,24.953922,24.29339,23.767916,22.674658,22.334621,21.925852,21.656654,21.583004,21.053091,20.341541,20.336012,19.925665,19.696354,19.651701,19.599421,19.536987,19.467497,19.375626,18.577335,18.273558,18.098648,18.076061,17.982529,17.886499,17.813538,17.708664,17.696554,17.532536,17.430914,17.187277,17.073383,16.762617,16.444614,16.400848,16.220268,16.089945,16.070419,15.904037,15.884532,15.785007,15.684215,14.927987,14.50132,14.360294,14.077721,14.069798,14.040625,13.981558,13.925736,13.558373,13.070453,13.039894,13.004128,12.855943,12.818004,12.804463,12.793447,12.426529,12.19511,12.099344,12.074789,12.037127,11.92334,11.868005,11.733463,11.649146,11.290364,11.227482,10.756605,10.752293,10.73936,10.539305,10.441528,10.283181,10.095117,9.960876,9.8633,9.83662,9.826649,9.656018,9.445007,9.382985,9.347935,9.164958,9.125072,8.971872,8.706679,8.687664,8.57763,8.3493,8.201652,7.949665,7.875024,7.80212,7.797144,7.544501,7.445221,7.244454,7.150042,6.944552,6.941762,6.930393,6.874111,6.838174,6.832476,6.773543,6.651915,6.580213,6.485192,6.468338,6.456057,6.327619,6.102661,6.045862,6.010983,5.981054,5.956406,5.908396,5.868488,5.848146,5.813245,5.774518,5.641491,5.617348,5.562448,5.496105,5.446288,4.977261,4.940655,4.803391,4.802198,4.739406,4.701181,4.675903,4.542944,4.366622,4.336646,4.335607,4.310486,4.16504,4.124278,4.074976,4.058128,4.047112,4.033809,3.958862,3.883852,3.794348,3.684112,3.624991,3.612345,3.540681,3.475764,3.435429,3.407918,3.379809,3.342614,3.306682,3.263934,3.259813,3.166806,3.154149,3.078102,2.938603,2.877229,2.867808,2.849793,2.829164,2.787444,2.77756,2.739072,2.726932,2.602082,2.598131,2.557087,2.551506,2.533246,2.490699,2.484266,2.471044,2.448205,2.415814,2.31446,2.101922,2.099108,2.086792,2.078264,1.99505,1.982076,1.962839,1.936546,1.923023,1.905253,1.903507,1.870733,1.86094,1.827483,1.821375,1.767868,1.741594,1.710009,1.684256,1.683862,1.640992,1.616394,1.573892,1.500577,1.498889,1.496079,1.44943,1.437862,1.391529,1.379298,1.369891,1.338372,1.248961,1.240137,1.197424,1.115942,1.110137,1.07143,1.045055,1.04131,1.03263,1.030561,1.013166,0.980068,0.973082,0.916443,0.817195,0.816088,0.770794,0.759886,0.702525,0.693356,0.686387,0.643064,0.625346,0.590192,0.586884,0.576589,0.562961,0.556713,0.546316,0.301447,0.283508
feature,ts_polysyllable_frac,ts_syllables_per_word,tf_Ġhey,tf_Ġperformance,tf_Ġsuper,ch_space_frac,tf_Ġessay,tf_Ġensures,ts_flesch_kincaid_grade,tf_Ġgoals,tf_Ġessential,tf_Ġtotally,tf_Ġbecause,tf_th,tf_Ġadditionally,tf_Ġultimately,tf_Ġwould,tf_Ġimportance,tf_Ġgrader,tf_Ġstuff,tf_Ġfirstly,tf_Ġnt,tf_Ġvery,tf_Ġattempt,ch_punc_frac,ts_coleman_liau_index,tf_Ġsustainable,tf_Ġsuccess,tf_Ġconclusion,ch_letter_frac,tf_Ġchina,tf_Ġpresident,tf_Ġand,tf_Ġfacial,tf_Ġvenus,tf_Ġthank,ws_sent_len_std,tf_Ġimportant,ts_smog_index,tf_Ġlike,tf_Ġaddress,tf_Ġeurope,tf_Ġanimals,tf_Ġperspectives,tf_Ġcomputer,tf_Ġdriverless,ws_sent_len_delta_mean,tf_Ġ8,ts_lexicon_count,tf_Ġteacher,tf_Ġpotential,tf_Ġextracurricular,tf_Ġdear,ch_digit_frac,tf_Ġinformed,tf_Ġsignificant,tf_Ġseagoing,tf_Ġachieve,tf_Ġcar,ws_sent_len_delta_std,tf_Ġalthough,tf_Ġnasa,tf_Ġelectors,tf_Ġphone,tf_Ġthen,tf_Ġadvantages,tf_Ġsmaller,tf_Ġwriting,tf_Ġactivity,tf_Ġhumans,tf_Ġsecondly,tf_Ġcool,ts_monosyllable_frac,tf_Ġplus,tf_Ġskills,tf_Ġhuman,tf_Ġus,tf_Ġprovide,tf_Ġoverall,tf_Ġensure,tf_Ġexperiences,ts_dale_chall_readability_score,tf_Ġalmost,tf_Ġparagraph,tf_Ġprobably,tf_Ġdo,tf_Ġdriving,tf_Ġcareer,ts_automated_readability_index,tf_Ġreducing,tf_Ġcars,tf_Ġunique,tf_Ġhand,tf_Ġphones,tf_Ġlead,tf_Ġconsider,tf_Ġallows,ch_len,tf_Ġresources,tf_Ġleast,tf_Ġpercent,tf_Ġyou,tf_Ġeveryday,tf_Ġargue,tf_Ġtext,tf_Ġreduce,tf_Ġservice,ts_sentence_count,tf_Ġsmog,tf_Ġlet,tf_Ġthe,tf_Ġfurthermore,tf_Ġfair,tf_Ġgo,tf_Ġgrade,tf_Ġday,tf_Ġmany,tf_Ġprincipal,ts_difficult_words,tf_Ġcommunity,tf_Ġdifficult,tf_Ġsense,tf_Ġstates,ts_gunning_fog,tf_Ġexplore,tf_Ġagree,va_valence_mean,tf_Ġmight,tf_Ġso,tf_Ġthough,tf_Ġwill,tf_Ġearth,tf_Ġmost,tf_Ġfinally,tf_Ġcomputers,tf_Ġsincerely,tf_Ġexperience,tf_Ġtrue,tf_Ġstudents,tf_Ġreally,tf_Ġhealth,tf_Ġi,tf_Ġschool,tf_Ġlearn,tf_Ġpublic,tf_Ġtechnology,tf_Ġsupport,tf_Ġdue,tf_Ġmy,tf_Ġtransportation,ts_spache_readability,tf_Ġif,tf_Ġrequired,tf_Ġstudent,tf_Ġcould,ch_upper_frac,tf_Ġimpact,tf_Ġname,tf_Ġafter,va_arousal_mean,tf_Ġbenefits,tf_Ġboth,ts_words_per_sent,tf_Ġwhat,tf_Ġshould,tf_Ġmuch,tf_Ġreasons,tf_Ġmeans,tf_Ġlimiting,ts_syllable_count,tf_Ġnot,tf_Ġour,ts_syllables_per_sent,tf_Ġyour,ts_flesch_reading_ease,tf_Ġam,tf_Ġpoint,tf_Ġit,tf_Ġthey,tf_Ġknow,tf_Ġat,tf_Ġget,va_dominance_mean,tf_Ġfocus,tf_Ġown,ws_sent_len_mean,tf_Ġeverything,tf_Ġkids,tf_Ġwe,tf_Ġhard,tf_Ġover,tf_Ġhome,tf_Ġthink,tf_Ġmay,tf_Ġcan,tf_Ġstudying,tf_Ġthing,tf_Ġtwo,tf_Ġoften,tf_Ġexample,tf_Ġsaid,tf_Ġall,tf_Ġwhile,tf_Ġfirst,ts_mcalpine_eflaw,tf_Ġsports,tf_Ġhelping,tf_Ġwas,tf_Ġdone,tf_Ġlast,tf_Ġfeedback,tf_Ġbelieve,tf_Ġjust,tf_Ġan,tf_Ġpeople,tf_Ġanother,tf_Ġwhy,tf_Ġwho,tf_Ġreason,tf_Ġlearning,tf_Ġschools,tf_Ġone,tf_Ġno,tf_Ġkeep,tf_Ġbut,tf_Ġgoing,tf_Ġabout,tf_Ġwere,tf_Ġthan,tf_Ġstay,tf_Ġput,tf_Ġgood,va_valence_std,tf_Ġbe,tf_Ġmake,tf_Ġalways,tf_Ġnow,tf_Ġhow,tf_Ġout,tf_Ġis,tf_Ġthis,tf_Ġwith,tf_Ġto,tf_Ġits,tf_Ġbeing,ts_linsear_write_formula,tf_Ġcreate,tf_Ġsay,tf_Ġsure,tf_Ġperson,tf_Ġusing,tf_Ġdown,tf_Ġwhich,ch_repeat_char_frac,tf_Ġfor,tf_Ġsame,tf_Ġtake,tf_Ġare,tf_Ġmore,tf_Ġin,tf_Ġwhen,tf_Ġthere,tf_Ġcause,tf_Ġothers,tf_Ġfun,tf_Ġbenefit,tf_Ġthings,tf_Ġevery,tf_Ġdoing,va_dominance_std,tf_Ġtheir,tf_Ġlife,tf_Ġlives,tf_Ġs,tf_Ġduring,tf_Ġup,tf_Ġhave,tf_Ġpart,tf_Ġa,tf_Ġtime,tf_Ġonly,tf_Ġalso,tf_Ġthat,va_arousal_std,tf_Ġdifferent,tf_Ġaround,tf_Ġor,tf_Ġthem,tf_Ġon,tf_Ġinstead,tf_Ġsome,tf_Ġas,tf_Ġnew,tf_Ġby,tf_Ġof,tf_Ġwant,tf_Ġthese,tf_Ġeven,tf_Ġway,tf_Ġbetter,tf_Ġnumber,tf_Ġable,tf_Ġother,tf_Ġsee,tf_Ġwork,tf_Ġidea,tf_Ġfrom,tf_Ġsuch,tf_Ġany,tf_Ġlot,tf_Ġfeel,tf_Ġlook,tf_Ġfuture,tf_Ġhelp,tf_Ġwell,tf_Ġsomeone,tf_Ġgreat,tf_Ġtakes


In [9]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:17:29.462461
