-
Notifications
You must be signed in to change notification settings - Fork 3
/
xgboost.clj
489 lines (416 loc) · 21.8 KB
/
xgboost.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
(ns scicloj.ml.xgboost
"Require this namespace to get xgboost support for classification and regression.
Defines a full range of xgboost model definitions and supports xgboost explain
functionality."
(:require [tech.v3.datatype :as dtype]
[tech.v3.datatype.errors :as errors]
[scicloj.metamorph.ml.loss :as loss]
[scicloj.metamorph.ml :as ml]
[scicloj.ml.xgboost.model :as model]
[scicloj.metamorph.ml.gridsearch :as ml-gs]
[tech.v3.dataset :as ds]
[tech.v3.dataset.tensor :as ds-tens]
[tech.v3.dataset.modelling :as ds-mod]
[tech.v3.dataset.utils :as ds-utils]
[tech.v3.tensor :as dtt]
[clojure.set :as set]
[clojure.string :as s]
[clojure.tools.logging :as log])
(:import [ml.dmlc.xgboost4j.java Booster XGBoost XGBoostError DMatrix]
[ml.dmlc.xgboost4j LabeledPoint]
[smile.util SparseArray SparseArray$Entry]
[java.util Iterator UUID LinkedHashMap Map]
[java.io ByteArrayInputStream ByteArrayOutputStream]))
;; Emit compiler warnings whenever Java interop falls back to reflection --
;; important for the dataset->LabeledPoint conversion paths below, which run
;; once per training row.
(set! *warn-on-reflection* true)
;; Registry of supported xgboost objectives, keyed by model-type keyword.
;; Each entry carries the xgboost `objective` parameter string and, for
;; :linear-regression, a vector of {:name :description} option docs that is
;; surfaced through `ml/define-model!` below.
;; NOTE(review): the option :name values mix keywords (:eta) and strings
;; ("subsample") -- preserved as-is since consumers may rely on either form;
;; consider normalizing in a follow-up.
(def objective-types
  {:linear-regression
   {:objective "reg:linear"
    :options [{:name :eta
               :description "Step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features, and eta shrinks the feature weights to make the boosting process more conservative."}
              {:name :gamma
               :description "Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be."}
              {:name :max-depth
               :description "Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 is only accepted in lossguide growing policy when tree_method is set as hist or gpu_hist and it indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree."}
              {:name :min-child-weight
               :description "Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression task, this simply corresponds to minimum number of instances needed to be in each node. The larger min_child_weight is, the more conservative the algorithm will be."}
              ;; BUGFIX: was "max_delta_step " (trailing space) which breaks
              ;; name-based lookups against this option metadata.
              {:name "max_delta_step"
               :description "Maximum delta step we allow each leaf output to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update."}
              {:name "subsample"
               :description "Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. and this will prevent overfitting. Subsampling will occur once in every boosting iteration."}
              {:name "sampling_method"
               :description "The method to use to sample the training instances.\nuniform: each training instance has an equal probability of being selected. Typically set subsample >= 0.5 for good results.\n
gradient_based: the selection probability for each training instance is proportional to the regularized absolute value of gradients (more specifically, ).
subsample may be set to as low as 0.1 without loss of model accuracy. Note that this sampling method is only supported when tree_method is set to gpu_hist; other tree methods only support uniform sampling.
"}
              {:name "colsample_bytree"
               :description ""}
              {:name "colsample_bylevel"
               :description ""}
              {:name "colsample_bynode"
               :description ""}
              {:name "lambda"
               :description "L2 regularization term on weights. Increasing this value will make model more conservative."}
              {:name "alpha"
               :description "L1 regularization term on weights. Increasing this value will make model more conservative."}
              {:name "tree_method"
               :description ""}
              {:name "sketch_eps"
               :description ""}
              {:name "scale_pos_weight"
               :description ""}
              {:name "updater"
               :description ""}
              {:name "refresh_leaf"
               :description ""}
              {:name "process_type"
               :description ""}
              {:name "grow_policy"
               :description ""}
              {:name "max_leaves"
               :description ""}
              {:name "max_bin"
               :description ""}
              {:name "predictor"
               :description ""}
              {:name "num_parallel_tree"
               :description ""}
              {:name "monotone_constraints"
               :description ""}
              {:name "interaction_constraints"
               :description ""}]}
   :squared-error-regression {:objective "reg:squarederror"}
   :logistic-regression {:objective "reg:logistic"}
   ;;logistic regression for binary classification
   :logistic-binary-classification {:objective "binary:logistic"}
   ;; logistic regression for binary classification, output score before logistic
   ;; transformation
   :logistic-binary-raw-classification {:objective "binary:logitraw"}
   ;;hinge loss for binary classification. This makes predictions of 0 or 1, rather
   ;;than producing probabilities.
   :binary-hinge-loss {:objective "binary:hinge"}
   ;; versions of the corresponding objective functions evaluated on the GPU; note that
   ;; like the GPU histogram algorithm, they can only be used when the entire training
   ;; session uses the same dataset
   :gpu-linear-regression {:objective "gpu:reg:linear"}
   :gpu-logistic-regression {:objective "gpu:reg:logistic"}
   :gpu-binary-logistic-classification {:objective "gpu:binary:logistic"}
   :gpu-binary-logistic-raw-classification {:objective "gpu:binary:logitraw"}
   ;; poisson regression for count data, output mean of poisson distribution
   ;; max_delta_step is set to 0.7 by default in poisson regression (used to safeguard
   ;; optimization)
   :count-poisson {:objective "count:poisson"}
   ;; Cox regression for right censored survival time data (negative values are
   ;; considered right censored). Note that predictions are returned on the hazard ratio
   ;; scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function
   ;; h(t) = h0(t) * HR).
   :survival-cox {:objective "survival:cox"}
   ;; set XGBoost to do multiclass classification using the softmax objective, you also
   ;; need to set num_class(number of classes)
   :multiclass-softmax {:objective "multi:softmax"}
   ;; same as softmax, but output a vector of ndata * nclass, which can be further
   ;; reshaped to ndata * nclass matrix. The result contains predicted probability of
   ;; each data point belonging to each class.
   :multiclass-softprob {:objective "multi:softprob"}
   ;; Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
   :rank-pairwise {:objective "rank:pairwise"}
   ;; Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative
   ;; Gain (NDCG) is maximized
   :rank-ndcg {:objective "rank:ndcg"}
   ;; Use LambdaMART to perform list-wise ranking where Mean Average Precision (MAP) is
   ;; maximized
   :rank-map {:objective "rank:map"}
   ;; gamma regression with log-link. Output is a mean of gamma distribution. It might
   ;; be useful, e.g., for modeling insurance claims severity, or for any outcome that
   ;; might be gamma-distributed.
   :gamma-regression {:objective "reg:gamma"}
   ;; Tweedie regression with log-link. It might be useful, e.g., for modeling total
   ;; loss in insurance, or for any outcome that might be Tweedie-distributed.
   :tweedie-regression {:objective "reg:tweedie"}})
(defmulti ^:private model-type->xgboost-objective
  "Map a model-type keyword to its xgboost objective string.
  Throws ex-info for unknown model types."
  identity)

(defmethod model-type->xgboost-objective :default
  [model-type]
  ;; Any key present in objective-types resolves directly; anything else
  ;; is an error listing the valid choices.
  (or (get-in objective-types [model-type :objective])
      (throw (ex-info "Unrecognized xgboost model type"
                      {:model-type model-type
                       :possible-types (keys objective-types)}))))

;; Convenience aliases for the generic metamorph model types.
(defmethod model-type->xgboost-objective :regression
  [_model-type]
  (model-type->xgboost-objective :squared-error-regression))

(defmethod model-type->xgboost-objective :binary-classification
  [_model-type]
  (model-type->xgboost-objective :logistic-binary-classification))

(defmethod model-type->xgboost-objective :classification
  [_model-type]
  (model-type->xgboost-objective :multiclass-softprob))
(defn- sparse->labeled-point
  "Convert one smile SparseArray row into an xgboost LabeledPoint with the
  given float label and total (dense) column count."
  [^SparseArray sparse target n-sparse-columns]
  ;; iterator-seq caches, so walking entries twice only iterates once.
  (let [entries (iterator-seq (.iterator sparse))
        indices (int-array (map #(.i ^SparseArray$Entry %) entries))
        values  (float-array (map #(.x ^SparseArray$Entry %) entries))]
    (LabeledPoint. target n-sparse-columns indices values)))
(defn- sparse-feature->dmatrix
  "Build a DMatrix from a dataset whose features live in a single sparse
  column.  When target-ds is nil every label defaults to 0.0 (prediction)."
  [feature-ds target-ds sparse-column n-sparse-columns]
  (let [targets (or (get target-ds
                         (first (ds-mod/inference-target-column-names target-ds)))
                    (repeat 0.0))
        labeled-points (map (fn [features target]
                              (sparse->labeled-point features target n-sparse-columns))
                            (get feature-ds sparse-column)
                            targets)]
    (DMatrix. (.iterator ^Iterable labeled-points) nil)))
(defn- dataset->labeled-point-iterator
  "Create an iterable of LabeledPoints from a dense feature dataset and an
  optional single-column target dataset.  Labels default to 0.0 when
  target-ds is nil; throws when target-ds has more than one column."
  ^Iterable [feature-ds target-ds]
  (let [feature-tens (ds-tens/dataset->tensor feature-ds :float32)
        target-tens (when target-ds
                      (ds-tens/dataset->tensor target-ds :float32))
        labels (if target-tens
                 (dtype/->reader target-tens)
                 (repeat (float 0.0)))]
    (errors/when-not-errorf
     (or (not target-ds)
         (== 1 (ds/column-count target-ds)))
     "Multi-column regression/classification is not supported. Target ds has %d columns"
     (ds/column-count target-ds))
    (map (fn [row label]
           (LabeledPoint. (float label)
                          ;; first shape dim == number of features in the row
                          (first (dtype/shape row))
                          nil
                          (dtype/->float-array row)))
         feature-tens
         labels)))
(defn- dataset->dmatrix
  "Convert a dense feature dataset (and optional target dataset) into a
  DMatrix.  The single-arity form builds an unlabeled matrix for prediction."
  (^DMatrix [feature-ds target-ds]
   (-> (dataset->labeled-point-iterator feature-ds target-ds)
       (.iterator)
       (DMatrix. nil)))
  (^DMatrix [feature-ds]
   (dataset->dmatrix feature-ds nil)))
(defn- options->objective
  "Resolve the xgboost objective string from the :model-type option,
  falling back to :linear-regression when no model type is supplied."
  [options]
  (model-type->xgboost-objective
   (if-let [model-type (:model-type options)]
     ;; strip any namespace, e.g. :xgboost/classification -> :classification
     (keyword (name model-type))
     :linear-regression)))
(defn- multiclass-objective?
  "True when the objective string names one of the multiclass objectives
  (softmax or softprob)."
  [objective]
  (contains? #{"multi:softmax" "multi:softprob"} objective))
;; Default gridsearch space handed to every registered xgboost model.
;; Each value is a gridsearch generator: (ml-gs/linear start end n-steps [dtype]).
(def ^:private hyperparameters
  {:subsample (ml-gs/linear 0.7 1.0 3)          ;; row sampling ratio
   :scale-pos-weight (ml-gs/linear 0.7 1.31 6)  ;; positive-class weight balance
   :max-depth (ml-gs/linear 1 10 10 :int64)     ;; tree depth, integer-valued
   :lambda (ml-gs/linear 0.01 0.31 30)          ;; L2 regularization
   :gamma (ml-gs/linear 0.001 1 10)             ;; min split loss reduction
   :eta (ml-gs/linear 0 1 10)                   ;; learning rate
   :round (ml-gs/linear 5 46 5 :int64)          ;; boosting rounds, integer-valued
   :alpha (ml-gs/linear 0.01 0.31 30)})         ;; L1 regularization
(defn ->dmatrix
  "Convert feature/target datasets into a DMatrix, dispatching on whether the
  features are stored in a single sparse column or densely across columns."
  [feature-ds target-ds sparse-column n-sparse-columns]
  (if-not sparse-column
    (dataset->dmatrix feature-ds target-ds)
    (sparse-feature->dmatrix feature-ds target-ds sparse-column n-sparse-columns)))
(defn- train
  "Train an xgboost model.  Returns a map with :model-data (the serialized
  Booster bytes) and, when watches were supplied, a :metrics dataset of
  per-round evaluation values keyed by watch name.

  Recognized library-level options (not forwarded as xgboost params):
  :model-type :watches.  Remaining options are converted to xgboost
  parameter names (dashes become underscores) and merged over defaults."
  [feature-ds label-ds options]
  ;;XGBoost uses all cores so serialization here avoids over subscribing
  ;;the machine.
  (locking #'multiclass-objective?
    (let [objective (options->objective options)
          sparse-column-or-nil (:sparse-column options)
          train-dmat (->dmatrix feature-ds label-ds sparse-column-or-nil
                                (:n-sparse-columns options))
          base-watches (or (:watches options) {})
          feature-cnames (ds/column-names feature-ds)
          target-cnames (ds/column-names label-ds)
          watches (->> base-watches
                       (reduce (fn [^Map watches [k v]]
                                 (.put watches (ds-utils/column-safe-name k)
                                       (->dmatrix
                                        (ds/select-columns v feature-cnames)
                                        (ds/select-columns v target-cnames)
                                        sparse-column-or-nil
                                        (:n-sparse-columns options)))
                                 watches)
                               ;;Linked hash map to preserve order
                               (LinkedHashMap.)))
          round (or (:round options) 25)
          early-stopping-round (or (when (:early-stopping-round options)
                                     (int (:early-stopping-round options)))
                                   0)
          _ (when (and (> (count watches) 1)
                       (not (instance? LinkedHashMap (:watches options)))
                       (not= 0 early-stopping-round))
              (log/warn "Early stopping indicated but watches has undefined iteration order.
Early stopping will always use the 'last' of the watches as defined by the iteration
order of the watches map. Consider using a java.util.LinkedHashMap for watches.
https://github.com/dmlc/xgboost/blob/master/jvm-packages/xgboost4j/src/main/java/ml/dml
c/xgboost4j/java/XGBoost.java#L208"))
          ;; idx -> original (un-sanitized) watch name, for the metrics dataset
          watch-names (->> base-watches
                           (map-indexed (fn [idx [k _v]]
                                          [idx k]))
                           (into {}))
          label-map (when (multiclass-objective? objective)
                      (ds-mod/inference-target-label-map label-ds))
          ;; BUGFIX: the old code merged `options` back in *after* dissoc-ing
          ;; :model-type and :watches, which re-introduced both keys (including
          ;; the watches dataset map) into the xgboost parameter map.  The
          ;; threaded map already carries every user option except those two,
          ;; and ->> threads it last so it overrides the defaults.
          ;; NOTE(review): :sparse-column/:n-sparse-columns/:round/
          ;; :early-stopping-round still pass through as params, as before --
          ;; confirm whether xgboost4j tolerates or should also exclude them.
          params (->> (-> (dissoc options :model-type :watches)
                          (assoc :objective objective))
                      ;;Adding in some defaults
                      (merge {:alpha 0.0
                              :eta 0.3
                              :lambda 1.0
                              :max-depth 6
                              :scale-pos-weight 1.0
                              :subsample 0.87
                              :silent 1}
                             (when label-map
                               {:num-class (count label-map)}))
                      ;; xgboost expects snake_case string keys; nil values dropped
                      (map (fn [[k v]]
                             (when v
                               [(s/replace (name k) "-" "_") v])))
                      (remove nil?)
                      (into {}))
          ;; one float[round] buffer per watch; XGBoost/train fills them in
          ^"[[F" metrics-data (when-not (empty? watches)
                                (->> (repeatedly (count watches)
                                                 #(float-array round))
                                     (into-array)))
          ^Booster model (XGBoost/train train-dmat params
                                        (long round)
                                        watches metrics-data nil nil
                                        (int early-stopping-round))
          out-s (ByteArrayOutputStream.)]
      (.saveModel model out-s)
      (merge
       {:model-data (.toByteArray out-s)}
       (when (seq watches)
         {:metrics
          (->> watches
               (map-indexed vector)
               (map (fn [[watch-idx [_watch-name _watch-data]]]
                      [(get watch-names watch-idx)
                       (aget metrics-data watch-idx)]))
               (into {})
               (ds/->>dataset {:dataset-name :metrics}))})))))
(defn- thaw-model
  "Reconstitute a Booster from serialized model bytes.  Accepts either the
  raw byte array or the training result map holding it under :model-data."
  [model-data]
  (let [model-bytes (if (map? model-data)
                      (:model-data model-data)
                      model-data)]
    (XGBoost/loadModel (ByteArrayInputStream. model-bytes))))
(defn- predict
  "Predict with a thawed Booster.  For multiclass objectives returns a
  dataset of per-class probability distributions plus a label column tagged
  as :column-type :prediction; otherwise returns a regression dataset."
  [feature-ds thawed-model {:keys [target-columns target-categorical-maps options]}]
  (let [sparse-column-or-nil (:sparse-column options)
        dmatrix (->dmatrix feature-ds nil sparse-column-or-nil
                           (:n-sparse-columns options))
        prediction (.predict ^Booster thawed-model dmatrix)
        predict-tensor (dtt/->tensor prediction)
        target-cname (first target-columns)]
    (if (multiclass-objective? (options->objective options))
      (-> (model/finalize-classification predict-tensor
                                         (ds/row-count feature-ds)
                                         target-cname
                                         target-categorical-maps)
          ;; consistency fix: use the ds-mod alias (same namespace as the
          ;; previous fully-qualified call) and the bound target-cname
          (ds-mod/probability-distributions->label-column target-cname)
          (ds/update-column target-cname
                            #(vary-meta % assoc :column-type :prediction)))
      (model/finalize-regression predict-tensor target-cname))))
(defn- explain
  "Return a dataset of feature importances for a trained Booster, sorted
  descending by the requested importance-type (default \"gain\")."
  [thawed-model {:keys [feature-columns options]}
   {:keys [importance-type]
    :or {importance-type "gain"}}]
  (let [^Booster booster thawed-model
        sparse-column (:sparse-column options)]
    (if sparse-column
      ;; sparse models: xgboost reports scores by raw feature index
      (let [scores (.getScore booster "" (str importance-type))]
        (ds/->dataset {:feature (keys scores)
                       (keyword importance-type) (vals scores)}))
      (let [name->safe (into {}
                             (map (fn [col-name]
                                    [col-name (ds-utils/column-safe-name col-name)]))
                             feature-columns)
            safe-names (into-array String (map name->safe feature-columns))
            ^Map scores (.getScore booster
                                   ^"[Ljava.lang.String;" safe-names
                                   ^String importance-type)
            safe->name (set/map-invert name->safe)]
        ;;It's not a great map...Something is off about iteration so I have
        ;;to transform it back into something sane.
        (->> (keys scores)
             (map (fn [safe-name]
                    {:importance-type importance-type
                     :colname (get safe->name safe-name)
                     (keyword importance-type) (.get scores safe-name)}))
             (sort-by (keyword importance-type) >)
             (ds/->>dataset))))))
;; Register one metamorph model per objective (plus the generic :regression
;; and :classification aliases) under the :xgboost namespace.
(doseq [model-kind (concat [:regression :classification]
                           (keys objective-types))]
  (let [objective-def (objective-types model-kind)]
    (ml/define-model! (keyword "xgboost" (name model-kind))
      train predict
      {:thaw-fn thaw-model
       :explain-fn explain
       :options (:options objective-def)
       :hyperparameters hyperparameters
       :documentation {:javadoc "https://xgboost.readthedocs.io/en/latest/jvm/javadocs/index.html"
                       :user-guide "https://xgboost.readthedocs.io/en/latest/jvm/index.html"}})))
;; REPL scratchpad: end-to-end examples on the iris and titanic datasets.
;; Never evaluated at load time (wrapped in `comment`).
(comment
  (require '[tech.v3.dataset.column-filters :as cf])

  ;; --- iris: 3-class classification ---------------------------------------
  (def src-ds (ds/->dataset "test/data/iris.csv"))
  (def ds (-> src-ds
              (ds/categorical->number cf/categorical)
              (ds-mod/set-inference-target "species")))
  (def feature-ds (cf/feature ds))
  (def split-data (ds-mod/train-test-split ds))
  (def train-ds (:train-ds split-data))
  (def test-ds (:test-ds split-data))
  (def model (ml/train train-ds {:validate-parameters 1
                                 :round 10
                                 :silent 0
                                 :verbosity 3
                                 :model-type :xgboost/classification}))
  (def predictions (ml/predict test-ds model))
  (ml/explain model)
  (require '[tech.v3.ml.loss :as loss])
  (require '[tech.v3.dataset.categorical :as ds-cat])
  (loss/classification-accuracy (predictions "species")
                                (test-ds "species"))
  ;;0.93333

  ;; --- titanic: binary classification -------------------------------------
  (def titanic (-> (ds/->dataset "test/data/titanic.csv")
                   (ds/drop-columns ["Name"])
                   ;; recode 0/1 into readable string labels
                   (ds/update-column "Survived" (fn [col]
                                                  (dtype/emap #(if (== 1 (long %))
                                                                 "survived"
                                                                 "drowned")
                                                              :string col)))
                   (ds-mod/set-inference-target "Survived")))
  (def titanic-numbers (ds/categorical->number titanic cf/categorical))
  (def split-data (ds-mod/train-test-split titanic-numbers))
  (def train-ds (:train-ds split-data))
  (def test-ds (:test-ds split-data))
  (def model (ml/train train-ds {:model-type :xgboost/classification}))
  (def predictions (ml/predict test-ds model))
  (loss/classification-accuracy (predictions "Survived")
                                (test-ds "Survived"))
  ;;0.8195488721804511
  ;;0.8308270676691729

  ;; --- hyperparameter gridsearch over the titanic model --------------------
  (require '[tech.v3.ml.gridsearch :as ml-gs])
  (def opt-map (merge {:model-type :xgboost/classification}
                      hyperparameters))
  (def options-sequence (take 200 (ml-gs/sobol-gridsearch opt-map)))
  ;; train/evaluate one option set, attaching its loss
  (defn test-options
    [options]
    (let [model (ml/train train-ds options)
          predictions (ml/predict test-ds model)
          loss (loss/classification-loss (predictions "Survived")
                                         (test-ds "Survived"))]
      (assoc model :loss loss)))
  (def models
    (->> (map test-options options-sequence)
         (sort-by :loss)
         (take 10)
         (map #(select-keys % [:loss :options])))))
;;consistently gets .849 or so accuracy on best models.