# XGBoost with Poisson objective in Python: adding an offset to deal with low frequencies

The following is taken from this Stack Overflow question:

https://stackoverflow.com/questions/46698872/poisson-regression-in-xgboost-fails-for-low-frequencies

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
def get_preds(mult):
    """Fit a Poisson booster on a 4-row toy dataset and return rescaled fits.

    The four observations carry base frequencies 1, 2, 3 and 4, each
    multiplied by ``mult`` to form the training labels. Every row's base
    margin is set to the log of the mean scaled frequency, which serves as
    the offset term.

    Args:
        mult: Factor applied to the base frequencies to build the labels.

    Returns:
        The booster's predictions on the training rows, divided by ``mult``
        so they are back on the unscaled frequency scale.
    """
    # toy design: two binary features, one row per combination
    features = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    # linearly increasing frequencies, scaled by `mult`
    scaled_freqs = [freq * mult for freq in [1, 2, 3, 4]]
    n_obs = len(scaled_freqs)

    dmat = xgb.DMatrix(
        data=features,
        label=scaled_freqs,
        weight=[1000] * n_obs,
    )

    ## offset term: log of the mean (scaled) frequency, one value per row
    offset = np.log(np.mean(scaled_freqs))
    dmat.set_base_margin(np.repeat(offset, n_obs))

    # Poisson booster evaluated on its own training data; the large round
    # cap is effectively bounded by early stopping
    bst = xgb.train(
        params={"objective": "count:poisson"},
        dtrain=dmat,
        evals=[(dmat, "train")],
        num_boost_round=100000,
        early_stopping_rounds=5,
        verbose_eval=False,
    )

    # undo the label scaling so results are comparable across multipliers
    fitted = bst.predict(dmat)
    return fitted / mult

In [3]:
# test multipliers 10**(-8), 10**(-7), ..., 10**0 (range(-8, 1) stops at 0)
# display fitted frequencies, one row per multiplier and one column per
# feature combination
mults = [10**i for i in range(-8, 1)]
## round to 1 decimal point to show the result approaches 2.5
df = pd.DataFrame(
    data=np.vstack([get_preds(m) for m in mults]).round(1),
    index=mults,
    columns=["(0, 0)", "(0, 1)", "(1, 0)", "(1, 1)"],
)
df

Unnamed: 0,"(0, 0)","(0, 1)","(1, 0)","(1, 1)"
1e-08,2.5,2.5,2.5,2.5
1e-07,2.5,2.5,2.5,2.5
1e-06,2.5,2.5,2.5,2.5
1e-05,2.5,2.5,2.5,2.5
0.0001,2.4,2.5,2.5,2.6
0.001,1.0,2.0,3.0,4.0
0.01,1.0,2.0,3.0,4.0
0.1,1.0,2.0,3.0,4.0
1.0,1.0,2.0,3.0,4.0
