In [1]:
import pandas as pd

# Kaggle GiveMeSomeCredit daat
df = pd.read_csv('./data/train.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from ngboost.distns import BetaBinom
from ngboost.scores import LogScore, CRPScore
import ngboost as ngb

In [3]:
features = [col for col in df.columns.tolist() if col not in ['Unnamed: 0', 'SeriousDlqin2yrs']]
cat_features = list(df[features].select_dtypes(exclude=['float64','int64']).columns)
for feature in cat_features:
    df[feature] = df[feature].astype('category').cat.codes

In [4]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)
print(train.shape)
print(test.shape)

X_train, X_val, Y_train, Y_val = train_test_split(train[features], train['SeriousDlqin2yrs'], test_size=0.05)
print(X_train.shape)
print(X_val.shape)

X_train2, X_val2, Y_train2, Y_val2 = train_test_split(X_train, Y_train, test_size=0.95)
print(X_train2.shape)
print(X_val2.shape)

(120000, 12)
(30000, 12)
(114000, 10)
(6000, 10)
(5700, 10)
(108300, 10)


In [5]:
learner = DecisionTreeRegressor(criterion='friedman_mse', max_depth=5)
base_model = ngb.NGBRegressor(
            Dist=BetaBinom, 
            Score=LogScore, 
            Base=learner, 
            n_estimators=2000, 
            learning_rate=0.01,
            col_sample=1.0,
            minibatch_frac=1.0)
base_model.fit(X_train2.fillna(-1).values, Y_train2.values, X_val=X_val.fillna(-1).values, Y_val=Y_val.values, early_stopping_rounds=2)

[iter 0] loss=0.2550 val_loss=0.2472 scale=2.0000 norm=3.3090
== Early stopping achieved.
== Best iteration / VAL0 (val_loss=0.2472)


NGBRegressor(Base=DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse',
                                        max_depth=5, max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=None,
                                        splitter='best'),
             Dist=<class 'ngboost.distns.betabinom.BetaBinom'>,
             Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
             learning_rate=0.01, minibatch_frac=1.0, n_estimators=2000,
             natural_gradient=True,
             random_state=RandomState(MT19937) at 0x1F3B3F3D990, tol=0.0001,
             verbose=True, verbose_eval=100)

In [6]:
pred_dist = base_model.pred_dist(X_train.fillna(-1).values)

In [7]:
pred_dist.ppf(0.5).shape

(114000,)

In [8]:
base_model.predict(X_train.fillna(-1).values)

array([0.07052632, 0.07052632, 0.07052632, ..., 0.07052632, 0.07052632,
       0.07052632])