In [1]:
#https://www.kaggle.com/xiaozhouwang/2nd-place-lightgbm-solution
import lightgbm as lgbm
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [42]:
def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # sort rows on prediction column
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) * 1. / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1. / np.sum(pred_order)
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred * 1. / G_true

In [43]:
cv_only = True
save_cv = True
full_train = False

In [44]:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', Gini(labels, preds), True

path = "datasets/"
train = pd.read_csv(path+'train.csv')
train_label = train['target']
train_id = train['id']
test = pd.read_csv(path+'test.csv')
test_id = test['id']

In [45]:
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [46]:
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

y = train['target'].values
drop_feature = [
    'id',
    'target'
]

X = train.drop(drop_feature,axis=1)
feature_names = X.columns.tolist()
cat_features = [c for c in feature_names if ('cat' in c and 'count' not in c)]
num_features = [c for c in feature_names if ('cat' not in c and 'calc' not in c)]

#list how many items are missing per row
train['missing'] = (train==-1).sum(axis=1).astype(float)
test['missing'] = (test==-1).sum(axis=1).astype(float)
num_features.append('missing')

In [47]:
train[['ps_ind_01','ps_ind_02_cat','missing']].head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,missing
0,2,2,1.0
1,1,1,2.0
2,5,4,3.0
3,0,1,0.0
4,0,2,2.0


In [48]:
for c in cat_features:
    le = LabelEncoder()
    le.fit(train[c])
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

In [49]:
enc = OneHotEncoder()
enc.fit(train[cat_features])
X_cat = enc.transform(train[cat_features])
X_t_cat = enc.transform(test[cat_features])

In [50]:
# took features with ind in name and added back to back with string, such as 'ps_ind_03'
ind_features = [c for c in feature_names if 'ind' in c]
count=0
for c in ind_features:
    if count==0:
        train['new_ind'] = train[c].astype(str)+'_'
        test['new_ind'] = test[c].astype(str)+'_'
        count+=1
    else:
        train['new_ind'] += train[c].astype(str)+'_'
        test['new_ind'] += test[c].astype(str)+'_'

cat_count_features = []

In [51]:
train[ind_features].head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin
0,2,2,5,2,1,0,1,0,0,0,0,0,0,0,11,0,1,0
1,1,1,7,1,1,0,0,1,0,0,0,0,0,0,3,0,0,1
2,5,4,9,2,1,0,0,1,0,0,0,0,0,0,12,1,0,0
3,0,1,2,1,1,1,0,0,0,0,0,0,0,0,8,1,0,0
4,0,2,0,2,1,1,0,0,0,0,0,0,0,0,9,1,0,0


In [52]:
compare=ind_features
compare.append('new_ind')
train[compare].head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,new_ind
0,2,2,5,2,1,0,1,0,0,0,0,0,0,0,11,0,1,0,2_2_5_2_1_0_1_0_0_0_0_0_0_0_11_0_1_0_
1,1,1,7,1,1,0,0,1,0,0,0,0,0,0,3,0,0,1,1_1_7_1_1_0_0_1_0_0_0_0_0_0_3_0_0_1_
2,5,4,9,2,1,0,0,1,0,0,0,0,0,0,12,1,0,0,5_4_9_2_1_0_0_1_0_0_0_0_0_0_12_1_0_0_
3,0,1,2,1,1,1,0,0,0,0,0,0,0,0,8,1,0,0,0_1_2_1_1_1_0_0_0_0_0_0_0_0_8_1_0_0_
4,0,2,0,2,1,1,0,0,0,0,0,0,0,0,9,1,0,0,0_2_0_2_1_1_0_0_0_0_0_0_0_0_9_1_0_0_


In [53]:
# get value counts for each categorical and new ind column
for c in cat_features+['new_ind']:
    d = pd.concat([train[c],test[c]]).value_counts().to_dict()
    train['%s_count'%c] = train[c].apply(lambda x:d.get(x,0))
    test['%s_count'%c] = test[c].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%c)

    # take all numeric features, features like ps_car_03_cat_count
train_list = [train[num_features+cat_count_features].values,X_cat,]
test_list = [test[num_features+cat_count_features].values,X_t_cat,]

In [54]:
c='ps_car_03_cat'
train[c].apply(lambda x:d.get(x,0))

0         8228
1         8228
2         8228
3         6379
4         8228
5         8228
6         8228
7         6379
8         8228
9         6379
10        8228
11        8228
12        8228
13        6379
14        8228
15        8228
16        8228
17        7992
18        8228
19        8228
20        8228
21        8228
22        6379
23        8228
24        8228
25        8228
26        8228
27        8228
28        8228
29        8228
          ... 
595182    8228
595183    8228
595184    8228
595185    7992
595186    6379
595187    8228
595188    8228
595189    6379
595190    8228
595191    8228
595192    8228
595193    6379
595194    6379
595195    7992
595196    8228
595197    8228
595198    7992
595199    7992
595200    8228
595201    6379
595202    6379
595203    7992
595204    8228
595205    6379
595206    8228
595207    8228
595208    8228
595209    8228
595210    8228
595211    8228
Name: ps_car_03_cat, Length: 595212, dtype: int64

In [56]:
# cat count takes the
train[['ps_car_03_cat','ps_car_03_cat_count']].head()

Unnamed: 0,ps_car_03_cat,ps_car_03_cat_count
0,0,1028142
1,0,1028142
2,0,1028142
3,1,183044
4,0,1028142


In [22]:
train_list

[array([[  2.00000000e+00,   5.00000000e+00,   0.00000000e+00, ...,
           1.47546000e+06,   1.83260000e+04,   6.00000000e+00],
        [  1.00000000e+00,   7.00000000e+00,   0.00000000e+00, ...,
           1.47546000e+06,   1.25350000e+04,   3.60000000e+01],
        [  5.00000000e+00,   9.00000000e+00,   0.00000000e+00, ...,
           1.47546000e+06,   1.99430000e+04,   2.40000000e+01],
        ..., 
        [  1.00000000e+00,   1.00000000e+01,   1.00000000e+00, ...,
           1.47546000e+06,   1.31430000e+04,   3.82000000e+02],
        [  5.00000000e+00,   3.00000000e+00,   0.00000000e+00, ...,
           1.47546000e+06,   1.84160000e+04,   6.50000000e+01],
        [  0.00000000e+00,   8.00000000e+00,   1.00000000e+00, ...,
           1.47546000e+06,   1.17740000e+04,   6.99000000e+02]]),
 <595212x184 sparse matrix of type '<type 'numpy.float64'>'
 	with 8332968 stored elements in Compressed Sparse Row format>]

In [16]:
# turns sparse matrix to Compressed Sparse Row format
X = ssp.hstack(train_list).tocsr()
X_test = ssp.hstack(test_list).tocsr() #X_test is unknown X

In [17]:
X.shape

(595212, 223)

In [31]:
# training light gbm

learning_rate = 0.1
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000
params = {"objective": "binary",
          "boosting_type": "gbdt",
          "learning_rate": learning_rate,
          "num_leaves": num_leaves,
           "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": 2,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9
          }
x_score = []
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))

for s in xrange(16):
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    if cv_only:
        kf = kfold.split(X, train_label) #split randomly

        best_trees = []
        fold_scores = []

        for i, (train_fold, validate) in enumerate(kf): #split into 3 crossfolds
            X_train, X_validate, label_train, label_validate = \ #for each crossfold
                X[train_fold, :], X[validate, :], train_label[train_fold], train_label[validate]
            dtrain = lgbm.Dataset(X_train, label_train)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100,
                            early_stopping_rounds=100) # best iteration?? = best number of trees?
            best_trees.append(bst.best_iteration)
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration) # set best_iteration as number of trees?
            cv_train[validate] += bst.predict(X_validate) #predict validation set

            score = Gini(label_validate, cv_train[validate])
            print score
            fold_scores.append(score)

        cv_pred /= NFOLDS
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        print("cv score:")
        print Gini(train_label, cv_train)
        print "current score:", Gini(train_label, final_cv_train / (s + 1.)), s+1
        print(fold_scores)
        print(best_trees, np.mean(best_trees))

        x_score.append(Gini(train_label, cv_train))

print(x_score)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151577	valid_0's gini: 0.288901
[200]	valid_0's binary_logloss: 0.151349	valid_0's gini: 0.295167
[300]	valid_0's binary_logloss: 0.151361	valid_0's gini: 0.295042
Early stopping, best iteration is:
[200]	valid_0's binary_logloss: 0.151349	valid_0's gini: 0.295167
0.295167170263
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152171	valid_0's gini: 0.271694
[200]	valid_0's binary_logloss: 0.152098	valid_0's gini: 0.274733
[300]	valid_0's binary_logloss: 0.152157	valid_0's gini: 0.273878
Early stopping, best iteration is:
[203]	valid_0's binary_logloss: 0.152087	valid_0's gini: 0.275084
0.275084036478
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152128	valid_0's gini: 0.27766
[200]	valid_0's binary_logloss: 0.152029	valid_0's gini: 0.281412
[300]	valid_0's binary_logloss: 0.152069	valid_0's g

[300]	valid_0's binary_logloss: 0.152088	valid_0's gini: 0.277148
Early stopping, best iteration is:
[229]	valid_0's binary_logloss: 0.152052	valid_0's gini: 0.277427
0.277426584798
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152122	valid_0's gini: 0.277802
[200]	valid_0's binary_logloss: 0.152001	valid_0's gini: 0.281894
[300]	valid_0's binary_logloss: 0.152056	valid_0's gini: 0.280935
Early stopping, best iteration is:
[230]	valid_0's binary_logloss: 0.151975	valid_0's gini: 0.282751
0.282750529759
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151882	valid_0's gini: 0.279485
[200]	valid_0's binary_logloss: 0.151733	valid_0's gini: 0.283484
Early stopping, best iteration is:
[192]	valid_0's binary_logloss: 0.151733	valid_0's gini: 0.283618
0.283618219704
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.1515	valid_0's gini: 0.293661
[20

0.284081719935
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151863	valid_0's gini: 0.279675
[200]	valid_0's binary_logloss: 0.151733	valid_0's gini: 0.283896
Early stopping, best iteration is:
[173]	valid_0's binary_logloss: 0.151713	valid_0's gini: 0.284604
0.284603731766
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151503	valid_0's gini: 0.293374
[200]	valid_0's binary_logloss: 0.151428	valid_0's gini: 0.295331
Early stopping, best iteration is:
[156]	valid_0's binary_logloss: 0.151402	valid_0's gini: 0.296149
0.296148541589
cv score:
0.287137425209
current score: 0.289327987871 9
[0.29619749948460927, 0.27476655523619969, 0.28408171993532033, 0.28460373176602766, 0.29614854158905191]
([251, 148, 167, 173, 156], 179.0)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151567	valid_0's gini: 0.289577
[200]	valid_0's binary_logloss: 0.15

0.286872227872
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.15153	valid_0's gini: 0.292415
[200]	valid_0's binary_logloss: 0.151455	valid_0's gini: 0.295758
Early stopping, best iteration is:
[170]	valid_0's binary_logloss: 0.151436	valid_0's gini: 0.296119
0.296118721615
cv score:
0.28776871839
current score: 0.289501234106 13
[0.29637244649312794, 0.27530134191239125, 0.28490339141563165, 0.28687222787184757, 0.29611872161535618]
([231, 246, 196, 205, 170], 209.59999999999999)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151564	valid_0's gini: 0.289655
[200]	valid_0's binary_logloss: 0.15136	valid_0's gini: 0.295157
[300]	valid_0's binary_logloss: 0.151378	valid_0's gini: 0.294829
Early stopping, best iteration is:
[213]	valid_0's binary_logloss: 0.15135	valid_0's gini: 0.295556
0.295555830216
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_log

In [32]:
# takes avg of 16 rounds for final prediction / test submission is the first one
pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}).to_csv('answer_submissions/lgbm3_pred_avg.csv', index=False)
pd.DataFrame({'id': train_id, 'target': final_cv_train / 16.}).to_csv('answer_submissions/lgbm3_cv_avg.csv', index=False)

In [33]:
len(test)

892816

In [112]:
pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}).describe()

Unnamed: 0,id,target
count,892816.0,892816.0
mean,744153.5,0.036469
std,429683.0,0.019811
min,0.0,0.007284
25%,372021.8,0.02318
50%,744307.0,0.031687
75%,1116308.0,0.044059
max,1488026.0,0.352603


In [35]:
len(pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}))

892816

In [37]:
pd.read_csv('answer_submissions/lgbm3_pred_avg.csv').head()

Unnamed: 0,id,target
0,0,0.027714
1,1,0.024096
2,2,0.023101
3,3,0.01434
4,4,0.036072


# My Implementation

In [48]:
# get number of leaves (num_leaves is n_estimators) 
# X for training X_test for teesting train_label for Y

light_train=lgbm.Dataset(X,train_label.values)

learning_rate = 0.1
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000

params = {"objective": "binary",
          "boosting_type": "gbdt",
          "learning_rate": learning_rate,
          "num_leaves": num_leaves,
           "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": 2,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9,
          "n_jobs": 4,
          }
clf = lgbm.LGBMClassifier(**params)

cvresult = lgbm.cv(clf.get_params(), light_train, num_boost_round=5000, nfold=10, metrics=['auc'],
                   early_stopping_rounds=100, stratified=True, seed=1301,verbose_eval=True)

[1]	cv_agg's auc: 0.600494 + 0.00725783
[2]	cv_agg's auc: 0.612722 + 0.00628275
[3]	cv_agg's auc: 0.616728 + 0.00652803
[4]	cv_agg's auc: 0.616278 + 0.00700242
[5]	cv_agg's auc: 0.616949 + 0.00687049
[6]	cv_agg's auc: 0.618899 + 0.00670483
[7]	cv_agg's auc: 0.618516 + 0.0067364
[8]	cv_agg's auc: 0.621811 + 0.00695485
[9]	cv_agg's auc: 0.622837 + 0.00680266
[10]	cv_agg's auc: 0.623101 + 0.00659046
[11]	cv_agg's auc: 0.622569 + 0.00673268
[12]	cv_agg's auc: 0.622868 + 0.00651344
[13]	cv_agg's auc: 0.622591 + 0.00652529
[14]	cv_agg's auc: 0.62271 + 0.00687316
[15]	cv_agg's auc: 0.622766 + 0.00718229
[16]	cv_agg's auc: 0.623671 + 0.00718135
[17]	cv_agg's auc: 0.624112 + 0.00696519
[18]	cv_agg's auc: 0.624029 + 0.00712927
[19]	cv_agg's auc: 0.624017 + 0.00713415
[20]	cv_agg's auc: 0.624216 + 0.00694471
[21]	cv_agg's auc: 0.624683 + 0.00707898
[22]	cv_agg's auc: 0.624533 + 0.00708634
[23]	cv_agg's auc: 0.625168 + 0.00701597
[24]	cv_agg's auc: 0.625897 + 0.00693674
[25]	cv_agg's auc: 0.626108

In [116]:
params = {"objective": "binary",
          "boosting_type": "gbdt",
          "learning_rate": learning_rate,
          "num_leaves": num_leaves,
           "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": 2,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9,
          "n_jobs": 4,
          }
clf = lgbm.LGBMClassifier(**params)

clf.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'drop_rate': 0.1,
 'feature_fraction': 0.6,
 'is_unbalance': False,
 'learning_rate': 0.1,
 'max_bin': 256,
 'max_depth': -1,
 'max_drop': 50,
 'min_child_samples': 10,
 'min_child_weight': 150,
 'min_split_gain': 0,
 'n_estimators': 100,
 'n_jobs': 4,
 'num_leaves': 15,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 0.9,
 'subsample_for_bin': 200000,
 'subsample_freq': 1,
 'verbosity': 2}

In [83]:
# http://lightgbm.readthedocs.io/en/latest/Python-API.html
def objective(params):
    params = {
        'max_bin': int(params['max_bin']),
        'num_leaves': int(params['num_leaves']),
        'reg_alpha': float(params['reg_alpha']),
        #'boosting_type': str(params['boosting_type'])
    }
    
    clf.set_params(**params)
    
    score = cross_val_score(clf, X, train_label.values, scoring='roc_auc', cv=StratifiedKFold(5),n_jobs=-1).mean()
    print("gini_score {:.3f} / roc_auc: {} / params {}".format(2*score-1, score, params))
    return score

space = {
    'max_bin': hp.quniform('max_bin', 200, 300, 5), #256
    'num_leaves': hp.quniform('num_leaves', 15, 40, 5), #15
    'reg_alpha': hp.choice('reg_alpha', [0, 0.5, 0.05, 1]), #'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
    #'boosting_type': hp.choice('boosting_type', ['gbdt','dart'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=500)
# beat .642 rocauc and .28 gini

gini_score 0.284 / roc_auc: 0.642100486326 / params {'num_leaves': 35, 'reg_alpha': 0.05, 'max_bin': 255}
gini_score 0.283 / roc_auc: 0.641719664967 / params {'num_leaves': 30, 'reg_alpha': 1.0, 'max_bin': 215}
gini_score 0.284 / roc_auc: 0.641979964424 / params {'num_leaves': 25, 'reg_alpha': 1.0, 'max_bin': 295}
gini_score 0.283 / roc_auc: 0.641294380213 / params {'num_leaves': 35, 'reg_alpha': 0.05, 'max_bin': 280}
gini_score 0.283 / roc_auc: 0.641477057386 / params {'num_leaves': 40, 'reg_alpha': 0.0, 'max_bin': 230}
gini_score 0.283 / roc_auc: 0.641294380213 / params {'num_leaves': 35, 'reg_alpha': 0.05, 'max_bin': 280}
gini_score 0.284 / roc_auc: 0.641858649153 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 250}
gini_score 0.283 / roc_auc: 0.64164530945 / params {'num_leaves': 20, 'reg_alpha': 0.5, 'max_bin': 200}
gini_score 0.283 / roc_auc: 0.641663088751 / params {'num_leaves': 35, 'reg_alpha': 0.0, 'max_bin': 235}
gini_score 0.284 / roc_auc: 0.641901557053 / params 

gini_score 0.283 / roc_auc: 0.64129739569 / params {'num_leaves': 15, 'reg_alpha': 0.0, 'max_bin': 255}
gini_score 0.283 / roc_auc: 0.641435374177 / params {'num_leaves': 20, 'reg_alpha': 0.5, 'max_bin': 270}
gini_score 0.282 / roc_auc: 0.640810159255 / params {'num_leaves': 15, 'reg_alpha': 0.0, 'max_bin': 295}
gini_score 0.282 / roc_auc: 0.641088309814 / params {'num_leaves': 15, 'reg_alpha': 1.0, 'max_bin': 300}
gini_score 0.284 / roc_auc: 0.642097533569 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 270}
gini_score 0.284 / roc_auc: 0.642050820088 / params {'num_leaves': 25, 'reg_alpha': 0.0, 'max_bin': 265}
gini_score 0.284 / roc_auc: 0.641858649153 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 250}
gini_score 0.281 / roc_auc: 0.640748105352 / params {'num_leaves': 15, 'reg_alpha': 0.0, 'max_bin': 280}
gini_score 0.283 / roc_auc: 0.641402647408 / params {'num_leaves': 15, 'reg_alpha': 0.5, 'max_bin': 275}
gini_score 0.283 / roc_auc: 0.641654172215 / params {'

gini_score 0.284 / roc_auc: 0.642237347908 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 295}
gini_score 0.282 / roc_auc: 0.641094272878 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 285}
gini_score 0.283 / roc_auc: 0.64161129509 / params {'num_leaves': 20, 'reg_alpha': 0.5, 'max_bin': 290}
gini_score 0.283 / roc_auc: 0.641390638789 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 270}
gini_score 0.283 / roc_auc: 0.641259813588 / params {'num_leaves': 15, 'reg_alpha': 1.0, 'max_bin': 285}
gini_score 0.283 / roc_auc: 0.641469020029 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 300}
gini_score 0.281 / roc_auc: 0.640717541203 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 280}
gini_score 0.282 / roc_auc: 0.641024996525 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 290}
gini_score 0.283 / roc_auc: 0.641552493208 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 265}
gini_score 0.283 / roc_auc: 0.641315768733 / para

gini_score 0.283 / roc_auc: 0.641552493208 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 265}
gini_score 0.282 / roc_auc: 0.641024996525 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 290}
gini_score 0.281 / roc_auc: 0.640748105352 / params {'num_leaves': 15, 'reg_alpha': 0.0, 'max_bin': 280}
gini_score 0.283 / roc_auc: 0.641259813588 / params {'num_leaves': 15, 'reg_alpha': 1.0, 'max_bin': 285}
gini_score 0.284 / roc_auc: 0.642097533569 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 270}
gini_score 0.284 / roc_auc: 0.642237347908 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 295}
gini_score 0.284 / roc_auc: 0.642100486326 / params {'num_leaves': 35, 'reg_alpha': 0.05, 'max_bin': 255}
gini_score 0.283 / roc_auc: 0.641369776519 / params {'num_leaves': 15, 'reg_alpha': 0.5, 'max_bin': 240}
gini_score 0.284 / roc_auc: 0.641940443691 / params {'num_leaves': 30, 'reg_alpha': 0.05, 'max_bin': 275}
gini_score 0.282 / roc_auc: 0.641050773761 / para

gini_score 0.283 / roc_auc: 0.641315768733 / params {'num_leaves': 15, 'reg_alpha': 0.5, 'max_bin': 295}
gini_score 0.282 / roc_auc: 0.641144720832 / params {'num_leaves': 15, 'reg_alpha': 0.0, 'max_bin': 275}
gini_score 0.283 / roc_auc: 0.641745562911 / params {'num_leaves': 20, 'reg_alpha': 1.0, 'max_bin': 260}
gini_score 0.282 / roc_auc: 0.641024996525 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 290}
gini_score 0.281 / roc_auc: 0.640717541203 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 280}
gini_score 0.284 / roc_auc: 0.642097533569 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 270}
gini_score 0.282 / roc_auc: 0.641133587263 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 300}
gini_score 0.284 / roc_auc: 0.642034534482 / params {'num_leaves': 35, 'reg_alpha': 0.05, 'max_bin': 285}
gini_score 0.284 / roc_auc: 0.641777342313 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 260}
gini_score 0.284 / roc_auc: 0.641980006568 / para

gini_score 0.284 / roc_auc: 0.642039010152 / params {'num_leaves': 25, 'reg_alpha': 0.05, 'max_bin': 280}
gini_score 0.283 / roc_auc: 0.641390638789 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 270}
gini_score 0.282 / roc_auc: 0.641050773761 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 260}
gini_score 0.284 / roc_auc: 0.642237347908 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 295}
gini_score 0.283 / roc_auc: 0.641402647408 / params {'num_leaves': 15, 'reg_alpha': 0.5, 'max_bin': 275}
gini_score 0.283 / roc_auc: 0.641708880217 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 285}
gini_score 0.283 / roc_auc: 0.641426563388 / params {'num_leaves': 15, 'reg_alpha': 0.0, 'max_bin': 300}
gini_score 0.284 / roc_auc: 0.641771666971 / params {'num_leaves': 30, 'reg_alpha': 1.0, 'max_bin': 250}
gini_score 0.282 / roc_auc: 0.641024996525 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 290}
gini_score 0.284 / roc_auc: 0.641972080767 / para

gini_score 0.283 / roc_auc: 0.641390638789 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 270}
gini_score 0.283 / roc_auc: 0.641426563388 / params {'num_leaves': 15, 'reg_alpha': 0.0, 'max_bin': 300}
gini_score 0.284 / roc_auc: 0.641754466362 / params {'num_leaves': 20, 'reg_alpha': 0.05, 'max_bin': 275}
gini_score 0.283 / roc_auc: 0.641259813588 / params {'num_leaves': 15, 'reg_alpha': 1.0, 'max_bin': 285}
gini_score 0.282 / roc_auc: 0.641179224229 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 205}
gini_score 0.282 / roc_auc: 0.641024996525 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 290}
gini_score 0.284 / roc_auc: 0.641915203311 / params {'num_leaves': 40, 'reg_alpha': 0.05, 'max_bin': 280}
gini_score 0.284 / roc_auc: 0.642046430034 / params {'num_leaves': 30, 'reg_alpha': 0.05, 'max_bin': 265}
gini_score 0.282 / roc_auc: 0.641240724338 / params {'num_leaves': 15, 'reg_alpha': 0.05, 'max_bin': 255}
gini_score 0.284 / roc_auc: 0.641876969676 / par

In [98]:
print("Hyperopt estimated optimum {}".format(best))
learn_dict={'learning_rate': 0.01,'num_leaves': 15, 'reg_alpha': 2, 'max_bin': 280}
clf.set_params(**learn_dict)
clf.get_params()

Hyperopt estimated optimum {'num_leaves': 15.0, 'reg_alpha': 2, 'max_bin': 280.0}


{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'drop_rate': 0.1,
 'feature_fraction': 0.6,
 'is_unbalance': False,
 'learning_rate': 0.01,
 'max_bin': 280,
 'max_depth': -1,
 'max_drop': 50,
 'min_child_samples': 10,
 'min_child_weight': 150,
 'min_split_gain': 0,
 'n_estimators': 100,
 'n_jobs': 4,
 'num_leaves': 15,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 2,
 'reg_lambda': 0.0,
 'seed': 1,
 'silent': True,
 'subsample': 0.9,
 'subsample_for_bin': 200000,
 'subsample_freq': 1,
 'verbosity': 2}

In [126]:
# previous model without hyperopt
pd.read_csv('answer_submissions/lgbm3_pred_avg.csv').describe()

Unnamed: 0,id,target
count,892816.0,892816.0
mean,744153.5,0.036469
std,429683.0,0.019811
min,0.0,0.007284
25%,372021.8,0.02318
50%,744307.0,0.031687
75%,1116308.0,0.044059
max,1488026.0,0.352603


In [123]:
#bag of 15 models
rounds = 2
preds_mat = np.zeros((len(test['id'].index), rounds))
for i in range(rounds):
    print "Review %d of %d" % (range(rounds).index(i)+1,len(range(rounds)))
    clf.set_params(seed = i + 1)
    clf.fit(X, train_label.values)
    preds_tmp = clf.predict_proba(X_test)[:, 1]
    preds_mat[:, i] = preds_tmp
bagged_preds = preds_mat.mean(axis = 1)
#xboost_preds=pd.DataFrame({'bidder_id':unk_ids,'prediction':bagged_preds})

Review 1 of 2
Review 2 of 2


In [122]:
xboost_preds=pd.DataFrame({'bidder_id':test['id'],'prediction':bagged_preds})
xboost_preds.describe()

Unnamed: 0,bidder_id,prediction
count,892816.0,892816.0
mean,744153.5,0.036493
std,429683.0,0.018646
min,0.0,0.009438
25%,372021.8,0.023933
50%,744307.0,0.03204
75%,1116308.0,0.043625
max,1488026.0,0.25278


In [128]:
#bag of 15 models
rounds = 15
preds_mat = np.zeros((len(test['id'].index), rounds))
for i in range(rounds):
    print "Review %d of %d" % (range(rounds).index(i)+1,len(range(rounds)))
    clf.set_params(seed = i + 1)
    clf.fit(X, train_label.values)
    preds_tmp = clf.predict_proba(X_test)[:, 1]
    preds_mat[:, i] = preds_tmp
bagged_preds = preds_mat.mean(axis = 1)
#xboost_preds=pd.DataFrame({'bidder_id':unk_ids,'prediction':bagged_preds})
xboost_preds=pd.DataFrame({'id':test['id'],'target':bagged_preds})
xboost_preds.describe()

Review 1 of 15
Review 2 of 15
Review 3 of 15
Review 4 of 15
Review 5 of 15
Review 6 of 15
Review 7 of 15
Review 8 of 15
Review 9 of 15
Review 10 of 15
Review 11 of 15
Review 12 of 15
Review 13 of 15
Review 14 of 15
Review 15 of 15


Unnamed: 0,bidder_id,prediction
count,892816.0,892816.0
mean,744153.5,0.036493
std,429683.0,0.018631
min,0.0,0.009569
25%,372021.8,0.023949
50%,744307.0,0.032053
75%,1116308.0,0.043613
max,1488026.0,0.256617


In [133]:
xboost_preds=pd.DataFrame({'id':test['id'],'target':bagged_preds})
xboost_preds.describe()
xboost_preds.to_csv('submissions/xgb_bagged_hyperopt.csv',index=False)

In [63]:
x=0.572263853982
(x+1.)/2.

0.786131926991