In [41]:
import lightgbm as lgbm
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [42]:
def Gini(y_true, y_pred):
    """Return the normalized Gini coefficient of `y_pred` against `y_true`.

    The samples are ranked from largest to smallest prediction, the
    cumulative share of `y_true` captured along that ranking is compared with
    the diagonal, and the resulting area is divided by the area obtained when
    ranking by `y_true` itself.

    Parameters
    ----------
    y_true : array-like, 1-D
        Observed target values.
    y_pred : array-like, 1-D
        Predicted scores; must have the same shape as `y_true`.

    Returns
    -------
    float
        Area for the prediction ranking divided by the area for the true
        ranking (1.0 when both rankings capture the targets identically).

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.
    """
    # Accept lists / pandas objects as well as ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Column 0 holds the true values, column 1 the predictions; each
    # ordering below keeps only the true values, sorted from largest to
    # smallest by the respective column.
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) * 1. / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1. / np.sum(pred_order)
    # Float literal on purpose: under Python 2 `1 / n_samples` is integer
    # division and floors to 0, which shifts the diagonal.
    L_ones = np.linspace(1. / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred * 1. / G_true

In [43]:
# Run-mode switches for the notebook.
# cv_only gates the cross-validation branch of the training loop.
cv_only, save_cv, full_train = True, True, False

In [44]:
def evalerror(preds, dtrain):
    """Score `preds` against the labels stored in `dtrain` with `Gini`.

    Returns a 3-tuple: the metric name 'gini', the Gini value, and True.
    It is passed as `feval` to `lgbm.train`, so the trailing True is
    presumably the higher-is-better flag of that API.
    """
    gini_value = Gini(dtrain.get_label(), preds)
    return 'gini', gini_value, True

path = "datasets/"

# Read the train and test CSVs from the data directory.
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

# Keep the target and id columns under their own names for later cells.
train_label, train_id = train['target'], train['id']
test_id = test['id']

In [45]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [46]:
# Cross-validation splitter shared by the training loop.
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

# Separate the target from the candidate feature columns.
drop_feature = ['id', 'target']
y = train['target'].values
X = train.drop(drop_feature, axis=1)
feature_names = list(X.columns)

# Sort columns into groups by name: '*cat*' columns (excluding any
# '*count*' ones) are categorical; anything that is neither 'cat' nor
# 'calc' goes into the numeric group.
cat_features = []
num_features = []
for name in feature_names:
    has_cat = 'cat' in name
    if has_cat and 'count' not in name:
        cat_features.append(name)
    if not has_cat and 'calc' not in name:
        num_features.append(name)

# Per-row count of cells equal to -1, stored as float, for both frames.
for frame in (train, test):
    frame['missing'] = (frame == -1).sum(axis=1).astype(float)
num_features.append('missing')

In [47]:
# Spot-check the new 'missing' column next to two of the raw features.
train[['ps_ind_01','ps_ind_02_cat','missing']].head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,missing
0,2,2,1.0
1,1,1,2.0
2,5,4,3.0
3,0,1,0.0
4,0,2,2.0


In [48]:
# Re-code every categorical column with a LabelEncoder fitted on the
# training values only, then apply the same mapping to train and test.
for col in cat_features:
    encoder = LabelEncoder().fit(train[col])
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

In [49]:
# One-hot encode the categorical columns; the encoder is fitted on the
# training frame and reused unchanged for the test frame.
enc = OneHotEncoder().fit(train[cat_features])
X_cat = enc.transform(train[cat_features])
X_t_cat = enc.transform(test[cat_features])

In [50]:
# Build 'new_ind': for every row, the values of all columns whose name
# contains 'ind' are written as strings and chained together, each followed
# by '_' (e.g. '2_2_5_2_1_..._0_'). The same key is built for train and test.
ind_features = [c for c in feature_names if 'ind' in c]
if ind_features:  # with no 'ind' columns there is nothing to combine
    for frame in (train, test):
        combined = frame[ind_features[0]].astype(str) + '_'
        for c in ind_features[1:]:
            combined = combined + frame[c].astype(str) + '_'
        frame['new_ind'] = combined

# Filled by the value-count cell with the names of the '*_count' columns.
cat_count_features = []

In [51]:
# Preview the columns whose name contains 'ind'.
train[ind_features].head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin
0,2,2,5,2,1,0,1,0,0,0,0,0,0,0,11,0,1,0
1,1,1,7,1,1,0,0,1,0,0,0,0,0,0,3,0,0,1
2,5,4,9,2,1,0,0,1,0,0,0,0,0,0,12,1,0,0
3,0,1,2,1,1,1,0,0,0,0,0,0,0,0,8,1,0,0
4,0,2,0,2,1,1,0,0,0,0,0,0,0,0,9,1,0,0


In [52]:
# Show the individual 'ind' columns side by side with the combined key.
# A new list is built on purpose: `compare = ind_features` would only alias
# the list, so appending 'new_ind' would also modify ind_features (and add
# another duplicate every time this cell is re-run).
compare = ind_features + ['new_ind']
train[compare].head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,new_ind
0,2,2,5,2,1,0,1,0,0,0,0,0,0,0,11,0,1,0,2_2_5_2_1_0_1_0_0_0_0_0_0_0_11_0_1_0_
1,1,1,7,1,1,0,0,1,0,0,0,0,0,0,3,0,0,1,1_1_7_1_1_0_0_1_0_0_0_0_0_0_3_0_0_1_
2,5,4,9,2,1,0,0,1,0,0,0,0,0,0,12,1,0,0,5_4_9_2_1_0_0_1_0_0_0_0_0_0_12_1_0_0_
3,0,1,2,1,1,1,0,0,0,0,0,0,0,0,8,1,0,0,0_1_2_1_1_1_0_0_0_0_0_0_0_0_8_1_0_0_
4,0,2,0,2,1,1,0,0,0,0,0,0,0,0,9,1,0,0,0_2_0_2_1_1_0_0_0_0_0_0_0_0_9_1_0_0_


In [53]:
# Frequency-encode each categorical column and the combined 'new_ind' key:
# the count of a value is taken over train and test stacked together and
# stored in a '<column>_count' column of both frames.
for c in cat_features + ['new_ind']:
    counts = pd.concat([train[c], test[c]]).value_counts()
    count_col = '%s_count' % c
    # Every value in train/test appears in `counts` (it was built from both),
    # so a vectorized lookup replaces the per-row Python lambda.
    train[count_col] = train[c].map(counts)
    test[count_col] = test[c].map(counts)
    # Guard keeps the list free of duplicates when the cell is re-run.
    if count_col not in cat_count_features:
        cat_count_features.append(count_col)

# Dense part: numeric features plus the count columns (names such as
# ps_car_03_cat_count); sparse part: the one-hot encoded categoricals.
train_list = [train[num_features + cat_count_features].values, X_cat]
test_list = [test[num_features + cat_count_features].values, X_t_cat]

In [16]:
# Join the dense and one-hot blocks column-wise and keep the result in
# Compressed Sparse Row format.
# Note: X is rebound here from the earlier DataFrame to the stacked matrix.
X = ssp.hstack(train_list, format='csr')
X_test = ssp.hstack(test_list, format='csr')  # features of the unlabeled test rows

In [17]:
# Dimensions (rows, columns) of the stacked training matrix.
X.shape

(595212, 223)

In [31]:
# training light gbm
#
# The whole NFOLDS cross-validation is repeated NUM_SEEDS times with a
# different LightGBM seed each time. Out-of-fold predictions on train and
# fold-averaged predictions on test are summed over the repetitions in
# final_cv_train / final_cv_pred; dividing those sums by the number of
# repetitions gives the seed-averaged prediction.

NUM_SEEDS = 16  # number of differently seeded CV repetitions

learning_rate = 0.1
num_leaves = 15
# NOTE(review): min_data_in_leaf is defined but never put into `params`
# ("min_child_samples": 10 is passed instead) -- confirm which is intended.
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000
# NOTE(review): "drop_rate"/"max_drop" look like DART-only settings and
# "subsample" may be inactive without a bagging frequency -- confirm against
# the LightGBM parameter documentation.
params = {"objective": "binary",
          "boosting_type": "gbdt",
          "learning_rate": learning_rate,
          "num_leaves": num_leaves,
          "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": 2,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9
          }
x_score = []  # one full-train CV Gini per seed
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))

for s in range(NUM_SEEDS):
    cv_train = np.zeros(len(train_label))  # out-of-fold predictions, this seed
    cv_pred = np.zeros(len(test_id))       # summed test predictions, this seed

    params['seed'] = s

    if cv_only:
        kf = kfold.split(X, train_label)  # stratified split into NFOLDS folds

        best_trees = []
        fold_scores = []

        for i, (train_fold, validate) in enumerate(kf):
            # Slice features and labels for this fold.
            X_train, X_validate = X[train_fold, :], X[validate, :]
            label_train = train_label[train_fold]
            label_validate = train_label[validate]

            dtrain = lgbm.Dataset(X_train, label_train)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid,
                             feval=evalerror, verbose_eval=100,
                             early_stopping_rounds=100)
            best_trees.append(bst.best_iteration)

            # Use the early-stopped iteration for BOTH test and validation
            # predictions so the reported CV score matches the test model.
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            cv_train[validate] += bst.predict(X_validate,
                                              num_iteration=bst.best_iteration)

            score = Gini(label_validate, cv_train[validate])
            print(score)
            fold_scores.append(score)

        cv_pred /= NFOLDS  # average the per-fold test predictions
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        print("cv score:")
        print(Gini(train_label, cv_train))
        # Gini of the running seed-average, followed by the number of seeds so far.
        print("current score: %s %s" % (Gini(train_label, final_cv_train / (s + 1.)), s + 1))
        print(fold_scores)
        print(best_trees, np.mean(best_trees))

        x_score.append(Gini(train_label, cv_train))

print(x_score)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151577	valid_0's gini: 0.288901
[200]	valid_0's binary_logloss: 0.151349	valid_0's gini: 0.295167
[300]	valid_0's binary_logloss: 0.151361	valid_0's gini: 0.295042
Early stopping, best iteration is:
[200]	valid_0's binary_logloss: 0.151349	valid_0's gini: 0.295167
0.295167170263
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152171	valid_0's gini: 0.271694
[200]	valid_0's binary_logloss: 0.152098	valid_0's gini: 0.274733
[300]	valid_0's binary_logloss: 0.152157	valid_0's gini: 0.273878
Early stopping, best iteration is:
[203]	valid_0's binary_logloss: 0.152087	valid_0's gini: 0.275084
0.275084036478
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152128	valid_0's gini: 0.27766
[200]	valid_0's binary_logloss: 0.152029	valid_0's gini: 0.281412
[300]	valid_0's binary_logloss: 0.152069	valid_0's g

[300]	valid_0's binary_logloss: 0.152088	valid_0's gini: 0.277148
Early stopping, best iteration is:
[229]	valid_0's binary_logloss: 0.152052	valid_0's gini: 0.277427
0.277426584798
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152122	valid_0's gini: 0.277802
[200]	valid_0's binary_logloss: 0.152001	valid_0's gini: 0.281894
[300]	valid_0's binary_logloss: 0.152056	valid_0's gini: 0.280935
Early stopping, best iteration is:
[230]	valid_0's binary_logloss: 0.151975	valid_0's gini: 0.282751
0.282750529759
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151882	valid_0's gini: 0.279485
[200]	valid_0's binary_logloss: 0.151733	valid_0's gini: 0.283484
Early stopping, best iteration is:
[192]	valid_0's binary_logloss: 0.151733	valid_0's gini: 0.283618
0.283618219704
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.1515	valid_0's gini: 0.293661
[20

0.284081719935
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151863	valid_0's gini: 0.279675
[200]	valid_0's binary_logloss: 0.151733	valid_0's gini: 0.283896
Early stopping, best iteration is:
[173]	valid_0's binary_logloss: 0.151713	valid_0's gini: 0.284604
0.284603731766
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151503	valid_0's gini: 0.293374
[200]	valid_0's binary_logloss: 0.151428	valid_0's gini: 0.295331
Early stopping, best iteration is:
[156]	valid_0's binary_logloss: 0.151402	valid_0's gini: 0.296149
0.296148541589
cv score:
0.287137425209
current score: 0.289327987871 9
[0.29619749948460927, 0.27476655523619969, 0.28408171993532033, 0.28460373176602766, 0.29614854158905191]
([251, 148, 167, 173, 156], 179.0)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151567	valid_0's gini: 0.289577
[200]	valid_0's binary_logloss: 0.15

0.286872227872
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.15153	valid_0's gini: 0.292415
[200]	valid_0's binary_logloss: 0.151455	valid_0's gini: 0.295758
Early stopping, best iteration is:
[170]	valid_0's binary_logloss: 0.151436	valid_0's gini: 0.296119
0.296118721615
cv score:
0.28776871839
current score: 0.289501234106 13
[0.29637244649312794, 0.27530134191239125, 0.28490339141563165, 0.28687222787184757, 0.29611872161535618]
([231, 246, 196, 205, 170], 209.59999999999999)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151564	valid_0's gini: 0.289655
[200]	valid_0's binary_logloss: 0.15136	valid_0's gini: 0.295157
[300]	valid_0's binary_logloss: 0.151378	valid_0's gini: 0.294829
Early stopping, best iteration is:
[213]	valid_0's binary_logloss: 0.15135	valid_0's gini: 0.295556
0.295555830216
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_log

In [32]:
# The accumulated sums are divided by the 16 seeded runs to get the averaged
# predictions. The first file holds the test predictions (the submission),
# the second the averaged out-of-fold predictions on train.
n_runs = 16.
test_avg = pd.DataFrame({'id': test_id, 'target': final_cv_pred / n_runs})
test_avg.to_csv('answer_submissions/lgbm3_pred_avg.csv', index=False)
train_avg = pd.DataFrame({'id': train_id, 'target': final_cv_train / n_runs})
train_avg.to_csv('answer_submissions/lgbm3_cv_avg.csv', index=False)

In [33]:
# Number of rows in the test frame.
len(test)

892816

In [112]:
# Summary statistics of the averaged test predictions (sum over runs / 16).
pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}).describe()

Unnamed: 0,id,target
count,892816.0,892816.0
mean,744153.5,0.036469
std,429683.0,0.019811
min,0.0,0.007284
25%,372021.8,0.02318
50%,744307.0,0.031687
75%,1116308.0,0.044059
max,1488026.0,0.352603


In [35]:
# Row count of the prediction frame, for comparison with len(test).
len(pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}))

892816

In [37]:
# Read the written prediction file back and show its first rows.
pd.read_csv('answer_submissions/lgbm3_pred_avg.csv').head()

Unnamed: 0,id,target
0,0,0.027714
1,1,0.024096
2,2,0.023101
3,3,0.01434
4,4,0.036072
