In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import lightgbm as lgb
import os
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
#import pandas_profiling

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:50].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    #plt.savefig('lgbm_importances01.png')

In [17]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

## FE

In [18]:
train_df.tail(10)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
199990,train_199990,1,14.1475,1.8568,11.0066,3.6779,12.1944,-16.5936,5.3217,14.8508,...,-6.4708,4.7287,1.9034,7.2324,20.6047,1.717,-4.0032,9.1627,13.8077,-1.9646
199991,train_199991,0,9.9909,2.5523,11.9653,6.3958,13.5497,-9.5293,6.0864,14.1789,...,12.0737,5.2139,0.8094,-0.6585,17.0548,0.5328,-5.3444,8.5414,13.2895,-6.7896
199992,train_199992,0,12.2825,2.6918,15.4684,6.4262,10.9863,9.9659,4.503,9.9232,...,3.9514,10.904,1.3472,5.6184,20.5498,-0.2854,7.6252,10.1758,17.4066,-11.5244
199993,train_199993,0,13.2152,-5.8006,9.726,6.591,12.4612,-7.1652,6.0666,12.9887,...,3.9357,8.8442,1.8096,-4.8314,22.005,0.3916,6.7302,8.9709,14.5405,6.1149
199994,train_199994,0,12.3925,-5.8821,11.2323,3.9237,10.4509,10.7262,7.0503,18.6968,...,1.3911,6.8687,3.7788,6.3378,14.4656,-1.4843,-3.9299,9.1164,16.317,-7.5048
199995,train_199995,0,11.488,-0.4956,8.2622,3.5142,10.3404,11.6081,5.6709,15.1516,...,6.1415,13.2305,3.9901,0.9388,18.0249,-1.7939,2.1661,8.5326,16.666,-17.8661
199996,train_199996,0,4.9149,-2.4484,16.7052,6.6345,8.3096,-10.5628,5.8802,21.594,...,4.9611,4.6549,0.6998,1.8341,22.2717,1.7337,-2.1651,6.7419,15.9054,0.3388
199997,train_199997,0,11.2232,-5.0518,10.5127,5.6456,9.341,-5.4086,4.5555,21.5571,...,4.0651,5.4414,3.1032,4.8793,23.5311,-1.5736,1.2832,8.7155,13.8329,4.1995
199998,train_199998,0,9.7148,-8.6098,13.6104,5.793,12.5173,0.5339,6.0479,17.0152,...,2.684,8.6587,2.7337,11.1178,20.4158,-0.0786,6.798,10.0342,15.5289,-13.9001
199999,train_199999,0,10.8762,-5.7105,12.1183,8.0328,11.5577,0.3488,5.2839,15.2058,...,8.9842,1.6893,0.1276,0.3766,15.2101,-2.4907,-2.2342,8.1857,12.1284,0.1385


In [19]:

var_list = [f for f in train_df.columns if 'var' in f]
mean_var_list = [f+'_rank' for f in train_df.columns if 'var' in f]

# train_df['mean'] = np.mean(train_df[var_list],axis=1)
# train_df['std'] = np.std(train_df[var_list],axis=1)
# train_df['min'] = np.min(train_df[var_list],axis=1)
# train_df['max'] = np.max(train_df[var_list],axis=1)
# train_df['sum'] = np.sum(train_df[var_list],axis=1)
# train_df['median'] = np.median(train_df[var_list],axis=1)

# train_df[mean_var_list] = (train_df[var_list]-np.median(train_df[var_list],axis=0))/((np.max(train_df[var_list],axis=0)-np.median(train_df[var_list],axis=0)))
# test_df[mean_var_list] = (test_df[var_list]-np.median(test_df[var_list],axis=0))/((np.max(test_df[var_list],axis=0)-np.median(test_df[var_list],axis=0)))


train_df[mean_var_list] = train_df[var_list].rank(method='average',pct=True)
test_df[mean_var_list] = test_df[var_list].rank(method='average',pct=True)

#simple stats on rank cols 


In [20]:
test_df.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190_rank,var_191_rank,var_192_rank,var_193_rank,var_194_rank,var_195_rank,var_196_rank,var_197_rank,var_198_rank,var_199_rank
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,0.124692,0.920388,0.006548,0.41083,0.087673,0.956472,0.616908,0.981705,0.444165,0.318997
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,0.943355,0.66867,0.261958,0.962167,0.244103,0.66226,0.274715,0.83379,0.853505,0.04708
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,0.204405,0.874622,0.523218,0.386777,0.052617,0.935732,0.035855,0.024625,0.911892,0.025837
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,0.912833,0.69889,0.437725,0.514827,0.205517,0.986082,0.592745,0.625795,0.170783,0.45423
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,0.58842,0.710175,0.340735,0.495173,0.669852,0.46748,0.097388,0.041087,0.258055,0.304935


## Model

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

n_folds = 5
random_seed = 26
model = 'lgb_rank_pct'


model_name = "{0}_{1}_folds".format(model, n_folds)
print("Model: {}".format(model_name))

Model: lgb_rank_pct_5_folds


In [22]:
#feats = train_df.iloc[:,2:].columns.tolist();feats
feats = [c for c in train_df.columns if c not in ['ID_code', 'target']];feats

['var_0',
 'var_1',
 'var_2',
 'var_3',
 'var_4',
 'var_5',
 'var_6',
 'var_7',
 'var_8',
 'var_9',
 'var_10',
 'var_11',
 'var_12',
 'var_13',
 'var_14',
 'var_15',
 'var_16',
 'var_17',
 'var_18',
 'var_19',
 'var_20',
 'var_21',
 'var_22',
 'var_23',
 'var_24',
 'var_25',
 'var_26',
 'var_27',
 'var_28',
 'var_29',
 'var_30',
 'var_31',
 'var_32',
 'var_33',
 'var_34',
 'var_35',
 'var_36',
 'var_37',
 'var_38',
 'var_39',
 'var_40',
 'var_41',
 'var_42',
 'var_43',
 'var_44',
 'var_45',
 'var_46',
 'var_47',
 'var_48',
 'var_49',
 'var_50',
 'var_51',
 'var_52',
 'var_53',
 'var_54',
 'var_55',
 'var_56',
 'var_57',
 'var_58',
 'var_59',
 'var_60',
 'var_61',
 'var_62',
 'var_63',
 'var_64',
 'var_65',
 'var_66',
 'var_67',
 'var_68',
 'var_69',
 'var_70',
 'var_71',
 'var_72',
 'var_73',
 'var_74',
 'var_75',
 'var_76',
 'var_77',
 'var_78',
 'var_79',
 'var_80',
 'var_81',
 'var_82',
 'var_83',
 'var_84',
 'var_85',
 'var_86',
 'var_87',
 'var_88',
 'var_89',
 'var_90',
 'var_91'

In [23]:
clfs = []
folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
oof_preds = np.zeros((len(train_df), 1))
test_preds = np.zeros((len(test_df), 1))


X = train_df[feats]
y = train_df['target']
X_test = test_df[feats]
test_ids = test_df.ID_code.values


parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    #'scale_pos_weight': 400,
    #'device' : 'gpu' ,
    'boosting': 'gbdt',
    'num_leaves': 31, #31
    'feature_fraction': 0.5,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'learning_rate': 0.05, #0.05
    'verbose': 30
    #'min_data_in_leaf': 200
}

feature_importance_df = pd.DataFrame()
for fold_, (trn_, val_) in enumerate(folds.split(X, y)):
    print("Current Fold: {}".format(fold_+1))
    trn_x, trn_y = X.iloc[trn_, :], y[trn_]
    val_x, val_y = X.iloc[val_, :], y[val_]


    trn_lgb = lgb.Dataset(trn_x, trn_y)
    val_lgb = lgb.Dataset(val_x, val_y)
    clf = lgb.train(parameters,
                     train_set=trn_lgb,
                     #valid_sets=[valid_data_lgb,holdout_data_lgb],
                     valid_sets=[trn_lgb, val_lgb],
                     num_boost_round=3000,
                     early_stopping_rounds=50,
                     verbose_eval=False)
    


    val_pred = clf.predict(val_x)
    test_fold_pred = clf.predict(X_test)

    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    oof_preds[val_, :] = val_pred.reshape((-1, 1))
    test_preds += test_fold_pred.reshape((-1, 1))
    
    print('getting feature importance')
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    
test_preds /= n_folds
roc_score = roc_auc_score(y, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))


print("Saving submission file")
sample = pd.read_csv('../data/sample_submission.csv')
sample.target = test_preds.astype(float)
sample.ID_code = test_ids
sample.to_csv('../submissions/{}_{}.csv'.format(model_name,str(roc_score)), index=False)

display_importances(feature_importance_df)


Current Fold: 1
AUC = 0.8934838812800731
getting feature importance
Current Fold: 2
AUC = 0.8917703732802043
getting feature importance
Current Fold: 3
AUC = 0.8967559160838388
getting feature importance
Current Fold: 4
AUC = 0.8884419698483343
getting feature importance
Current Fold: 5
AUC = 0.8952197877203903
getting feature importance
Overall AUC = 0.8929203506967012
Saving submission file


  stat_data = remove_na(group_data)


In [23]:
%%javascript
var nb = IPython.notebook;
var kernel = IPython.notebook.kernel;
var command = "NOTEBOOK_FULL_PATH = '" + nb.base_url + nb.notebook_path + "'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [24]:


shutil.copyfile(os.path.basename(NOTEBOOK_FULL_PATH), 
                             '../models/{}_{}.ipynb'.format(model_name, str(roc_score)))


'../models/lgb_over_max_median_5_folds_0.8927908906108155.ipynb'

In [36]:
clf.predict(val_x.iloc[0],pred_contrib=True)



array([[-3.93268843e-02, -3.70502267e-02,  7.20410447e-02,
        -5.14794659e-03, -3.36656922e-03,  1.18189760e-01,
        -2.03232768e-02, -2.43534530e-03,  2.06846570e-02,
        -2.30367336e-02, -3.05631523e-03,  7.17579562e-02,
        -3.60610582e-02,  2.39898732e-02,  1.18249116e-02,
        -3.05180717e-03,  2.51868135e-02,  2.89458050e-03,
        -4.18144826e-02,  4.30009116e-02,  9.36178792e-02,
         2.17325798e-01,  1.98014155e-01, -3.47866363e-02,
        -3.78195873e-02,  1.26590884e-02, -4.96251022e-02,
         2.79162055e-04,  2.34828771e-02, -3.38617033e-03,
         4.72025758e-04,  9.75963642e-02,  1.30248245e-02,
        -8.09505272e-02,  9.54451387e-03, -4.69094399e-02,
        -3.90603161e-02, -2.87577091e-03,  7.55222173e-04,
        -4.74092681e-03, -4.28867229e-02,  1.40182843e-02,
         2.10805601e-02, -4.70198097e-02,  7.82518234e-02,
         5.21578368e-02, -6.75091211e-03,  4.50467751e-03,
         7.49051159e-02,  2.25519916e-02,  1.62397812e-0

In [62]:
result_df = pd.DataFrame(data=clf.predict(val_x.iloc[:10],pred_contrib=True))
result_df['prediction'] = clf.predict(val_x.iloc[:10])
#result_df[val_x.columns.tolist()] = clf.predict(val_x.iloc[:10],pred_contrib=True)


In [65]:
result_df.columns = val_x.columns.tolist()+['baseline_pred'] +['final_pred']

In [71]:
result_df =  result_df[['final_pred'] + ['baseline_pred'] + val_x.columns.tolist()]

In [89]:
result_df.iloc[0,1:].sum()

0.28812681658719375

In [113]:
result_df.iloc[:,:9]


Unnamed: 0,final_pred,baseline_pred,var_0,var_1,var_2,var_3,var_4,var_5,var_6
0,0.571537,-1.701885,-0.039327,-0.03705,0.072041,-0.005148,-0.003367,0.11819,-0.020323
1,0.056427,-1.701885,-0.034855,-0.013877,0.012534,0.001982,-0.006649,-0.045968,0.422089
2,0.131655,-1.701885,-0.044732,-0.013932,0.052405,-0.001917,-0.003318,0.014513,-0.065764
3,0.540031,-1.701885,-0.032672,-0.002076,-0.070309,-0.007568,-0.008739,-0.042647,0.054001
4,0.329536,-1.701885,0.027451,0.072701,0.429397,-0.00337,0.014742,-0.030384,-0.004369
5,0.311653,-1.701885,0.15697,-0.085969,-0.063579,-0.006659,0.00381,-0.01047,-0.043518
6,0.568784,-1.701885,-0.023433,-0.019481,0.05425,-0.007657,-0.048572,-0.002471,0.018791
7,0.117736,-1.701885,-0.03703,0.015511,-0.020845,-0.00359,-0.01096,-0.030726,-0.071401
8,0.055309,-1.701885,-0.05326,0.001029,0.082322,-0.005337,-0.008556,0.002953,0.101039
9,0.055792,-1.701885,-0.053844,-0.025975,-0.089981,0.038213,-0.010739,-0.069373,-0.057705


In [109]:
result_df.iloc[1,2:].sum()

-1.1148467489322968

In [107]:
a = result_df.iloc[0,1].sum()

In [110]:
1/(1+np.exp(a))

0.7530315906570576