In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
print('Loading dataframes into memory')
train_df = pd.read_csv('train.csv') # reading train
test_df = pd.read_csv('test.csv') # reading test
features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
print('Separating output column')
target = train_df['target']
print('Creating Parameter Grid')
param = {
    'bagging_freq': 5,          'bagging_fraction': 0.335,   'boost_from_average':'false',   'boost': 'gbdt',
    'feature_fraction': 0.041,   'learning_rate': 0.0083,     'max_depth': -1,                'metric':'auc',
    'min_data_in_leaf': 80,     'min_sum_hessian_in_leaf': 10.0,'num_leaves': 13,           'num_threads': 8,
    'tree_learner': 'serial',   'objective': 'binary',      'verbosity': 1
}


Loading dataframes into memory
Separating output column
Creating Parameter Grid


In [3]:
num_folds = 15
folds = StratifiedKFold(n_splits=num_folds, shuffle=False, random_state=2319)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))

In [4]:
%%time
print('Starting cross-validation:')
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Starting cross-validation:
Fold idx:1
Training until validation scores don't improve for 40000 rounds.
[50000]	training's auc: 0.995612	valid_1's auc: 0.899221
Early stopping, best iteration is:
[13652]	training's auc: 0.949293	valid_1's auc: 0.902555
Fold idx:2
Training until validation scores don't improve for 40000 rounds.
[50000]	training's auc: 0.995653	valid_1's auc: 0.890944
Early stopping, best iteration is:
[12768]	training's auc: 0.947515	valid_1's auc: 0.894456
Fold idx:3
Training until validation scores don't improve for 40000 rounds.
[50000]	training's auc: 0.995481	valid_1's auc: 0.898449
Early stopping, best iteration is:
[10509]	training's auc: 0.941111	valid_1's auc: 0.903688
Fold idx:4
Training until validation scores don't improve for 40000 rounds.
Early stopping, best iteration is:
[9805]	training's auc: 0.940256	valid_1's auc: 0.88855
Fold idx:5
Training until validation scores don't improve for 40000 rounds.
[50000]	training's auc: 0.995589	valid_1's auc: 0.905177

In [5]:
print('Writing results to a text file')
sub = pd.DataFrame({"ID_code": test_df.ID_code.values})
sub["target"] = predictions
sub.to_csv('lgbm_submission.csv', index=False)

Writing results to a text file
