In [79]:
# Constants: file names used throughout this notebook.
c_train_df_prep_file = "train_df_prep_file.pkl"  # pickled frame loaded with pd.read_pickle
c_bst_file = "bst.pkl"  # path handed to bst.save_model
c_accuracy = "c_accuracy.csv"  # CSV log of per-run metrics

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import lightgbm as lgb
import gc

In [26]:
# Evaluation metric name: passed to LightGBM as 'metric' and used as the key
# when reading scores back out of evals_results.
metrics = "auc"

In [27]:
# LightGBM parameters for the binary classifier.
# Each key appears exactly once: a repeated key in a dict literal is not an
# error, the later entry silently replaces the earlier one.
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': metrics,
        'learning_rate': 0.1,
        'num_leaves': 7,  # we should let it be smaller than 2^(max_depth)
        'max_depth': 4,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data needed in a child (min_data_in_leaf)
        'max_bin': 100,  # Number of bucketed bins for feature values
        'subsample': 0.7,  # Subsample ratio of the training instances.
        'subsample_freq': 1,  # frequency of subsampling, <=0 means disabled
        'colsample_bytree': 0.7,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 0,  # Minimum sum of instance weight (hessian) needed in a child (leaf)
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split are the regularization knobs
        'nthread': 4,
        'verbose': 0,
        'scale_pos_weight': 99.7,  # because training data is extremely unbalanced
}

# Label column.
target = 'is_attributed'

# Columns fed to the model.
# Not currently used as predictors: 'nip_day_test_hh', 'nip_day_hh',
# 'nip_hh_os', 'nip_hh_app', 'nip_hh_dev'
predictors = ['app', 'device', 'os', 'channel', 'hour']

# Columns LightGBM is told to treat as categorical.
categorical = ['app', 'device', 'os', 'channel', 'hour']

In [36]:
# Load the preprocessed training frame and hold out 5% of the rows for validation.
# NOTE(review): read_pickle unpickles the file; only load pickles written by a trusted step.
train_df_prep = pd.read_pickle(c_train_df_prep_file)
train_df, val_df = train_test_split(train_df_prep, train_size=.95, random_state=99, shuffle=True)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
train_df.info()
val_df.info()

print("train size: ", len(train_df))
print("valid size: ", len(val_df))

# The full frame is not used again once the two splits exist.
del train_df_prep
gc.collect()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 712500 entries, 490318 to 684673
Data columns (total 7 columns):
app              712500 non-null uint16
device           712500 non-null uint16
os               712500 non-null uint16
channel          712500 non-null uint16
is_attributed    712500 non-null uint8
hour             712500 non-null uint8
in_test_hh       712500 non-null uint8
dtypes: uint16(4), uint8(3)
memory usage: 12.9 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 37500 entries, 70504 to 249153
Data columns (total 7 columns):
app              37500 non-null uint16
device           37500 non-null uint16
os               37500 non-null uint16
channel          37500 non-null uint16
is_attributed    37500 non-null uint8
hour             37500 non-null uint8
in_test_hh       37500 non-null uint8
dtypes: uint16(4), uint8(3)
memory usage: 695.8 KB
None
train size:  712500
valid size:  37500


506

In [37]:
# Training-loop settings. VALIDATE and OPT_ROUNDS are only assigned here; this
# cell does not read them.
VALIDATE = True
MAX_ROUNDS = 500
EARLY_STOP = 50
OPT_ROUNDS = 500

# Names consumed by the lgb.train call.
num_boost_round = MAX_ROUNDS
early_stopping_rounds = EARLY_STOP

print("Training...")

# Wrap the training split in a LightGBM Dataset; the predictor columns and the
# label are handed over as plain arrays, with column names supplied separately.
xgtrain = lgb.Dataset(
    data=train_df[predictors].values,
    label=train_df[target].values,
    feature_name=predictors,
    categorical_feature=categorical,
)

# The pandas frame is not used again after this point.
del train_df
gc.collect()
print('done...')

Training...
done...


In [38]:
# Wrap the validation split in a LightGBM Dataset, built the same way as the
# training Dataset (same predictor columns and categorical declaration).
xgvalid = lgb.Dataset(
    data=val_df[predictors].values,
    label=val_df[target].values,
    feature_name=predictors,
    categorical_feature=categorical,
)

# The pandas frame is not used again after this point.
del val_df
gc.collect()

93

In [39]:
# Per-iteration metric history; lgb.train fills this in under the names given
# in valid_names.
evals_results = {}

# Train with early stopping, evaluating on both the training and validation sets
# and logging the metric every 10 rounds.
bst = lgb.train(
    params=lgb_params,
    train_set=xgtrain,
    num_boost_round=num_boost_round,
    valid_sets=[xgtrain, xgvalid],
    valid_names=['train', 'valid'],
    feval=None,
    early_stopping_rounds=early_stopping_rounds,
    evals_result=evals_results,
    verbose_eval=10,
)

# Keep the best round and persist the booster.
n_estimators = bst.best_iteration
bst.save_model(c_bst_file)

print("\nModel Report")
print("n_estimators : ", n_estimators)
# best_iteration counts from 1, the history list from 0.
print(metrics+":", evals_results['valid'][metrics][n_estimators - 1])

del xgvalid



Training until validation scores don't improve for 50 rounds.
[10]	train's auc: 0.969211	valid's auc: 0.950359
[20]	train's auc: 0.971378	valid's auc: 0.955819
[30]	train's auc: 0.97418	valid's auc: 0.964316
[40]	train's auc: 0.976882	valid's auc: 0.964807
[50]	train's auc: 0.978825	valid's auc: 0.966639
[60]	train's auc: 0.980549	valid's auc: 0.962793
[70]	train's auc: 0.981846	valid's auc: 0.965478
[80]	train's auc: 0.982939	valid's auc: 0.963937
[90]	train's auc: 0.98364	valid's auc: 0.963965
[100]	train's auc: 0.984318	valid's auc: 0.965378
Early stopping, best iteration is:
[50]	train's auc: 0.978825	valid's auc: 0.966639

Model Report
n_estimators :  50
auc: 0.9666387484993704


In [40]:
# Validation score at the best iteration (history list is 0-based).
evals_results['valid'][metrics][n_estimators - 1]

0.9666387484993704

In [106]:

import datetime

# Snapshot of this run for the accuracy log: a timestamp plus the train/valid
# scores at the best iteration (best_iteration counts from 1, the history lists from 0).
date_time = datetime.datetime.now()

# Index the history by `metrics`, the same key the training report uses, rather
# than a hard-coded 'auc', so changing the metric does not raise KeyError here.
train_acc = evals_results['train'][metrics][n_estimators - 1]
valid_acc = evals_results['valid'][metrics][n_estimators - 1]
bst_iter = n_estimators

In [110]:
# Prepare this run's metrics as a one-row frame and preview it.
col_list = ['date_time', 'train_acc', 'valid_acc', 'best_iter']

# The run timestamp is stored in `date_time`; `curr_date` is not defined
# anywhere in this notebook, so using it fails on a fresh kernel.
curr_data = pd.DataFrame([[date_time, train_acc, valid_acc, bst_iter]], columns=col_list)
curr_data.head()

Unnamed: 0,date_time,train_acc,valid_acc,best_iter
0,2018-04-09 06:40:04.238477,0.978825,0.966639,50


In [114]:
# Add this run to the accuracy log (set add_to_records = True to persist it).
add_to_records = False
log = pd.read_csv(c_accuracy)
if add_to_records:
    # Appending returns a NEW frame and leaves `log` untouched, so the result
    # must be rebound before writing; otherwise the CSV is rewritten without
    # the new record. pd.concat is used because it exists in every pandas version.
    log = pd.concat([log, curr_data], ignore_index=True)
    log.to_csv(c_accuracy, index=False)
    print('records updated')
gc.collect()
# To reset the CSV so that it holds only the current run:
# curr_data.to_csv(c_accuracy,index=False)
# curr_data.head()

467