In [7]:
# Constants: names of the files this notebook reads and writes.
c_train_df_prep_file = "train_df_prep_file.pkl"  # pickled, preprocessed training frame (read with pd.read_pickle)
c_bst_file = "bst.pkl"  # destination passed to bst.save_model
c_accuracy = "c_accuracy.csv"  # CSV log holding one row of metrics per recorded run

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import lightgbm as lgb
import gc

In [9]:
# Evaluation metric: passed to LightGBM as 'metric' and reused as the key
# when reading scores back out of the recorded evaluation results.
metrics = "auc"

In [15]:
# Label column and model inputs.
target = 'is_attributed'
predictors = [
    'app', 'device', 'os', 'channel', 'hour',
    'nip_day_test_hh', 'nip_day_hh', 'nip_hh_os', 'nip_hh_app', 'nip_hh_dev',
]
# Subset of `predictors` that is handed to LightGBM as categorical features.
categorical = ['app', 'device', 'os', 'channel', 'hour']

# LightGBM training parameters.
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric=metrics,
    learning_rate=0.1,
    num_leaves=7,             # keep this below 2^(max_depth)
    max_depth=4,              # -1 would mean no limit
    min_child_samples=100,    # minimum number of rows required in a leaf (min_data_in_leaf)
    max_bin=100,              # number of histogram bins used to bucket feature values
    subsample=0.7,            # fraction of training rows sampled
    subsample_freq=1,         # how often to resample rows; <= 0 disables it
    colsample_bytree=0.7,     # fraction of columns sampled when building each tree
    min_child_weight=0,       # minimum sum of instance weight (hessian) required in a leaf
    min_split_gain=0,         # regularisation knob, alongside lambda_l1 / lambda_l2
    nthread=4,
    verbose=0,
    scale_pos_weight=99.7,    # because the training data is extremely unbalanced
)

In [27]:
# Load the preprocessed training frame.
# NOTE: unpickling can execute arbitrary code -- only load files produced by a trusted run.
train_df_prep = pd.read_pickle(c_train_df_prep_file)

# Keep the first 95% of the rows for training and the trailing 5% for validation.
# shuffle=False preserves row order, which also means random_state has no effect here.
train_df, val_df = train_test_split(train_df_prep, train_size=.95, random_state=99, shuffle=False)

# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after each report.
train_df.info()
val_df.info()

print("train size: ", len(train_df))
print("valid size: ", len(val_df))

# The unsplit frame is not needed once both splits exist; drop the reference.
del train_df_prep
gc.collect()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712500 entries, 0 to 712499
Data columns (total 11 columns):
app                712500 non-null uint16
device             712500 non-null uint16
os                 712500 non-null uint16
channel            712500 non-null uint16
is_attributed      712500 non-null uint8
hour               712500 non-null uint8
nip_day_test_hh    712500 non-null uint32
nip_day_hh         712500 non-null uint16
nip_hh_os          712500 non-null uint16
nip_hh_app         712500 non-null uint16
nip_hh_dev         712500 non-null uint32
dtypes: uint16(7), uint32(2), uint8(2)
memory usage: 21.7 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 37500 entries, 712500 to 749999
Data columns (total 11 columns):
app                37500 non-null uint16
device             37500 non-null uint16
os                 37500 non-null uint16
channel            37500 non-null uint16
is_attributed      37500 non-null uint8
hour               37500 non-null uint8
nip_



327

In [29]:
# Training-loop settings.
VALIDATE = True     # not referenced in the cells shown here
MAX_ROUNDS = 500    # becomes num_boost_round
EARLY_STOP = 50     # becomes early_stopping_rounds
OPT_ROUNDS = 500    # not referenced in the cells shown here
print("Training...")

num_boost_round, early_stopping_rounds = MAX_ROUNDS, EARLY_STOP

# Wrap the training split in a LightGBM Dataset: raw feature values plus labels,
# with column names attached so the categorical columns can be given by name.
xgtrain = lgb.Dataset(
    data=train_df[predictors].values,
    label=train_df[target].values,
    feature_name=predictors,
    categorical_feature=categorical,
)

# Drop the DataFrame now that its values have been handed to the Dataset.
del train_df
gc.collect()
print('done...')

Training...
done...


In [30]:
# Validation counterpart of xgtrain: built from the held-out split with the
# same feature names and the same categorical columns.
xgvalid = lgb.Dataset(
    data=val_df[predictors].values,
    label=val_df[target].values,
    feature_name=predictors,
    categorical_feature=categorical,
)

# Drop the DataFrame now that its values have been handed to the Dataset.
del val_df
gc.collect()

124

In [31]:
# Display the parameter dict as it stands right before training.
# NOTE(review): the recorded output contains a 'categorical_column' entry that
# the literal definition never sets -- presumably LightGBM wrote it into this
# dict in place during an earlier call; confirm before relying on it.
lgb_params

{'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'learning_rate': 0.1,
 'num_leaves': 7,
 'max_depth': 4,
 'min_child_samples': 100,
 'max_bin': 100,
 'subsample': 0.7,
 'subsample_freq': 1,
 'colsample_bytree': 0.7,
 'min_child_weight': 0,
 'min_split_gain': 0,
 'nthread': 4,
 'verbose': 0,
 'scale_pos_weight': 99.7,
 'categorical_column': [0, 1, 2, 3, 4]}

In [32]:
# Receives the per-round metric history for every entry of valid_sets.
evals_results = {}

# Boost for at most num_boost_round rounds; training stops once the validation
# score has not improved for early_stopping_rounds rounds.
bst = lgb.train(
    params=lgb_params,
    train_set=xgtrain,
    num_boost_round=num_boost_round,
    valid_sets=[xgtrain, xgvalid],
    valid_names=['train', 'valid'],
    feval=None,
    early_stopping_rounds=early_stopping_rounds,
    evals_result=evals_results,
    verbose_eval=10,  # log the scores every 10th round
)

# Remember the round early stopping settled on, then persist the booster.
n_estimators = bst.best_iteration
bst.save_model(c_bst_file)

print("\nModel Report")
print("n_estimators : ", n_estimators)

# The validation Dataset is not needed after training.
del xgvalid



Training until validation scores don't improve for 50 rounds.
[10]	train's auc: 0.970496	valid's auc: 0.961434
[20]	train's auc: 0.975601	valid's auc: 0.960874
[30]	train's auc: 0.978572	valid's auc: 0.962068
[40]	train's auc: 0.98129	valid's auc: 0.962279
[50]	train's auc: 0.983788	valid's auc: 0.964676
[60]	train's auc: 0.985521	valid's auc: 0.967716
[70]	train's auc: 0.986827	valid's auc: 0.966594
[80]	train's auc: 0.987946	valid's auc: 0.965048
[90]	train's auc: 0.988874	valid's auc: 0.965555
[100]	train's auc: 0.98973	valid's auc: 0.96565
[110]	train's auc: 0.990632	valid's auc: 0.964636
Early stopping, best iteration is:
[61]	train's auc: 0.985568	valid's auc: 0.96775

Model Report
n_estimators :  61


In [21]:
# Validation score at the best iteration. n_estimators counts rounds from 1
# (the log reports the best round as [61]), hence the -1 for the list index.
evals_results['valid'][metrics][n_estimators-1]

0.96774997679886854

In [22]:

import datetime

# Snapshot of this run, later turned into one row of the accuracy log.
date_time = datetime.datetime.now()

# Read the scores under `metrics` -- the same name the params and the
# validation-score lookup use -- rather than a hard-coded 'auc', so switching
# the metric in one place does not raise KeyError here.
# n_estimators is a 1-based round count, hence the -1 for the list index.
train_acc = evals_results['train'][metrics][n_estimators - 1]
valid_acc = evals_results['valid'][metrics][n_estimators - 1]
bst_iter = n_estimators
comment = 'Trained with all data, 5 features from andys'

In [24]:
# Assemble this run's metrics into a one-row frame and preview it.
col_list = ['date_time', 'train_acc', 'valid_acc', 'best_iter', 'comment']
curr_data = pd.DataFrame(
    data=[[date_time, train_acc, valid_acc, bst_iter, comment]],
    columns=col_list,
)
curr_data.head()

Unnamed: 0,date_time,train_acc,valid_acc,best_iter,comment
0,2018-04-09 13:53:44.416075,0.985568,0.96775,61,"Trained with all data, 5 features from andys"


In [25]:
# Add to records: append this run's row to the accuracy log on disk.
add_to_records = False  # flip to True to persist the current run
log = pd.read_csv(c_accuracy)
if add_to_records:
    # Appending yields a NEW frame, so the result has to be bound back to
    # `log`; otherwise the CSV is rewritten without the new row.
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    log = pd.concat([log, curr_data], ignore_index=True)
    log.to_csv(c_accuracy, index=False)
    print('records updated')
gc.collect()
# To reset the CSV so that it holds only the current run:
# curr_data.to_csv(c_accuracy,index=False)
# curr_data.head()

# Preview the log (including the new row when it was added).
log.head()

Unnamed: 0,date_time,train_acc,valid_acc,best_iter,comment
0,2018-04-09 13:53:44.416075,0.985568,0.96775,61,"Trained with all data, 5 features from andys"
