In [1]:
import numpy as np
import pandas as pd
import getpass
import os
import gc
import csv
from pathlib import Path

from lightgbm import LGBMRegressor

import optuna
import ast
import datetime

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

In [2]:
# --- Configuration ---------------------------------------------------------
# Folder holding the competition data for the current OS user.
DATA_DIR = Path('/data', getpass.getuser(), 'kaggle', 'future_sales')

# CSV log that receives one row per Optuna trial.
OPTIMIZE_FILE = DATA_DIR / 'optuna_scores.csv'
SEED = 42    # random_state handed to the LightGBM models below
NJOBS = 60   # n_jobs handed to the LightGBM models below

# --- Feature list ----------------------------------------------------------
# features.txt holds one comma-separated list of column names. Strip the
# whitespace around every name (e.g. a trailing newline added by an editor)
# and skip empty entries, otherwise the column selection below raises KeyError.
with open(DATA_DIR / 'features.txt', 'r') as f:
    features = [name.strip() for name in f.read().split(',') if name.strip()]

# --- Data ------------------------------------------------------------------
# NOTE: unpickling executes code stored in the file; only load pickles you created.
all_data = pd.read_pickle(DATA_DIR / 'all_data.pkl')
all_data_part = pd.read_pickle(DATA_DIR / 'all_data_part.pkl')

# Keep only the listed columns, in the listed order.
all_data = all_data[features]
all_data_part = all_data_part[features]

In [3]:
# Keep only rows with date_block_num > 0 in both frames.
all_data = all_data.loc[lambda frame: frame['date_block_num'].gt(0)]
all_data_part = all_data_part.loc[lambda frame: frame['date_block_num'].gt(0)]

In [4]:
# Validation setup: blocks before 33 are used for fitting, block 33 is held out.
train_part = all_data_part.loc[lambda frame: frame['date_block_num'] < 33]
val = all_data_part.loc[lambda frame: frame['date_block_num'] == 33]

# Final setup: every block before 34 is used for fitting, block 34 is predicted.
train = all_data.loc[lambda frame: frame['date_block_num'] < 34]
test = all_data.loc[lambda frame: frame['date_block_num'] == 34]

In [5]:
# Separate the model inputs from the target ('item_cnt_month'); the month
# index ('date_block_num') is removed from the inputs as well.
Xtrain_part = train_part.drop(columns=['date_block_num', 'item_cnt_month'])
ytrain_part = train_part['item_cnt_month']

Xval = val.drop(columns=['date_block_num', 'item_cnt_month'])
yval = val['item_cnt_month']

Xtrain = train.drop(columns=['date_block_num', 'item_cnt_month'])
ytrain = train['item_cnt_month']

Xtest = test.drop(columns=['date_block_num', 'item_cnt_month'])
ytest = test['item_cnt_month']

### Hyperparameter search with Optuna

In [32]:
# n_estimators value used for the models fitted during the search.
N_EST = 1000

# Parameter names whose values are cast to int before being passed to the model.
INT_PARAMS = [
    'n_estimators',
    'max_leaves',
    'max_depth',
    'subsample_for_bin',
    'min_data_in_leaf',
    'subsample_freq',
    'random_state',
    'n_jobs',
    'num_leaves',
]

In [16]:
def objective(trial):
    """Optuna objective: fit LightGBM with sampled hyper-parameters and score it.

    The model is fitted on (Xtrain_part, ytrain_part) with early stopping
    (100 rounds, RMSE) monitored on (Xval, yval). One row
    [params, mse_test, mse_train, train_time] is appended to OPTIMIZE_FILE
    for every trial.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyper-parameter values.

    Returns
    -------
    float
        Mean squared error on the validation set (logged as 'mse_test').
    """
    params = {
        # fixed settings
        'n_estimators': N_EST,
        'max_depth': -1,
        'random_state': SEED,
        'n_jobs': NJOBS,
    }
    # Sampled settings; the suggest_* calls keep their original order.
    params['learning_rate'] = trial.suggest_uniform('learning_rate', 0.01, 0.2)
    params['num_leaves'] = trial.suggest_int('num_leaves', 7, 255)
    params['subsample'] = trial.suggest_uniform('subsample', 0.75, 1)
    params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.8, 1)
    params['min_child_weight'] = trial.suggest_uniform('min_child_weight', 1e-1, 200)
    params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-3, 3.0)
    params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-3, 3.0)
    # aka gamma in xgboost
    params['min_split_gain'] = trial.suggest_loguniform('min_split_gain', 1e-2, 100)
    params['max_delta_step'] = trial.suggest_uniform('max_delta_step', 0, 100)
    params['subsample_for_bin'] = trial.suggest_int('subsample_for_bin', 50*1000, 300*1000)
    params['subsample_freq'] = trial.suggest_int('subsample_freq', 1, 10)
    # upper bound: 5% of the number of training rows
    params['min_data_in_leaf'] = trial.suggest_int(
        'min_data_in_leaf', 20, int(0.05 * Xtrain_part.shape[0]))

    start_time = datetime.datetime.now()

    model = LGBMRegressor(**params)
    model.fit(Xtrain_part, ytrain_part, eval_metric=['rmse'], verbose=False
            , eval_set=[(Xval, yval)], early_stopping_rounds=100)

    # Wall-clock duration of the fit only (prediction time is excluded).
    train_time = datetime.datetime.now() - start_time

    pred_val = model.predict(Xval)
    pred_train = model.predict(Xtrain_part)

    # 'mse_test' is measured on the validation set; the name matches the CSV header.
    mse_test = mean_squared_error(yval, pred_val)
    mse_train = mean_squared_error(ytrain_part, pred_train)

    # newline='' is required by the csv module; without it some platforms
    # write an extra blank line after every row.
    with open(OPTIMIZE_FILE, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([params, mse_test, mse_train, train_time])

    return mse_test

In [17]:
# Start a fresh trial log: mode 'w' discards any previous content and writes
# only the header row. newline='' is required by the csv module so that no
# extra blank lines are emitted on platforms that translate line endings.
with open(OPTIMIZE_FILE, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['params', 'mse_test', 'mse_train', 'train_time'])

In [None]:
# Seed the sampler so that the sequence of suggested hyper-parameters is
# repeatable between runs (TPE is also the sampler Optuna uses by default).
# NOTE(review): `objective` in this notebook never calls trial.report(), so the
# pruner presumably has no intermediate values to act on; kept as configured.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.SuccessiveHalvingPruner(),
)
study.optimize(objective, n_trials=30, show_progress_bar=True, n_jobs=1)

In [19]:
# Sampled hyper-parameters of the best trial, preceded by a heading and a blank line.
print('Best parameters:\n', study.best_params, sep='\n')

Best parameters:

{'learning_rate': 0.10940414868293571, 'num_leaves': 253, 'subsample': 0.8647972293106071, 'colsample_bytree': 0.9119952348272526, 'min_child_weight': 37.481104606983166, 'reg_alpha': 2.958101388986032, 'reg_lambda': 0.0015385349766620054, 'min_split_gain': 0.010540968582577605, 'max_delta_step': 35.0645297176416, 'subsample_for_bin': 102259, 'subsample_freq': 5, 'min_data_in_leaf': 442}


In [20]:
# Objective value (validation MSE returned by `objective`) of the best trial.
print(str(study.best_value))

0.7970915672871371


In [41]:
# OPTIMIZE_FILE already contains DATA_DIR; joining DATA_DIR a second time only
# worked because the path is absolute, so read the file directly.
info_opt = pd.read_csv(OPTIMIZE_FILE)

# Each 'params' cell holds the text form of the parameter dict written by
# `objective`; parse it back into one column per parameter.
info_opt_all = pd.DataFrame([ast.literal_eval(raw) for raw in info_opt['params']])
info_opt_all.insert(0, 'mse_test', info_opt['mse_test'])
info_opt_all.insert(1, 'mse_train', info_opt['mse_train'])
# Train minus validation error: negative values mean the training error is lower.
info_opt_all.insert(2, 'diff_mse', info_opt['mse_train'] - info_opt['mse_test'])

# Parameters that are identical for every trial carry no information here.
const_cols = ['n_estimators', 'random_state', 'n_jobs']

info_opt_all = info_opt_all.drop(const_cols, axis=1)
info_opt_all = info_opt_all.sort_values(by=['diff_mse'], ascending=False)

# Trials with validation MSE below 0.9 (name `k` kept because the next cell displays it).
k = info_opt_all[info_opt_all['mse_test'] < 0.9]

In [42]:
# Display the trials with validation MSE below 0.9, ordered by descending
# diff_mse (train MSE minus validation MSE).
k

Unnamed: 0,mse_test,mse_train,diff_mse,learning_rate,num_leaves,max_depth,subsample,colsample_bytree,min_child_weight,reg_alpha,reg_lambda,min_split_gain,max_delta_step,subsample_for_bin,subsample_freq,min_data_in_leaf
2,0.899103,0.858823,-0.04028,0.150743,99,-1,0.851512,0.90464,166.733067,0.001546,0.84148,0.016605,49.241413,291564,6,133294
18,0.884908,0.834186,-0.050722,0.171767,203,-1,0.784913,0.891926,69.371391,0.56353,0.045833,0.635907,78.023973,167154,7,98282
26,0.881271,0.830003,-0.051268,0.157487,253,-1,0.802838,0.997099,56.968814,1.499633,0.002645,0.024521,23.517478,156326,7,95851
13,0.883272,0.829824,-0.053448,0.196099,173,-1,0.756643,0.855126,1.272967,2.438602,0.134115,0.3363,99.655457,239695,3,95806
14,0.872304,0.81143,-0.060874,0.171667,105,-1,0.775231,0.869084,32.89332,0.307044,0.001026,0.383568,88.012562,247123,4,73347
25,0.868358,0.799586,-0.068773,0.127179,222,-1,0.850921,0.93924,32.789977,2.984969,0.001563,0.010348,2.890119,135921,5,59710
8,0.858012,0.789038,-0.068974,0.099031,41,-1,0.820257,0.801635,86.922991,0.030942,0.00412,0.036697,48.209465,291028,1,39542
21,0.867282,0.788213,-0.079069,0.199924,244,-1,0.771317,0.845434,17.661697,0.703127,0.001126,0.073493,29.750799,265027,5,59935
27,0.854132,0.772861,-0.081271,0.139607,195,-1,0.830749,0.878033,105.40639,2.597542,0.007144,0.050528,19.656932,181608,8,44386
3,0.828695,0.744875,-0.08382,0.119879,20,-1,0.848846,0.919838,130.135062,0.032927,0.017685,0.077313,44.179315,293465,4,13653


In [48]:
# Row label (in the trial log) of the configuration to refit.
best_eval = 28

# Constant settings first, then the sampled values of the selected trial.
best_params = {'random_state': SEED
               , 'n_jobs' : NJOBS
               , 'n_estimators': N_EST
              }
best_params.update(dict(info_opt_all.drop(['mse_test', 'mse_train', 'diff_mse'], axis=1).loc[best_eval,:]))

# Selecting a single row yields floats for every column, so integer-valued
# parameters are cast back to int. A comprehension is used so that no loop
# variable leaks into the notebook namespace (the previous `for k, v in ...`
# loop overwrote the DataFrame `k` displayed in an earlier cell).
best_params = {name: (int(value) if name in INT_PARAMS else value)
               for name, value in best_params.items()}
best_params

{'random_state': 42,
 'n_jobs': 60,
 'n_estimators': 1000,
 'learning_rate': 0.10940414868293571,
 'num_leaves': 253,
 'max_depth': -1,
 'subsample': 0.8647972293106071,
 'colsample_bytree': 0.9119952348272526,
 'min_child_weight': 37.481104606983166,
 'reg_alpha': 2.958101388986032,
 'reg_lambda': 0.0015385349766620054,
 'min_split_gain': 0.010540968582577605,
 'max_delta_step': 35.0645297176416,
 'subsample_for_bin': 102259,
 'subsample_freq': 5,
 'min_data_in_leaf': 442}

In [49]:
# Refit the selected configuration with a budget of 3000 trees. Early stopping
# (100 rounds) is evaluated on the last eval_set entry, the validation month.
best_params.update(n_estimators=3000)
lgb = LGBMRegressor(**best_params)
lgb.fit(
    Xtrain_part,
    ytrain_part,
    eval_set=[(Xtrain_part, ytrain_part), (Xval, yval)],
    eval_metric=['rmse'],
    early_stopping_rounds=100,
    verbose=200,
)

Training until validation scores don't improve for 100 rounds
[200]	training's rmse: 0.77559	training's l2: 0.60154	valid_1's rmse: 0.893832	valid_1's l2: 0.798935
Early stopping, best iteration is:
[145]	training's rmse: 0.788829	training's l2: 0.622252	valid_1's rmse: 0.8928	valid_1's l2: 0.797092


LGBMRegressor(boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.9119952348272526, importance_type='split',
              learning_rate=0.10940414868293571,
              max_delta_step=35.0645297176416, max_depth=-1,
              min_child_samples=20, min_child_weight=37.481104606983166,
              min_data_in_leaf=442, min_split_gain=0.010540968582577605,
              n_estimators=3000, n_jobs=60, num_leaves=253, objective=None,
              random_state=42, reg_alpha=2.958101388986032,
              reg_lambda=0.0015385349766620054, silent=True,
              subsample=0.8647972293106071, subsample_for_bin=102259,
              subsample_freq=5)

**Fit on the full training sample**

In [50]:
# 145 is the best iteration reported by the validation fit above; the final
# model is trained on all labelled months with exactly that many trees.
# The only eval_set here is the training data itself.
best_params.update(n_estimators=145)
lgb_best = LGBMRegressor(**best_params)
lgb_best.fit(
    Xtrain,
    ytrain,
    eval_set=[(Xtrain, ytrain)],
    eval_metric=['rmse'],
    early_stopping_rounds=100,
    verbose=100,
)

Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.803613	training's l2: 0.645793
Did not meet early stopping. Best iteration is:
[145]	training's rmse: 0.790121	training's l2: 0.624291


LGBMRegressor(boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.9119952348272526, importance_type='split',
              learning_rate=0.10940414868293571,
              max_delta_step=35.0645297176416, max_depth=-1,
              min_child_samples=20, min_child_weight=37.481104606983166,
              min_data_in_leaf=442, min_split_gain=0.010540968582577605,
              n_estimators=145, n_jobs=60, num_leaves=253, objective=None,
              random_state=42, reg_alpha=2.958101388986032,
              reg_lambda=0.0015385349766620054, silent=True,
              subsample=0.8647972293106071, subsample_for_bin=102259,
              subsample_freq=5)

In [79]:
# Predict the test month and limit every prediction to the range [0, 20].
ans = pd.DataFrame({'item_cnt_month': lgb_best.predict(Xtest)}).clip(lower=0, upper=20)

In [80]:
# The row position is used as the submission ID.
# NOTE(review): assumes the rows of Xtest are in the order of the expected IDs - confirm.
# Guarding the insert keeps the cell re-runnable: DataFrame.insert raises
# ValueError when the column already exists.
if 'ID' not in ans.columns:
    ans.insert(0, 'ID', ans.index)
ans.to_csv(DATA_DIR / 'final_ans_best_28_145.csv', index=False)
# 28 - 0.927244 - 145 trees (private is 0.916828)

In [81]:
# Predictions saved earlier by the 7_shap_vaues notebook
# (NOTE(review): name copied from the original comment; possibly "7_shap_values").
xgbbest_ans = pd.read_csv(DATA_DIR.joinpath('final_ans_7_2.csv'))

In [83]:
# Blend: plain average of the clipped LightGBM predictions and the predictions
# loaded into `xgbbest_ans`.
# The LightGBM part is recomputed from the model instead of being read back
# from `ans`, so re-running this cell gives the same blend rather than
# averaging the second submission in again.
if len(xgbbest_ans) != len(ans):
    # Index alignment would otherwise fill the unmatched rows with NaN silently.
    raise ValueError('submissions to blend differ in length: '
                     f'{len(ans)} vs {len(xgbbest_ans)}')
lgb_pred = pd.Series(lgb_best.predict(Xtest), index=ans.index).clip(0, 20)
ans['item_cnt_month'] = (lgb_pred + xgbbest_ans['item_cnt_month']) / 2

In [85]:
# Save the blended submission without the DataFrame index.
ans.to_csv(DATA_DIR.joinpath('final_ans_lgb_xgb.csv'), index=False)
# 0.926215  (NOTE(review): presumably the leaderboard score of this file - confirm)