In [1]:
from pathlib import Path
import pickle

import pandas as pd
import lightgbm as lgb

In [8]:
# --- Notebook configuration -------------------------------------------------
# Every downstream cell reads these names; adjust them here only.

# Directories whose files are concatenated into the train / validation frames.
train_data_dir_path = "../../../../data/train"
valid_data_dir_path = "../../../../data/valid"

# On-disk format of the files above; the loading cell handles
# "csv", "parquet" and "pickle".
data_file_format = "csv"

# Label column(s); kept as a list so it can be passed to DataFrame.drop and
# used for column selection.
target_col = ["target"]

# LightGBM objective. The training cells special-case "multiclass", in which
# case num_class is forwarded as well.
objective = "regression"
# NOTE(review): num_class is stored as a string; confirm whether an int is
# intended before switching objective to "multiclass".
num_class = "3"

# Metric reported on the validation set and used for early stopping.
valid_metrics = "rmse"
learning_rate = 0.1

# Destination of the pickled model.
model_path = "../../../../models/lgbm_model.pkl"

In [3]:
def _load_split(data_dir_path, file_format):
  """Read every file in ``data_dir_path`` and stack the rows into one DataFrame.

  Args:
    data_dir_path: Directory containing the data files of one split.
    file_format: One of "csv", "parquet" or "pickle".

  Returns:
    A single DataFrame with the rows of all files, in sorted-path order.

  Raises:
    ValueError: If ``file_format`` is not supported, or (raised by
      ``pd.concat``) if the directory contains no files.
  """
  # NOTE: unpickling executes arbitrary code; only use the "pickle" format
  # with files from a trusted source.
  readers = {
    "csv": pd.read_csv,
    "parquet": pd.read_parquet,
    "pickle": pd.read_pickle,
  }
  if file_format not in readers:
    # Fail here with a clear message instead of leaving train/valid undefined
    # and hitting a NameError in a later cell.
    raise ValueError(
      f"Unsupported data_file_format: {file_format!r}; "
      f"expected one of {sorted(readers)}"
    )
  read_file = readers[file_format]
  # Directory listing order is arbitrary; sort so the row order (and hence the
  # trained model) is reproducible across runs and machines.
  files = sorted(Path(data_dir_path).glob('*'))
  return pd.concat(read_file(f) for f in files)


# Each split is read from its own directory. (Previously the pickle branch
# loaded the training frame from the validation directory.)
train = _load_split(train_data_dir_path, data_file_format)
valid = _load_split(valid_data_dir_path, data_file_format)

In [4]:
# Base LightGBM parameters shared by every objective; the objective itself is
# added in the next cell.
# NOTE(review): max_depth evaluates to 127 while num_leaves is 7. These two
# values look like they may have been swapped (num_leaves = 2**max_depth - 1
# is the usual pairing) -- confirm the intended tree size.
lgb_params = dict(
  learning_rate=learning_rate,
  max_depth=2**7 - 1,
  num_leaves=7,
  random_state=42,  # fixed seed
  verbose=-1,
  metric=valid_metrics,
)

In [5]:
# The objective is always forwarded as configured; only the multiclass
# objective additionally receives the number of classes.
lgb_params['objective'] = objective
if objective == 'multiclass':
  lgb_params['num_class'] = num_class

In [6]:
# Split each frame into features (everything except target_col) and label,
# and wrap them as LightGBM datasets.
train_dataset = lgb.Dataset(
  data=train.drop(columns=target_col),
  label=train[target_col],
)
valid_dataset = lgb.Dataset(
  data=valid.drop(columns=target_col),
  label=valid[target_col],
)

# Train for up to 10000 rounds, logging the validation metric every round and
# stopping once it has not improved for 10 consecutive rounds.
training_callbacks = [
  lgb.early_stopping(stopping_rounds=10, verbose=True),
  lgb.log_evaluation(1),
]
model = lgb.train(
  params=lgb_params,
  train_set=train_dataset,
  valid_names=["valid_sets"],
  valid_sets=[valid_dataset],
  num_boost_round=10000,
  callbacks=training_callbacks,
)

[1]	valid_sets's rmse: 77.1747
Training until validation scores don't improve for 10 rounds
[2]	valid_sets's rmse: 74.6466
[3]	valid_sets's rmse: 72.4541
[4]	valid_sets's rmse: 70.2764
[5]	valid_sets's rmse: 68.6881
[6]	valid_sets's rmse: 66.9426
[7]	valid_sets's rmse: 65.9844
[8]	valid_sets's rmse: 64.6813
[9]	valid_sets's rmse: 63.9509
[10]	valid_sets's rmse: 63.0619
[11]	valid_sets's rmse: 62.3278
[12]	valid_sets's rmse: 61.4772
[13]	valid_sets's rmse: 61.0636
[14]	valid_sets's rmse: 60.5712
[15]	valid_sets's rmse: 60.2013
[16]	valid_sets's rmse: 59.7931
[17]	valid_sets's rmse: 59.4848
[18]	valid_sets's rmse: 58.9464
[19]	valid_sets's rmse: 58.6449
[20]	valid_sets's rmse: 58.4734
[21]	valid_sets's rmse: 58.2566
[22]	valid_sets's rmse: 57.9422
[23]	valid_sets's rmse: 57.8268
[24]	valid_sets's rmse: 57.6773
[25]	valid_sets's rmse: 57.5531
[26]	valid_sets's rmse: 57.4001
[27]	valid_sets's rmse: 57.4385
[28]	valid_sets's rmse: 57.3422
[29]	valid_sets's rmse: 57.2701
[30]	valid_sets's rm

In [9]:
# Persist the trained model: make sure the destination directory exists
# (creating missing parents, tolerating an existing one), then pickle the
# model into model_path.
model_file = Path(model_path)
model_file.parent.mkdir(parents=True, exist_ok=True)
with model_file.open('wb') as model_fh:
  pickle.dump(model, model_fh)