In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# model 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
import lightgbm 

from sklearn.metrics import fbeta_score, make_scorer

# Data file locations: inputs are read from the ../input directory, the
# submission file is written relative to the current working directory.
TRAIN_PATH = "../input/tabular-playground-series-jan-2022/train.csv"
TEST_PATH = "../input/tabular-playground-series-jan-2022/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/tabular-playground-series-jan-2022/sample_submission.csv"
SUBMISSION_PATH = "submission.csv"

# Column names: row identifier (dropped from the features), regression target,
# and the raw date column that the calendar features are derived from.
ID = "row_id"
TARGET = "num_sold"
DATE = "date"

# Seed for randomized steps and a hold-out fraction.
# NOTE(review): TEST_SIZE is not referenced in the cells shown here -
# presumably meant for train_test_split; confirm or remove.
SEED = 2022
TEST_SIZE = 0.2

# RandomizedSearchCV settings: number of CV folds, number of sampled
# parameter combinations, and parallel jobs (-1 is passed through as n_jobs).
RS_CV = 3
RS_N_ITER = 50
RS_N_JOBS = -1

In [None]:
def _add_date_features(frame, date_col=DATE):
    """Return a copy of ``frame`` with calendar columns derived from ``date_col``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``date_col`` with a datetime64 dtype.
    date_col : str
        Name of the datetime column to expand.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns year, month, day, day_of_year,
        day_of_month, day_of_week and weekday appended in that order.
    """
    dates = frame[date_col].dt
    return frame.assign(
        year=dates.year,
        month=dates.month,
        day=dates.day,
        day_of_year=dates.dayofyear,
        # NOTE: despite its name this column holds the number of days in the
        # month (28-31); the day within the month is already in `day`.
        day_of_month=dates.days_in_month,
        # NOTE: `day_of_week` and `weekday` carry the same value (Monday=0);
        # both are kept so the feature set stays unchanged.
        day_of_week=dates.dayofweek,
        weekday=dates.weekday,
    )


train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

train[DATE] = pd.to_datetime(train[DATE])
test[DATE] = pd.to_datetime(test[DATE])

# One shared helper so train and test cannot drift apart.
train = _add_date_features(train)
test = _add_date_features(test)

# One-hot encode every string column found in the training frame.
cat_cols = train.select_dtypes('object').columns.tolist()
train = pd.get_dummies(train, columns=cat_cols)
test = pd.get_dummies(test, columns=cat_cols)

# The two frames are encoded independently, so a category that appears in only
# one of them would give the model a different column set/order at predict
# time. Force test onto train's layout: dummy columns missing from test become
# all-zero, dummy columns unseen in training are dropped.
test = test.reindex(columns=train.columns.drop(TARGET), fill_value=0)

In [None]:
####################################################################################
# Separate the regression target from the model inputs. The row id, the target
# itself and the raw timestamp are excluded from the feature matrix.
y = train[TARGET]
X = train.drop(columns=[ID, TARGET, DATE])
####################################################################################
# search best parameter and model

def smape(actual, predicted):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``|predicted - actual| / ((|actual| + |predicted|) / 2)``.
    A term where both values are zero would be 0/0; it is defined as 0
    (a perfect prediction) so a single such pair no longer turns the whole
    score into NaN.

    Parameters
    ----------
    actual, predicted : array-like of numbers
        Ground-truth and predicted values with matching (broadcastable) shapes.
        Lists, NumPy arrays and pandas Series are all accepted.

    Returns
    -------
    float
        Mean of the per-element terms multiplied by 100 (range 0 to 200).
    """
    actual = np.atleast_1d(np.asarray(actual, dtype=float))
    predicted = np.atleast_1d(np.asarray(predicted, dtype=float))

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# Base estimator. Seeded so that a given parameter set always trains the same
# model.
model = lightgbm.LGBMRegressor(random_state=SEED)

# Distributions sampled by the randomized search.
# `bagging_fraction` is a LightGBM alias of `subsample`; sampling both drew two
# independent values for a single setting (one of which was ignored), so only
# `subsample` is searched.
# NOTE(review): per the LightGBM docs row subsampling only takes effect when
# `subsample_freq` (bagging_freq) is > 0 - confirm whether it should be set.
parameters = {
    'n_estimators': sp_randInt(1, 5000),
    'max_depth': sp_randInt(1, 100),
    'learning_rate': sp_randFloat(),
    'subsample': sp_randFloat(),
}

# SMAPE is an error, so the scorer is negated (greater_is_better=False).
# random_state makes the sampled candidates reproducible between runs.
rs = RandomizedSearchCV(estimator=model,
                        param_distributions=parameters,
                        scoring=make_scorer(smape, greater_is_better=False),
                        cv=RS_CV,
                        n_iter=RS_N_ITER,
                        n_jobs=RS_N_JOBS,
                        random_state=SEED)
rs.fit(X, y)

# best model
print(rs.best_params_)
bestModel = rs.best_estimator_

In [None]:
####################################################################################
# Predict: drop the same non-feature columns that were removed from the
# training frame, then score the test rows with the tuned model.
X_test = test.drop(columns=[ID, DATE])
pred_test = bestModel.predict(X_test)
####################################################################################
# Submit: fill the sample submission's target column positionally with the
# predictions and write it out without the index.
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH).assign(**{TARGET: pred_test})
sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()
####################################################################################