In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, leaf)
    for folder, _subdirs, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import holidays
from lightgbm import LGBMRegressor
from hyperopt import fmin, tpe, hp
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the three competition CSVs in one pass; every frame is indexed by `row_id`.
train, test, sample_submission = (
    pd.read_csv(csv_path, index_col='row_id')
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2022/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2022/test.csv',
        '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv',
    )
)
# Preview the first rows of the training frame.
train.head()

In [None]:
# Display the dtype of each training column before any feature engineering is applied.
train.dtypes

In [None]:
# Parse the date column and expand it into numeric calendar features.
train['date'] = pd.to_datetime(train['date'])
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['yearday'] = train['date'].dt.dayofyear
train['weekday'] = train['date'].dt.dayofweek

# Build one holiday calendar per distinct (country, year) pair and reuse it for
# every row, instead of constructing a fresh calendar object for each row.
# `years=` is passed by keyword: in newer releases of the `holidays` package the
# second positional parameter is `subdiv`, not `years`, so a positional year
# would be misread as a subdivision code.
train_holiday_calendars = {
    (country, year): holidays.CountryHoliday(country, years=int(year))
    for country, year in set(zip(train['country'], train['year']))
}
# 1 when the row's date is a public holiday in the row's country, else 0.
train['holiday'] = [int(date in train_holiday_calendars[country, year])
                    for date, country, year in zip(train['date'], train['country'], train['year'])]

# The raw timestamp is dropped now that its components are separate columns.
train.drop('date', axis=1, inplace=True)
train.head()

In [None]:
# Parse the date column and expand it into the same calendar features as the training frame.
test['date'] = pd.to_datetime(test['date'])
test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['yearday'] = test['date'].dt.dayofyear
test['weekday'] = test['date'].dt.dayofweek

# Build one holiday calendar per distinct (country, year) pair and reuse it for
# every row, instead of constructing a fresh calendar object for each row.
# `years=` is passed by keyword: in newer releases of the `holidays` package the
# second positional parameter is `subdiv`, not `years`, so a positional year
# would be misread as a subdivision code.
test_holiday_calendars = {
    (country, year): holidays.CountryHoliday(country, years=int(year))
    for country, year in set(zip(test['country'], test['year']))
}
# 1 when the row's date is a public holiday in the row's country, else 0.
test['holiday'] = [int(date in test_holiday_calendars[country, year])
                   for date, country, year in zip(test['date'], test['country'], test['year'])]

# The raw timestamp is dropped now that its components are separate columns.
test.drop('date', axis=1, inplace=True)

In [None]:
# Replace each categorical column with integer codes. The encoder is fitted on
# the training values only and then applied to both frames, so a given label
# maps to the same code in train and test.
for column in ('country', 'store', 'product'):
    encoder = LabelEncoder().fit(train[column])
    train[column] = encoder.transform(train[column])
    test[column] = encoder.transform(test[column])
train.head()

In [None]:
# Time-based hold-out: rows from 2018 form the validation set, all other years
# are used for fitting. `num_sold` is the target and is excluded from the features.
is_holdout = train['year'] == 2018
features = train.drop(columns='num_sold')
X_train, y_train = features[~is_holdout], train.loc[~is_holdout, 'num_sold']
X_val, y_val = features[is_holdout], train.loc[is_holdout, 'num_sold']

In [None]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``; the result
    is 100 times the mean of those terms.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, ndarray or pandas Series).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        SMAPE, where 0 means a perfect prediction.

    Raises
    ------
    ZeroDivisionError
        If ``y_true`` is empty.

    Notes
    -----
    A term whose true and predicted values are both zero is counted as zero
    error rather than evaluating 0/0 (which would yield NaN). NaN values in the
    inputs propagate to the result.
    """
    # Work on plain float arrays so lists are accepted and pandas index
    # alignment cannot silently reorder or skip elements.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred)

    # Only divide where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with in `out`.
    ratio = np.divide(2 * abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 / len(y_true) * np.sum(ratio)

In [None]:
# Baseline: LightGBM with its default hyperparameters, fitted on the training
# years and scored with SMAPE on the held-out validation rows.
lgbmr = LGBMRegressor().fit(X_train, y_train)
y_pred = lgbmr.predict(X_val)
smape(y_val, y_pred)

In [None]:
# Hyperopt search space for the LightGBM regressor. The three `quniform`
# entries are sampled with a step of 1 and are cast with int() inside
# `objective` before being handed to the model.
# NOTE(review): the learning-rate range starts at 0; LightGBM presumably
# expects a strictly positive value — confirm whether a small positive lower
# bound is wanted here.
space = dict(
    num_leaves=hp.quniform('num_leaves', 10, 1000, 1),
    max_depth=hp.quniform('max_depth', 1, 500, 1),
    learning_rate=hp.uniform('learning_rate', 0, 0.1),
    n_estimators=hp.quniform('n_estimators', 1, 200, 1),
)

In [None]:
def objective(space):
    """Fit LightGBM with one sampled configuration and return its validation SMAPE.

    Parameters
    ----------
    space : dict
        Sampled hyperparameters with keys ``num_leaves``, ``max_depth``,
        ``learning_rate`` and ``n_estimators``.

    Returns
    -------
    The SMAPE of the fitted model's predictions on ``X_val`` against ``y_val``.
    """
    # The tree-structure parameters are truncated to integers; the learning
    # rate is passed through unchanged.
    integer_params = {key: int(space[key])
                      for key in ('num_leaves', 'max_depth', 'n_estimators')}
    model = LGBMRegressor(learning_rate=space['learning_rate'], **integer_params)
    model.fit(X_train, y_train)
    return smape(y_val, model.predict(X_val))

In [None]:
# Run 100 evaluations of `objective` over `space`, with candidates proposed by
# the TPE algorithm. The result is used below as a dict of hyperparameter values.
params = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=100,
)

In [None]:
# Convert the count-like hyperparameters to int so they can be passed straight
# to LGBMRegressor, then display the final parameter set.
params.update({key: int(params[key])
               for key in ('max_depth', 'n_estimators', 'num_leaves')})
params

In [None]:
# Feature matrix and target built from every training row (no hold-out),
# used to fit the final model.
X, y = train.drop(columns='num_sold'), train['num_sold']

In [None]:
# Refit LightGBM with the tuned hyperparameters on all training rows, then
# predict sales for the test frame.
lgbmr = LGBMRegressor(**params).fit(X, y)
y_pred = lgbmr.predict(test)

In [None]:
# Put the test predictions into the submission template's `num_sold` column and
# write it (with its row_id index) to submission.csv in the working directory.
sample_submission = sample_submission.assign(num_sold=y_pred)
sample_submission.to_csv('submission.csv')