In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below /kaggle/input and print its full path,
# one per line, in the order os.walk visits them.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## EDA

In [None]:
# Read the three competition CSVs in one pass; the unpacking order on the
# left matches the order of the paths in the tuple.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2022/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2022/test.csv',
        '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv',
    )
)
train

In [None]:
# Remove the `row_id` column from both frames.
# Rebinding the result with errors='ignore' (instead of inplace=True) keeps
# this cell idempotent: re-running it once the column is already gone no
# longer raises KeyError.
train = train.drop(columns=['row_id'], errors='ignore')
test = test.drop(columns=['row_id'], errors='ignore')

In [None]:
# Check missing values: info() lists the non-null count and dtype of every column
train.info()

In [None]:
# Number of training rows for each distinct value of 'country'
train['country'].value_counts()

In [None]:
# Number of training rows for each distinct value of 'product'
train['product'].value_counts()

In [None]:
# Number of training rows for each distinct value of 'store'
train['store'].value_counts()

## Feature Engineering

In [None]:
# Split the dash-separated date strings once and fan the pieces out to columns:
# piece 0 -> 'year', piece 1 -> 'month', and 'date' is overwritten with piece 2.
# All three stay strings here; they are cast to integers in a later cell.
date_parts = train['date'].str.split('-', expand=True)
train['year'] = date_parts[0]
train['month'] = date_parts[1]
train['date'] = date_parts[2]
train

In [None]:
# Same decomposition as for the training frame: split the dash-separated
# date strings once, then store piece 0 as 'year', piece 1 as 'month' and
# overwrite 'date' with piece 2 (all still strings at this point).
date_parts = test['date'].str.split('-', expand=True)
test['year'] = date_parts[0]
test['month'] = date_parts[1]
test['date'] = date_parts[2]

## Data type conversion

In [None]:
# Inspect dtypes before casting: 'year', 'month' and 'date' were produced by
# string splitting above, so they are still object columns at this point
train.info()

In [None]:
# Largest target value in train; used below to choose an integer width for 'num_sold'
train['num_sold'].max()

In [None]:
# Cast the split date parts (and the target) to compact integer dtypes with a
# single astype call per frame.
# 'date' and 'month' go to int8; 'year' goes to int16.
# As train['num_sold'].max() = 2884, 'num_sold' can be encoded in int16 too.
train = train.astype({
    'date': np.int8,
    'month': np.int8,
    'year': np.int16,
    'num_sold': np.int16,
})
test = test.astype({
    'date': np.int8,
    'month': np.int8,
    'year': np.int16,
})

train.info()

## Data Preprocessing

In [None]:
# Column groups for the preprocessing step, derived from the dtypes of `train`:
#   * numeric features     -> every int8/int16 column except the target
#   * categorical features -> every object (string) column
target = ['num_sold']
small_int_columns = train.select_dtypes(include=['int8', 'int16']).columns
num_features = [name for name in small_int_columns if name not in target]
cat_features = train.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Display the numeric feature names selected above
num_features

In [None]:
# Display the categorical feature names selected above
cat_features

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric columns are only standardised.
num_pipeline = Pipeline([
    ('num_scaler', StandardScaler()),
])

# The encoder must return a dense array because the StandardScaler that
# follows it centres the data, which is not supported on sparse matrices.
# The keyword controlling this was renamed in scikit-learn: `sparse_output`
# exists from 1.2 onwards and the old `sparse` keyword was removed in 1.4.
# Try the current name first and fall back to the legacy one so the cell
# runs on both old and new images.
try:
    cat_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    # scikit-learn < 1.2 does not know `sparse_output`.
    cat_encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

# Categorical columns are one-hot encoded (unknown categories at transform
# time are ignored, i.e. encoded as all zeros) and then standardised.
cat_pipeline = Pipeline([
    ('cat_encoder', cat_encoder),
    ('cat_scaler', StandardScaler()),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own sub-pipeline; the transformed
# numeric block comes first in the output, followed by the categorical block.
preprocess_pipeline = ColumnTransformer(
    transformers = [
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ],
)

In [None]:
# Fit the preprocessing on the training frame only, then apply the fitted
# transformers to the test frame so both share the same encoding and scaling.
feature_columns = num_features + cat_features
X_train = preprocess_pipeline.fit_transform(train[feature_columns])
X_test = preprocess_pipeline.transform(test[feature_columns])
# `target` is a list, so this selection yields a single-column DataFrame.
y_train = train[target]

## LightGBM Regressor

In [None]:
# NOTE: this whole cell is commented out, so nothing in it executes. It is a
# 5-fold cross-validation sketch scored with RMSE
# (mean_squared_error with squared = False).
# NOTE(review) - issues to fix before re-enabling it:
#   * 'learning_rate' is the string '0.05' here, whereas the active training
#     cell below passes the float 0.05.
#   * `score` is reassigned on every fold, so the final `score.mean()` only
#     reflects the last fold; collect the per-fold scores in a list instead.
#   * y_train is built as train[target] (a DataFrame), so
#     y_train[train_index] is a column lookup rather than a row selection;
#     use y_train.iloc[train_index] / y_train.iloc[test_index].

# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_squared_error
# from lightgbm import LGBMRegressor

# split = KFold(n_splits = 5)
# for train_index, test_index in split.split(X_train):
#     X_tr, X_te = X_train[train_index], X_train[test_index]
#     y_tr, y_te = y_train[train_index], y_train[test_index]
    
#     params = {
#         'boosting_type': 'gbdt', 
#         'objective': 'regression',
#         'learning_rate': '0.05',
#         'n_jobs': -1, 
#     }
    
#     lgbm_reg = LGBMRegressor(**params)
#     lgbm_reg.fit(X_tr, y_tr)
#     y_pred = lgbm_reg.predict(X_te)

#     score = mean_squared_error(y_te, y_pred, squared = False)
    
# score.mean()

In [None]:
from lightgbm import LGBMRegressor

# Hyper-parameters for the final model: gradient-boosted trees with the plain
# regression objective, a 0.05 learning rate, and all available cores.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'learning_rate': 0.05,
    'n_jobs': -1,
}

lgbm_reg = LGBMRegressor(**params)
# y_train is a single-column DataFrame (it is selected with a list key), i.e.
# an (n, 1) column vector. Passing that to fit() triggers a
# DataConversionWarning because a 1-D target is expected, so flatten it here.
# np.ravel is a no-op for an already 1-D target.
lgbm_reg.fit(X_train, np.ravel(y_train))

## Submission

In [None]:
# Display the sample submission frame loaded earlier; its 'num_sold' column is overwritten below
submission

In [None]:
# Predict on the preprocessed test matrix, overwrite the 'num_sold' column of
# the sample submission with the predictions, and write the result without
# the index column.
y_pred = lgbm_reg.predict(X_test)
submission = submission.assign(num_sold = y_pred)
submission.to_csv('my_submission.csv', index = False)