# **Tabular Playground Series (TPS) – January 2022**

## **CatBoost Baseline**

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [None]:
# Read the competition's train and test tables, then preview both frames.
train_csv = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
test_csv = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'
train, test = map(pd.read_csv, (train_csv, test_csv))

display(train, test)

# **Preprocessing**

In [None]:
# Stack train on top of test so feature engineering is applied to both at once.
# The original row labels are kept, so index values repeat across the two parts.
all_data = pd.concat(objs=(train, test), axis=0)
all_data

In [None]:
# Parse the `date` column and expand it into calendar features for the model.
all_data['date'] = pd.to_datetime(all_data['date'])
all_data['year'] = all_data['date'].dt.year
all_data['month'] = all_data['date'].dt.month
all_data['day'] = all_data['date'].dt.day
all_data['dayofweek'] = all_data['date'].dt.dayofweek
# NOTE: despite its name, this column holds the number of days in the row's
# month (28-31); the day of the month itself is already stored in `day`.
all_data['dayofmonth'] = all_data['date'].dt.days_in_month
all_data['dayofyear'] = all_data['date'].dt.dayofyear
# NOTE: `weekday` is an alias of `dayofweek`, so this column duplicates it.
all_data['weekday'] = all_data['date'].dt.weekday
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# `isocalendar().week` is its replacement; it returns the nullable UInt32
# dtype, so cast back to the plain int64 that `weekofyear` used to produce.
all_data['weekofyear'] = all_data['date'].dt.isocalendar().week.astype('int64')

In [None]:
# Remove the target, the raw date, and the row identifier so that only
# model features remain in the combined frame.
non_feature_cols = ['num_sold', 'date', 'row_id']
all_data = all_data.drop(columns=non_feature_cols)
all_data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with the integer codes produced by
# LabelEncoder. A single encoder object is reused: `fit_transform` refits it
# for every column, so after the loop it only remembers the last column.
le = LabelEncoder()
categorical_cols = ['country', 'product', 'store']
for col in categorical_cols:
    encoded = le.fit_transform(all_data[col])
    all_data[col] = encoded

all_data

In [None]:
# Undo the earlier concat by position: the first len(train) rows are the
# training features, everything after them is the test features.
n_train = len(train)
train2 = all_data.iloc[:n_train]
test2 = all_data.iloc[n_train:]
# The target is taken from the untouched `train` frame.
y = train['num_sold']

# **Modeling**

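* Validation uses scikit-learn's `TimeSeriesSplit` with 10 splits; one CatBoost model is trained per split and their test predictions are averaged.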

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# 10-split time-series cross-validator used by the training loop.
kfold = TimeSeriesSplit(n_splits=10)

In [None]:
from catboost import CatBoostRegressor

# Train one CatBoost regressor per split and collect each model's predictions
# on the test features; the per-fold predictions are stored in `test_pred`.
test_pred = []
for fold_no, (fit_idx, valid_idx) in enumerate(kfold.split(train2), start=1):
    print('<------- fold', fold_no, '------->')
    x_train, x_valid = train2.iloc[fit_idx], train2.iloc[valid_idx]
    y_train, y_valid = y.iloc[fit_idx], y.iloc[valid_idx]

    cat = CatBoostRegressor(n_estimators=10000)
    # The validation slice is passed as eval_set so early stopping can monitor it.
    cat.fit(
        x_train,
        y_train,
        eval_set=(x_valid, y_valid),
        verbose=1000,
        early_stopping_rounds=1500,
    )
    fold_pred = cat.predict(test2)
    test_pred.append(fold_pred)

In [None]:
# Average the per-fold test predictions element-wise (one value per test row).
sold = np.asarray(test_pred).mean(axis=0)

In [None]:
# Fill the sample submission's `num_sold` column with the averaged
# predictions and write the result to sub.csv without the index column.
sample_csv = '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv'
sub = pd.read_csv(sample_csv).assign(num_sold=sold)
sub.to_csv('sub.csv', index=False)