In [None]:
import numpy as np
import pandas as pd

In [None]:
# Directory holding the competition CSVs; change it here if the data is
# mounted somewhere else.
DATA_DIR = '/kaggle/input/tabular-playground-series-jan-2022'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Preview the first five raw rows (five is head()'s default).
train.head()

Let's look at how sales change over time for a particular product in a particular country and a particular store.

In [None]:
import seaborn as sns
# Apply seaborn's defaults and override the figure size to 20 x 5 so the
# line plots drawn by show_sells() are wide and short.
sns.set(rc={'figure.figsize':(20,5)})

def show_sells(country, store, product):
    """Plot ``num_sold`` over time for one country / store / product combination.

    Reads the module-level ``train`` frame and keeps the rows whose
    ``country``, ``store`` and ``product`` columns equal the given values.
    The calls in this notebook pass the raw string labels, so they must run
    before those columns of ``train`` are replaced by integer codes.

    Parameters
    ----------
    country, store, product
        Values compared against the corresponding columns of ``train``.
    """
    mask = (
        (train['country'] == country)
        & (train['product'] == product)
        & (train['store'] == store)
    )
    # assign() returns a new frame, so the parsed dates are never written into
    # a slice of `train` (the original in-place assignment triggered pandas'
    # SettingWithCopyWarning).
    sells = train.loc[mask].assign(date=lambda frame: pd.to_datetime(frame['date']))
    ax = sns.lineplot(data=sells, x='date', y='num_sold')
    # Label the figure so plots for different combinations can be told apart.
    ax.set_title(f'{product} | {store} | {country}')

In [None]:
show_sells(country='Finland', store='KaggleMart', product='Kaggle Mug')

In [None]:
show_sells(country='Norway', store='KaggleMart', product='Kaggle Sticker')

In [None]:
show_sells(country='Sweden', store='KaggleRama', product='Kaggle Hat')

This data definitely has seasonality. The month matters: December is the month when sales are particularly high. Different products have different sales dynamics; sometimes the curve is close to a sine wave.

In [None]:
from sklearn.preprocessing import LabelEncoder


def transform_labels(df):
    """Return a copy of ``df`` with its categorical columns integer-encoded.

    The ``country``, ``store`` and ``product`` columns are each replaced by
    the codes produced by a ``LabelEncoder`` fitted on that column. The input
    frame is left untouched, so re-running the calling cell or inspecting the
    raw frame afterwards still works.

    Note: the encoder is fitted on the values of ``df`` itself, so two frames
    receive matching codes only if they contain the same set of values in
    each of these columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``country``, ``store`` and ``product`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with those three columns replaced by integer codes.
    """
    encoded = df.copy()
    for column in ('country', 'store', 'product'):
        # A fresh encoder per column: nothing learned from one column is
        # carried over to the next.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

In [None]:
# Swap the string labels in `train` for integer codes, then preview the result.
train = train.pipe(transform_labels)
train.head()

Adding the weekday and the day of the year as features helped a lot. Adding the year made the results worse.

In [None]:
def transform_date(df):
    """Return a new frame with ``date`` expanded into numeric calendar features.

    Adds ``day`` (day of month), ``month``, ``weekday`` (Monday = 0) and
    ``yearday`` (day of year) derived from the ``date`` column, then drops
    ``date`` and ``row_id``. The year is deliberately not added as a feature.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date`` column parseable by ``pd.to_datetime``
        and a ``row_id`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``date`` / ``row_id`` and with the four calendar
        columns appended in the order listed above.

    Raises
    ------
    KeyError
        If ``date`` or ``row_id`` is missing from ``df``.
    """
    dates = pd.to_datetime(df['date'])
    # assign() builds a new frame, so the caller's `df` keeps its original
    # columns and its original `date` dtype.
    with_features = df.assign(
        day=dates.dt.day,
        month=dates.dt.month,
        weekday=dates.dt.dayofweek,
        yearday=dates.dt.dayofyear,
    )
    return with_features.drop(columns=['date', 'row_id'])

# Replace the raw date / row_id columns of `train` with calendar features.
train = train.pipe(transform_date)

In [None]:
# Separate the feature matrix from the regression target.
X = train.drop(columns='num_sold')
Y = train.loc[:, 'num_sold']

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows as a dev set; the fixed seed keeps the split
# reproducible.
# NOTE(review): train_test_split shuffles by default, so dev rows are
# interleaved in time with the training rows - confirm this is the intended
# validation scheme for a forecasting task.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def smape(A, F):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 / N * sum(|A - F| / ((|A| + |F|) / 2))`` where ``N`` is the
    length of ``A`` along its first axis.

    Inputs are converted to plain NumPy arrays first, so values are paired by
    position. With pandas Series, arithmetic would instead align on index
    labels and silently pair the wrong rows (or produce NaNs) whenever the
    two indexes differ. Terms where both the actual and the forecast are zero
    contribute 0 rather than NaN.

    Parameters
    ----------
    A : array-like
        Actual values.
    F : array-like
        Forecast values, broadcastable against ``A``.

    Returns
    -------
    float
        The SMAPE score; 0 is a perfect forecast.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    N = actual.shape[0]

    abs_error = np.abs(actual - forecast)
    scale = (np.abs(actual) + np.abs(forecast)) / 2
    # A zero scale means actual == forecast == 0: a perfect prediction, so the
    # term stays at the 0 pre-filled in `out` instead of evaluating 0 / 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 / N * np.sum(ratio)
    

The graphs showed that linear methods won't work: they can't handle the spikes at the end of every year. Tree-based models can handle that.

In [None]:
from xgboost import XGBRegressor

model = XGBRegressor(eta=0.05)
model.fit(X_train, y_train)

# Give the predictions y_dev's index. y_dev keeps the shuffled row labels from
# the split, and Series arithmetic aligns on labels, so a default 0..n-1 index
# would pair each prediction with the wrong actual value (or with nothing).
y_pred = pd.Series(model.predict(X_dev), index=y_dev.index)

smape(y_dev, y_pred)

In [None]:
# Apply the same feature pipeline to the test set as was applied to `train`.
test = transform_labels(test)
test = transform_date(test)

# Round to the nearest whole unit before casting: a bare astype(int)
# truncates toward zero, which pushes every positive prediction down.
Y_test = np.rint(model.predict(test)).astype(int)


In [None]:
# Start from the sample submission (it already has the expected layout) and
# fill in the predicted sales.
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv')
result = sample_submission.assign(num_sold=pd.Series(Y_test))

In [None]:
# Write the submission file without the DataFrame index column.
result.to_csv(path_or_buf='submission.csv', index=False)