In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

### read data

In [None]:
# Load the competition's training and test tables from the Kaggle input folder.
# The generator is consumed by the tuple unpacking, so both CSVs are read here,
# in the order train -> test.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jan-2022/train.csv',
        '../input/tabular-playground-series-jan-2022/test.csv',
    )
)

### data preprocessing

In [None]:
# Peek at the first rows of the raw training table (5 is the pandas default).
train.head(n=5)

In [None]:
# Remove the columns that are not used as model features, identically for the
# training and the test table so both keep the same feature layout.
# NOTE(review): dropping `date` discards any temporal signal - confirm this is
# intentional for the baseline.
train = train.drop(columns=['row_id', 'date'])
test = test.drop(columns=['row_id', 'date'])

In [None]:
# Separate the regression target from the feature columns.
y_data = train['num_sold']
x_data = train.drop(columns='num_sold')

In [None]:
# Replace every feature column with integer labels. Each encoder is fitted on
# the training values only and the same mapping is then applied to the test
# table, so a given category gets the same code in both frames.
for col in x_data.columns:
    encoder = LabelEncoder().fit(x_data[col])
    train_codes = encoder.transform(x_data[col])
    test_codes = encoder.transform(test[col])
    x_data[col] = train_codes
    test[col] = test_codes

### train model

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``.
    A sample whose true and predicted values are both zero contributes 0
    instead of an undefined 0/0.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (list, ndarray, pandas Series or scalar).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        The mean SMAPE over all samples, in the range [0, 200].
        Empty input yields ``nan`` (the mean of nothing).
    """
    # Work on plain float arrays: this makes the comparison positional (no
    # pandas index alignment) and lets lists and scalars be passed as well.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Both magnitudes go into the denominator; using the signed y_true would
    # let a negative value cancel the prediction and zero the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries keep
    # their initial 0.0, which avoids the divide-by-zero warning entirely.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

In [None]:
# Hold out part of the labelled data (scikit-learn's default split) for
# validation. The fixed random_state makes the split - and therefore every
# score reported below - reproducible across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, random_state=42)

In [None]:
# Sanity-check the encoded training features (5 is the pandas default).
x_train.head(n=5)

In [None]:
# Fit a single decision tree on the training split. random_state is fixed
# because scikit-learn permutes the features at every split, so ties between
# equally good splits would otherwise be broken differently on each run.
model = DecisionTreeRegressor(random_state=42)
model.fit(x_train, y_train)

# Predict on the data the tree was fitted on and on the hold-out split.
# Note that `x_test` / `test_pred` refer to the hold-out split, not to the
# competition `test` table.
train_pred = model.predict(x_train)
test_pred = model.predict(x_test)

# SMAPE in percent; the gap between the two numbers indicates overfitting.
train_score = SMAPE(y_train, train_pred)
test_score = SMAPE(y_test, test_pred)

print('train_score ', train_score, 'test_score', test_score)


### prediction and submission

In [None]:
y_test = model.predict(test)

In [None]:
submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
submission.num_sold = y_test
submission.to_csv('submission.csv', index=False)