# Tabular Playground Series (January 2022)
## Import Data and Gather Insights

In [1]:
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor

# Load the training set indexed by `row_id` and turn the YYYY-MM-DD strings in
# `date` into real timestamps in the same expression.
training_df = (
    pd.read_csv("data/train.csv", index_col='row_id')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
)
training_df.head()

Unnamed: 0_level_0,date,country,store,product,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


In [2]:
# Print a summary of the frame: index range, column dtypes, non-null counts
# and memory usage.
training_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26298 entries, 0 to 26297
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      26298 non-null  datetime64[ns]
 1   country   26298 non-null  object        
 2   store     26298 non-null  object        
 3   product   26298 non-null  object        
 4   num_sold  26298 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 1.2+ MB


In [3]:
# Distinct labels of each categorical column, in order of first appearance.
# These series are the input for the label -> integer lookup tables.
countries = training_df['country'].drop_duplicates()
stores = training_df['store'].drop_duplicates()
products = training_df['product'].drop_duplicates()

# Collapse every timestamp to a month-day integer (1 January -> 101).
# The "%m%d" format keeps no year information.
training_df['date'] = (
    training_df['date']
    .dt.strftime("%m%d")
    .astype(int)
)

In [4]:
def map_str_to_int(series: pd.Series) -> Dict:
    """Build a lookup table from each item in ``series`` to an integer code.

    Codes are the zero-based positions of the items in iteration order. If an
    item occurs more than once, the position of its last occurrence is kept.

    Args:
        series: Iterable of hashable labels to encode.

    Returns:
        A dict mapping every distinct item to its integer code.
    """
    return {label: position for position, label in enumerate(series)}

In [5]:
# Encode each categorical column with its own lookup table. The nested-dict
# form of `replace` restricts every mapping to the named column, so a label is
# never rewritten in a column it does not belong to (a flat dict is applied to
# the whole frame). Re-running the cell is harmless: once the labels are gone
# there is nothing left to replace.
training_df = training_df.replace(
    to_replace={
        'country': map_str_to_int(countries),
        'store': map_str_to_int(stores),
        'product': map_str_to_int(products),
    }
)
training_df.head()

Unnamed: 0_level_0,date,country,store,product,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,101,0,0,0,329
1,101,0,0,1,520
2,101,0,0,2,146
3,101,0,1,0,572
4,101,0,1,1,911


## Train MLP Regression Model

In [6]:
# Feature matrix (one row per sale record) and the 1-D target vector.
feature_columns = ['date', 'country', 'store', 'product']
training_data = training_df.loc[:, feature_columns].to_numpy()
target_values = training_df['num_sold'].to_numpy()

In [7]:
# Fix the seed so weight initialisation and batch sampling are repeatable
# across "Restart & Run All"; every other hyper-parameter stays at its default.
mlp_regressor = MLPRegressor(random_state=42).fit(training_data, target_values)

## Test MLP Regression Model

In [8]:
# Load the test set indexed by `row_id` and turn the YYYY-MM-DD strings in
# `date` into real timestamps in the same expression.
test_df = (
    pd.read_csv("data/test.csv", index_col='row_id')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
)
test_df.head()

Unnamed: 0_level_0,date,country,store,product
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
26302,2019-01-01,Finland,KaggleRama,Kaggle Hat


In [9]:
# Collapse every timestamp to a month-day integer (1 January -> 101); the
# "%m%d" format keeps no year information.
test_df['date'] = test_df['date'].dt.strftime("%m%d").astype(int)

# Encode the categorical columns with lookup tables built from the label
# series collected from the training frame, so both frames share one set of
# codes. The nested-dict form of `replace` restricts every mapping to the
# named column (a flat dict is applied to the whole frame).
test_df = test_df.replace(
    to_replace={
        'country': map_str_to_int(countries),
        'store': map_str_to_int(stores),
        'product': map_str_to_int(products),
    }
)
test_df.head()

Unnamed: 0_level_0,date,country,store,product
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26298,101,0,0,0
26299,101,0,0,1
26300,101,0,0,2
26301,101,0,1,0
26302,101,0,1,1


In [10]:
# Select the feature columns explicitly, in the order the model was fitted on.
# Converting the whole frame would also pick up `num_sold` once this cell has
# run, so a re-run would hand the model five columns instead of four.
feature_columns = ['date', 'country', 'store', 'product']
test_data = test_df[feature_columns].to_numpy()

# Round the predictions to whole units and store them as integers.
predictions = np.round(mlp_regressor.predict(test_data))
test_df['num_sold'] = predictions.astype(np.int64)
test_df.head()

Unnamed: 0_level_0,date,country,store,product,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26298,101,0,0,0,366
26299,101,0,0,1,281
26300,101,0,0,2,195
26301,101,0,1,0,580
26302,101,0,1,1,494


In [11]:
# Write the predicted `num_sold` column, together with the `row_id` index, to
# predictions.csv in the working directory.
test_df['num_sold'].to_csv('predictions.csv')