In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition CSVs. Only the training frame uses `row_id` as its
# index; the test frame is read with pandas' default index, so `row_id`
# remains a regular column there.
train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jan-2022/train.csv",
    index_col='row_id',
)
test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jan-2022/test.csv",
)

In [None]:
# Show the loaded training frame as this cell's output.
train

In [None]:
# Print pandas' summary of the training frame (columns, non-null counts, dtypes).
train.info()

# Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Names of the columns that the encoding cells below operate on.
encode_cols = ['country','store','product']

## Label Encode (Enumerating)

In [None]:
# Fit one LabelEncoder per column on the original frame and keep each fitted
# encoder, keyed by column name.
label_encoders = {col: LabelEncoder().fit(train[col]) for col in encode_cols}

# Write the integer codes into a copy so `train` itself is left unchanged.
train_encode = train.copy()
for col, encoder in label_encoders.items():
    train_encode[col] = encoder.transform(train_encode[col])

In [None]:
# Show the label-encoded copy of the training frame.
train_encode

## Manual Encoding

In [None]:
def enumerate_column(df_col):
    """Manually enumerate the distinct values of a column.

    Every distinct value is given an integer code in the order returned by
    ``df_col.unique()``. The value-to-code mapping is printed, and a new
    Series with each value swapped for its code is returned. The input
    Series is not modified.

    Parameters
    ----------
    df_col : pandas.Series
        Column whose values should be enumerated.

    Returns
    -------
    pandas.Series
        Series with the same index and name as ``df_col``, holding the
        integer code of each original value.
    """
    elements = {value: code for code, value in enumerate(df_col.unique())}
    print(elements)
    # Every value in the column is a key of `elements`, so `map` is a plain
    # lookup that yields integer codes directly. Dict-based `replace` from
    # strings to ints only produced an integer column through pandas'
    # deprecated silent downcasting.
    return df_col.map(elements)

In [None]:
# NOTE(review): the next cell rebuilds `train_manual_encode` from `train`
# before using it, so this cell looks redundant and can probably be removed.
train_manual_encode = train.copy()

In [None]:
# Start from a fresh copy of `train`, then overwrite the three categorical
# columns with the integer codes produced column-by-column by
# `enumerate_column`.
manual_encoded = train[['country', 'store', 'product']].apply(enumerate_column)
train_manual_encode = train.copy()
train_manual_encode[['country','store','product']] = manual_encoded
train_manual_encode

## One Hot Encode

In [None]:
# Fit a one-hot encoder on the columns to encode (`fit` returns the encoder
# itself) and show the categories it found for each column.
onehotencoder = OneHotEncoder().fit(train[encode_cols])
onehotencoder.categories_

In [None]:
train_oe = train.copy()
train_oe[onehotencoder.get_feature_names()] =(
    onehotencoder
    .transform(train[encode_cols])
    .toarray()
    .astype(int)
)
train_oe

In [None]:
# Check if inverse transform of one hot encodeing is the same as initial country, store and product columns
np.array_equal(
    onehotencoder.inverse_transform(train_oe[onehotencoder.get_feature_names()]),
    train[['country', 'store', 'product']]
)

## Splitting Data by Unique Country, Store and Product Combination

In [None]:
# Enumerate every distinct (country, store, product) triple. np.unique with
# axis=0 de-duplicates whole rows and returns them in sorted order; each
# triple is keyed by its position in that ordering.
unique_triples = np.unique(
    train[['country', 'store', 'product']].values.tolist(), axis=0
)
combinations = dict(enumerate(unique_triples))

# For each triple, keep only the matching rows and reduce them to a
# date-indexed frame holding `num_sold`.
splitted_dfs = {}
for key, (country, store, product) in combinations.items():
    row_mask = (
        (train['country'] == country)
        & (train['store'] == store)
        & (train['product'] == product)
    )
    splitted_dfs[key] = train.loc[row_mask, ['date', 'num_sold']].set_index('date')

In [None]:
# splitted_dfs