## EDA notebook

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Input CSV locations.
TRAIN_PATH = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
TEST_PATH = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'

# Column roles used throughout the notebook.
ID_COL = 'row_id'
TARGET_COL = 'num_sold'
DATE_COL = 'date'
DATE_FREQ = 'D'  # frequency alias passed to `.dt.to_period`
CAT_COLS = ['country', 'store', 'product']
TS_COLS = ['country', 'store', 'product']  # columns unstacked into separate series later on

# Explicit dtypes handed to `pd.read_csv`; train has the same columns as
# test plus the target.
TEST_DTYPE = {
    'row_id': 'uint32',
    'country': 'category',
    'store': 'category',
    'product': 'category',
}
TRAIN_DTYPE = {**TEST_DTYPE, 'num_sold': 'uint32'}



In [None]:
# Read both CSVs with explicit dtypes and parse the date column.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (where it only triggers a warning), and the parsed values are the
# same without it.
train = pd.read_csv(TRAIN_PATH, dtype=TRAIN_DTYPE, parse_dates=[DATE_COL])
test = pd.read_csv(TEST_PATH, dtype=TEST_DTYPE, parse_dates=[DATE_COL])

In [None]:
# Convert the parsed timestamps to periods of frequency DATE_FREQ in both frames.
for _frame in (train, test):
    _frame[DATE_COL] = _frame[DATE_COL].dt.to_period(DATE_FREQ)

Validation: inspect the data to determine what cleaning is needed.

In [None]:
# Peek at the end of the training frame (swap in the commented line for test).
PREVIEW_ROWS = 3
train.tail(PREVIEW_ROWS)
# test.tail(PREVIEW_ROWS)

In [None]:
def _missing_ratio(frame):
    """Return the percentage of null values per column of `frame`.

    The result is a one-column frame named ``missing_ratio``, sorted in
    descending order and restricted to columns with at least one null.
    """
    percent_null = (frame.isnull().sum() / len(frame)) * 100
    ranked = percent_null.sort_values(ascending=False)
    return ranked.to_frame(name="missing_ratio").query("missing_ratio>0")


train_missing_ratio = _missing_ratio(train)
test_missing_ratio = _missing_ratio(test)

In [None]:
# For every categorical column, tabulate how many distinct values it has and
# what they are, side by side for train and test.
_unique_summary = {}
for _name, _frame in (('train', train), ('test', test)):
    _unique_summary[f'{_name}_nunique'] = _frame[CAT_COLS].nunique()
    _unique_summary[f'{_name}_unique'] = [
        _frame[col].unique().to_list() for col in CAT_COLS
    ]
cat_col_unique = pd.DataFrame(_unique_summary)


In [None]:
# Number of series = product of the distinct-value counts of the TS_COLS
# columns. `np.prod` is used because the `np.product` alias is deprecated and
# no longer exists in NumPy 2.0.
train_num_series = np.prod(train[TS_COLS].nunique())
test_num_series = np.prod(test[TS_COLS].nunique())

In [None]:
def _full_date_range(frame):
    """Return every calendar day between the first and last date in `frame`."""
    dates = frame[DATE_COL]
    first_day = dates.min().strftime('%Y-%m-%d')
    last_day = dates.max().strftime('%Y-%m-%d')
    return pd.date_range(start=first_day, end=last_day)


train_possible_dates = _full_date_range(train)
test_possible_dates = _full_date_range(test)

In [None]:
# Text summary of the validation checks computed in the cells above.
print(f"train columns:\n{train.columns}")
print(f"test columns:\n{test.columns}\n")

print(f"train shape:\n{train.shape}")
print(f"test shape:\n{test.shape}\n")

print(f"train dtypes:\n{train.dtypes}")
print(f"test dtypes:\n{test.dtypes}\n")

print(f"train dtypes value_counts:\n{train.dtypes.value_counts()}")
print(f"test dtypes value_counts:\n{test.dtypes.value_counts()}\n")

print(f"train_missing_ratio:\n{train_missing_ratio}")
print(f"test_missing_ratio:\n{test_missing_ratio}\n")

print(f"cat_col_unique:\n{cat_col_unique}\n")

print(f"train_num_series:\n{train_num_series}")
print(f"test_num_series:\n{test_num_series}\n")

# The row count matches (days in range) x (number of series) only when no
# (date, series) combination is absent.
print(f"No missing dates in train: {(len(train_possible_dates)*train_num_series)==len(train)}")
print(f"No missing dates in test: {(len(test_possible_dates)*test_num_series)==len(test)}\n")

# Number of distinct dates in test, reported in DATE_FREQ units.
print(f"forecast horizon: {test[DATE_COL].nunique()} {DATE_FREQ}")

Plots: sales over time for each country and store-product combination.

In [None]:
# Combined "<store>-<product>" label, used as the facet column in the plot below.
# Built with vectorized string concatenation rather than a row-wise
# `apply(..., axis=1)`, which calls a Python function once per row.
train['store_product'] = train['store'].astype(str) + '-' + train['product'].astype(str)
train

In [None]:
# Calendar features derived from the date column.
_dates = train['date']
train['date_str'] = [period.strftime('%Y-%m-%d') for period in _dates]

# Note: 'day' is the day of the week (same values as 'weekday'), not the day
# of the month.
_calendar_fields = {
    'year': _dates.dt.year,
    'month': _dates.dt.month,
    'weekday': _dates.dt.weekday,
    'week': _dates.dt.week,
    'day': _dates.dt.dayofweek,
}
for _column, _values in _calendar_fields.items():
    train[_column] = _values

In [None]:
# double click the image to enlarge
# One line plot of num_sold over time per (country, store_product) facet.
g = sns.FacetGrid(
    train,
    row="country",
    col="store_product",
    sharex=True,
    sharey=False,
    margin_titles=True,
    height=4,
    aspect=4,
)
g.map(sns.lineplot, "date_str", "num_sold")

# One tick per calendar month: the date string of the first row seen for each
# (year, month) pair. Computed once (it is the same for every facet) and looked
# up by column name instead of by positional column index.
labels = train.drop_duplicates(subset=['year', 'month'], keep='first')['date_str'].to_list()
for axes in g.axes.flat:
    axes.set_xticks(labels)
    axes.set_xticklabels(labels, rotation=90)

In [None]:
# Reshape to wide format: rows are indexed by date, and every remaining column
# is spread across the TS_COLS combinations.
train = (
    train
    .set_index(TS_COLS + [DATE_COL])
    .sort_index()
    .unstack(TS_COLS)
)

In [None]:
# Feature frame for the first series in the wide table.
# After the unstack, the first column of `train` overall is a `row_id` column,
# so the target has to be selected by name before taking the first series;
# `train.iloc[:, 0]` would put row ids under the 'num_sold' label.
_first_series = train['num_sold'].iloc[:, 0]
_periods = _first_series.index  # the date periods left in the index after unstacking

X = pd.DataFrame({
    'num_sold': _first_series,
    'week': _periods.week,
    'day': _periods.dayofweek,  # day of the week, not day of the month
    'year': _periods.year,
})
