# DATA_PATH

In [None]:
import os

# Detect where the notebook is running by looking for marker environment variables.
IN_DOCKER = 'AK_KAGGLE_ENV' in os.environ
IN_COLAB = 'COLAB_GPU' in os.environ
IN_KAGGLE = 'KAGGLE_DATA_PROXY_URL' in os.environ
# A local PC session is a VS Code session that is not the Kaggle docker image.
IN_PC = 'VSCODE_CWD' in os.environ and 'AK_KAGGLE_ENV' not in os.environ

# Pick the data root for the detected environment.
if IN_PC:
    # Prefer ../../data when it exists, otherwise fall back to ../../../data.
    DATA_PATH = os.path.abspath(
        "../../data" if os.path.isdir("../../data") else "../../../data"
    )
elif IN_COLAB:
    DATA_PATH = "/content/drive/MyDrive/Colab_Kaggle/data"
else:
    # Everything else is treated as a Kaggle kernel.
    DATA_PATH = "/kaggle"

# Imports

In [None]:
# import os
# import sys
import numpy as np 
import pandas as pd
# import time
# import gc
import warnings

import holidays
import dateutil.easter as easter

# Sklearn
# from sklearn.model_selection import *
# from sklearn.impute import *
from sklearn.preprocessing import *
# from sklearn.metrics import *
# from sklearn.pipeline import *
# from sklearn.compose import *
# from sklearn.ensemble import *
# from sklearn.linear_model import *


# Notebook display settings: show every row and column, and render floats
# with a thousands separator.
_DISPLAY_OPTIONS = {
    'display.max_rows': None,
    'display.max_columns': None,
    'float_format': '{:,}'.format,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Variables

In [None]:
# Column names and competition identifiers.
target_col = 'num_sold'  # name of target column
id_col = 'row_id'
competition = 'tps0122'
fullname_competition = 'tabular-playground-series-jan-2022'

# Directory layout below DATA_PATH; every path keeps its trailing slash
# because later cells append file names by plain string concatenation.
WORKING_PATH = f"{DATA_PATH}/working/{competition}/"
ORGIN_CSV_PATH = f"{DATA_PATH}/input/{fullname_competition}/"
CSV_PATH = f"{WORKING_PATH}CSV/"

# Create the output directory (and any missing parents) if it is not there yet.
os.makedirs(CSV_PATH, exist_ok=True)

In [None]:
# Load the competition's raw train and test tables from the input directory.
train = pd.read_csv(f"{ORGIN_CSV_PATH}train.csv")
test = pd.read_csv(f"{ORGIN_CSV_PATH}test.csv")


In [None]:
# Print the full column summary of the raw test frame, with deep memory usage.
test.info(verbose=True, memory_usage="deep")

# Encoder (Text 2 Label)

In [None]:
# Encode each categorical text column as integer labels in a new
# `<column>_label` column on both frames.
#
# One encoder per column is fitted on the union of the train and test values
# and then applied to both frames. Fitting two independent encoders (one per
# frame) would assign different integers to the same category whenever the two
# frames do not contain exactly the same set of categories.
for col in ['country', 'store', 'product']:
    encoder = LabelEncoder().fit(pd.concat([train[col], test[col]]))
    train[col + '_label'] = encoder.transform(train[col])
    test[col + '_label'] = encoder.transform(test[col])

# Feature engineering

# Holiday
* [Thanks] <https://www.kaggle.com/lucamassaron/festivities-in-finland-norway-sweden>

In [None]:
# Collect [date, holiday name, country] rows for the three countries.
holiday_years = [2015, 2016, 2017, 2018, 2019]
holiday_list = []

# Finland and Norway: every holiday is kept as-is (Finland first, then Norway).
for country_name, calendar in (
    ("Finland", holidays.Finland(years=holiday_years, observed=True)),
    ("Norway", holidays.Norway(years=holiday_years, observed=True)),
):
    holiday_list.extend(
        [holiday_date, holiday_name, country_name]
        for holiday_date, holiday_name in calendar.items()
    )

# Sweden: skip entries named exactly 'Söndag' and strip the ", Söndag"
# suffix from the remaining names.
for holiday_date, holiday_name in holidays.Sweden(years=holiday_years, observed=True).items():
    if holiday_name == 'Söndag':
        continue
    holiday_list.append([holiday_date, holiday_name.replace(", Söndag", ""), "Sweden"])

In [None]:
# Turn the collected rows into a frame and convert `date` to pandas datetimes
# so it can be used as a merge key against the sales frames.
holidays_df = pd.DataFrame(
    holiday_list, columns=['date', 'holiday', 'country']
).assign(date=lambda frame: pd.to_datetime(frame['date']))

In [None]:
# Notebook display: (rows, columns) of the holiday table.
holidays_df.shape

In [None]:
# Notebook display: 10 randomly sampled rows of the holiday table.
holidays_df.sample(10)

# Special dates

In [None]:
# Collect [date, event label, country] rows for hand-picked special dates.
special_years = [2015, 2016, 2017, 2018, 2019]
nordic_countries = ['Finland', 'Sweden', 'Norway']
special_dates_list = []

# Last week of the year: 24-31 December, labelled day 1..8, for every country.
for year in special_years:
    for day_number, day in enumerate(range(24, 32), start=1):
        event_date = pd.to_datetime(f"{year}-12-{day}").date()
        event_label = f"Last week of the year (day {day_number})"
        special_dates_list.extend(
            [event_date, event_label, country] for country in nordic_countries
        )

# Swedish Rock Concert: (first day, last day, year) in June, Sweden only.
rock_concert_spans = [(3, 6, 2015), (8, 11, 2016), (7, 10, 2017), (6, 10, 2018), (5, 8, 2019)]
for first_day, last_day, year in rock_concert_spans:
    for day_number, day in enumerate(range(first_day, last_day + 1), start=1):
        special_dates_list.append([
            pd.to_datetime(f"{year}-6-{day}").date(),
            f"Swedish Rock Concert (day {day_number})",
            "Sweden",
        ])

# Single-day events shared by all three countries, listed by explicit date.
shared_events = [
    ("Last Wednesday of June",
     ['2015-06-24', '2016-06-29', '2017-06-28', '2018-06-27', '2019-06-26']),
    ("First Sunday of November",
     ['2015-11-1', '2016-11-6', '2017-11-5', '2018-11-4', '2019-11-3']),
]
for event_label, date_strings in shared_events:
    for date_string in date_strings:
        event_date = pd.to_datetime(date_string).date()
        special_dates_list.extend(
            [event_date, event_label, country] for country in nordic_countries
        )

# Independence Day of Finland: 6 December, Finland only.
special_dates_list.extend(
    [pd.to_datetime(f"{year}-12-6").date(), "Independence Day of Finland", 'Finland']
    for year in special_years
)

# Easter Sunday of each year, for every country.
easter_date = [easter.easter(year) for year in special_years]
for easter_day in easter_date:
    event_date = pd.to_datetime(easter_day).date()
    special_dates_list.extend(
        [event_date, "Easter", country] for country in nordic_countries
    )

In [None]:
# Turn the collected rows into a frame and convert `date` to pandas datetimes
# so it can be used as a merge key against the sales frames.
special_dates_df = pd.DataFrame(
    special_dates_list, columns=['date', 'special_dates', 'country']
).assign(date=lambda frame: pd.to_datetime(frame['date']))

In [None]:
# Notebook display: (rows, columns) of the special-dates table.
special_dates_df.shape

In [None]:
# Notebook display: 10 randomly sampled rows of the special-dates table.
special_dates_df.sample(10)

# Public & Unofficial holidays

In [None]:
# Load the external public/unofficial holiday table, rename its `type` column
# to `event_type`, and convert `date` to pandas datetimes.
holidays_official = (
    pd.read_csv(f"{DATA_PATH}/input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv")
    .rename(columns={"type": "event_type"})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# Notebook display: 5 randomly sampled rows of the external holiday table.
holidays_official.sample(5)

In [None]:
# Notebook display: distinct values of the `country` column in the external holiday table.
holidays_official.country.unique()

# GDP

In [None]:
# Load the GDP table, index it by `year`, and rename the GDP_<country> columns
# to the bare country names so a value can be looked up as gdp_df.loc[year, country].
gdp_df = (
    pd.read_csv(f"{DATA_PATH}/input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv")
    .set_index('year')
    .rename(columns={"GDP_Finland": "Finland", "GDP_Norway": "Norway", "GDP_Sweden": "Sweden"})
)

In [None]:
# Notebook display: first rows of the GDP table.
gdp_df.head()

# Feature engineering

In [None]:
# Notebook display: first rows of the special-dates table.
special_dates_df.head()

In [None]:
def feature_engineering(df):
    """Add calendar, holiday, special-date and GDP features to a sales frame.

    Relies on the module-level lookup tables ``holidays_df``,
    ``special_dates_df`` and ``gdp_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse and a
        ``country`` column whose values are column names of ``gdp_df``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns ``year``, ``month``, ``week``
        (ISO week number), ``day``, ``quarter``, ``dayofyear``, ``weekend``,
        ``weekday``, ``holiday``, ``special_dates``, ``is_holiday``,
        ``is_special_dates`` and ``gdp_year``. The input frame is not modified.

    Raises
    ------
    KeyError
        If a (year, country) pair of ``df`` has no entry in ``gdp_df``.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Calendar
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    # `Series.dt.week` was removed in pandas 2.0; `isocalendar().week` is its
    # replacement. Cast to int64 to keep the dtype the old accessor produced.
    df['week'] = df['date'].dt.isocalendar().week.astype('int64')
    df['day'] = df['date'].dt.day
    df['quarter'] = df['date'].dt.quarter
    df['dayofyear'] = df['date'].dt.dayofyear
    df['weekend'] = df['date'].dt.weekday >= 5
    df['weekday'] = df['date'].dt.weekday

    # holidays & special_dates
    df = df.merge(holidays_df, on=['date', 'country'], how='left')
    df = df.merge(special_dates_df, on=['date', 'country'], how='left')
#     df = df.merge(holidays_official, on=['date', 'country'], how='left')

    # A row is a holiday / special date when the left merge found a match.
    df['is_holiday'] = df['holiday'].notna()
    df['is_special_dates'] = df['special_dates'].notna()

    # Public & Unofficial holidays
#     df['event_type'] = df['event_type'].astype('str')
#     df['event_type_label'] = LabelEncoder().fit_transform(df['event_type'])

    # GDP: build a (year, country) -> value table once instead of running a
    # Python-level `.loc` lookup per row; a missing pair still raises KeyError.
    gdp_lookup = {
        (year, country): value
        for country, column in gdp_df.items()
        for year, value in column.items()
    }
    df['gdp_year'] = [gdp_lookup[key] for key in zip(df['year'], df['country'])]

    return df

# Replace both frames with their feature-engineered versions, then print the
# full column summary of the resulting test frame.
train, test = feature_engineering(train), feature_engineering(test)
test.info(verbose=True, memory_usage="deep")


In [None]:
# Print the full column summary of the engineered train frame, with deep memory usage.
train.info(verbose=True, memory_usage="deep")

In [None]:
# Print the full column summary of the engineered test frame, with deep memory usage.
test.info(verbose=True, memory_usage="deep")

In [None]:
# Notebook display: 10 randomly sampled rows of the engineered train frame.
train.sample(10)

# Pandas dummies

In [None]:
# One-hot encode the three categorical columns of each frame independently.
dummy_columns = ['store', 'country', 'product']
train_dum = pd.get_dummies(train, columns=dummy_columns)
test_dum = pd.get_dummies(test, columns=dummy_columns)

In [None]:
# Print the column summary of the one-hot encoded train frame.
train_dum.info()

In [None]:
# Notebook display: 10 randomly sampled rows of the one-hot encoded train frame.
train_dum.sample(10)

# Save CSV

In [None]:
# Write the engineered (not one-hot encoded) frames to the working CSV
# directory, without the index and with floats formatted via '%g'.
for engineered_frame, csv_name in ((train, 'train-ak.csv'), (test, 'test-ak.csv')):
    engineered_frame.to_csv(CSV_PATH + csv_name, index=False, float_format='%g')