In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import xgboost
from sklearn import preprocessing

# Data Loading

In [None]:
# Competition train/test tables
original_train_df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
original_test_df = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

# External GDP table, indexed by year
gdp_df = pd.read_csv(
    '../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv'
).set_index('year')

# Convert the string-typed date column to datetime in both frames
for df in (original_train_df, original_test_df):
    df['date'] = pd.to_datetime(df['date'])
original_train_df.head(2)

# Data Exploration

In [None]:
# Number of rows and columns in the training data
original_train_df.shape

In [None]:
# Inspect the date column of the training data
original_train_df['date']

In [None]:
# First rows of the training data
original_train_df.head()

In [None]:
# Number of rows and columns in the test data
original_test_df.shape

In [None]:
# First rows of the test data
original_test_df.head()

In [None]:
# Display the GDP table
gdp_df

In [None]:
def smape_loss(y_true, y_pred):
    """Element-wise symmetric mean absolute percentage error, in percent.

    Computes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)`` for each
    element; no averaging is done here.

    Parameters
    ----------
    y_true, y_pred : scalar, numpy.ndarray or pandas.Series
        Actual and predicted values; they must be broadcastable against
        each other.

    Returns
    -------
    Same kind of object as the inputs produce under arithmetic
        Per-element loss in the range [0, 200]. An element where both
        ``y_true`` and ``y_pred`` are zero scores 0 instead of NaN.
    """
    abs_error = np.abs(y_true - y_pred)
    # Take the magnitude of both terms so negative actuals cannot shrink or
    # flip the sign of the denominator.
    scale = np.abs(y_true) + np.abs(y_pred)
    # scale == 0 only when both values are 0, in which case abs_error is 0 too;
    # dividing by 1 there yields a loss of 0 rather than 0/0 = NaN.
    safe_scale = np.where(scale == 0, 1, scale)
    return abs_error / safe_scale * 200

# Simple feature engineering (without holidays)
In this simple model, we consider the following:

* country, store, product
* weekdays
* seasonal variations per product as a Fourier series (the code below uses the first two harmonics, i.e. wavelengths of one year and half a year)
* country's GDP

The residuals of this simple model will permit us to understand the effect of holidays.

In [None]:
# Feature engineering
# Seasonality is modelled below with first- and second-order yearly Fourier terms.
def engineer(df):
    """Return a new dataframe with the engineered features.

    Reads the module-level ``gdp_df``, looked up as
    ``gdp_df.loc[year, 'GDP_' + country]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a datetime ``date`` column and ``country``, ``store``
        and ``product`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with, in order: ``gdp`` (natural log of the GDP
        value), ``wd4``, ``wd56``, the indicator columns ``Finland``,
        ``Norway``, ``KaggleRama``, ``Kaggle Mug``, ``Kaggle Hat``, and for
        k = 1, 2 the columns ``sin{k}``, ``cos{k}``, ``mug_sin{k}``,
        ``mug_cos{k}``, ``hat_sin{k}``, ``hat_cos{k}``. An empty ``df``
        yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If a (year, ``GDP_<country>``) pair is missing from ``gdp_df``.
    """
    # Resolve every distinct (year, country) pair against gdp_df once and
    # reuse the value, instead of running a Python function per row through
    # DataFrame.apply(axis=1). Building the Series with an explicit float
    # dtype also keeps 'gdp' well-defined when df has no rows.
    years = df['date'].dt.year.tolist()
    countries = df['country'].tolist()
    gdp_lookup = {
        (year, country): gdp_df.loc[year, 'GDP_' + country]
        for year, country in set(zip(years, countries))
    }
    gdp = pd.Series(
        [gdp_lookup[key] for key in zip(years, countries)],
        index=df.index,
        dtype=float,
    )

    weekday = df['date'].dt.weekday
    new_df = pd.DataFrame({'gdp': np.log(gdp),
                           'wd4': weekday == 4,   # Friday
                           'wd56': weekday >= 5,  # Saturday, Sunday
                           })

    # One-hot encode the original categorical columns. Only the listed
    # values get an indicator column; any other value is encoded as all False.
    for country in ['Finland', 'Norway']:
        new_df[country] = df['country'] == country
    new_df['KaggleRama'] = df['store'] == 'KaggleRama'
    for product in ['Kaggle Mug', 'Kaggle Hat']:
        new_df[product] = df['product'] == product

    # Seasonal periodicity: the product indicators multiply the Fourier terms
    # so that mugs and hats each get their own seasonal coefficients.
    # dayofyear is 1-based and is divided by 365 in every year, so day 366 of
    # a leap year lands slightly past one full period.
    dayofyear = df['date'].dt.dayofyear
    for k in range(1, 3):
        angle = dayofyear / 365 * 2 * math.pi * k
        new_df[f'sin{k}'] = np.sin(angle)
        new_df[f'cos{k}'] = np.cos(angle)
        new_df[f'mug_sin{k}'] = new_df[f'sin{k}'] * new_df['Kaggle Mug']
        new_df[f'mug_cos{k}'] = new_df[f'cos{k}'] * new_df['Kaggle Mug']
        new_df[f'hat_sin{k}'] = new_df[f'sin{k}'] * new_df['Kaggle Hat']
        new_df[f'hat_cos{k}'] = new_df[f'cos{k}'] * new_df['Kaggle Hat']

    return new_df

# Engineered training frame, extended with the date and the float32 target
train_df = engineer(original_train_df).assign(
    date=original_train_df['date'],
    num_sold=original_train_df['num_sold'].astype(np.float32),
)
test_df = engineer(original_test_df)

# The model inputs are exactly the columns produced for the test frame
features = test_df.columns

# Store every feature column as float32 in both frames
for df in (train_df, test_df):
    df[features] = df[features].astype(np.float32)

print(list(features))

# LabelEncoding

In [None]:
def LabelEncoding(df):
    """Return GDP and weekday features plus label-encoded categoricals.

    Reads the module-level ``gdp_df``. The result has the columns ``gdp``
    (natural log of ``gdp_df.loc[year, 'GDP_' + country]``), ``wd4``,
    ``wd56``, ``country``, ``store`` and ``product``.

    The encoder is fitted on the values present in ``df`` on every call, so
    the integer codes depend only on the frame that is passed in.
    """
    def lookup_gdp(record):
        # One GDP value per row: row label is the year, column is GDP_<country>
        return gdp_df.loc[record['date'].year, 'GDP_' + record['country']]

    weekday = df['date'].dt.weekday
    encoded = pd.DataFrame({
        'gdp': np.log(df.apply(lookup_gdp, axis=1)),
        'wd4': weekday == 4,   # Friday
        'wd56': weekday >= 5,  # Saturday, Sunday
    })

    # Label-encode the original categorical columns, refitting per column
    encoder = preprocessing.LabelEncoder()
    for column in ('country', 'store', 'product'):
        encoder.fit(df[column])
        encoded[column] = encoder.transform(df[column])
    return encoded

# Label-encoded training frame, extended with the date and the float32 target
train_df_label = LabelEncoding(original_train_df).assign(
    date=original_train_df['date'],
    num_sold=original_train_df['num_sold'].astype(np.float32),
)
test_df_label = LabelEncoding(original_test_df)

In [None]:
# Turn the boolean weekday flags into integers in both label-encoded frames
train_df_label = train_df_label.astype({'wd4': int, 'wd56': int})
test_df_label = test_df_label.astype({'wd4': int, 'wd56': int})

In [None]:
# Preview the label-encoded training frame
train_df_label.head()

In [None]:
# Preview the label-encoded test frame
test_df_label.head()

In [None]:
# Preview the engineered training frame
train_df.head()

In [None]:
# Remove the date column from the training frame.
# Rebinding the name (instead of an in-place drop) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed column.
train_df = train_df.drop(columns='date', errors='ignore')

In [None]:
# Preview the engineered test frame
test_df.head()

# Holidays 추가

In [None]:
# Separate the target column from the predictor columns
y = train_df['num_sold']
X = train_df.drop(columns='num_sold')

In [None]:
# Fit an XGBoost regressor with default hyper-parameters, then predict the test rows
model = xgboost.XGBRegressor().fit(X, y)
pred = model.predict(test_df)

In [None]:
# Pair each test row_id with its prediction and write the submission file
sub = original_test_df[['row_id']].assign(num_sold=pred)
sub.to_csv('submission_XGboost_model.csv', index=False)

In [None]:
# Second submission: same rows with predictions rounded to whole numbers
sub_rounded = sub.assign(num_sold=sub['num_sold'].round())
sub_rounded.to_csv('submission_XGboost_model_rounded.csv', index=False)
sub_rounded