In [None]:
!pip install scikit-learn
!pip install flaml
!pip install fast-ml
!pip install ray==1.1

In [None]:
## Relevant package imports
import pandas as pd
import numpy as np
import ray
import re
from flaml import AutoML
from fast_ml.feature_engineering import FeatureEngineering_DateTime
from sklearn.model_selection import train_test_split
from flaml.ml import sklearn_metric_loss_score
import pandas as pd
import numpy as np
import dateutil.easter as easter
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor

In [None]:
# Load the competition data and the external GDP table into pandas DataFrames.

TEST = False
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv', index_col='row_id')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv', index_col='row_id')
gdp = pd.read_csv(
    '../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv',
    index_col=0,
)

# Parse the date column and derive the calendar year, train first and then test.
train['date'] = pd.to_datetime(train['date'])
train['year'] = train['date'].dt.year
test['date'] = pd.to_datetime(test['date'])
test['year'] = test['date'].dt.year

# Average sales per year.
print(train.groupby('year')[['num_sold']].mean())

# In TEST mode the 2018 rows are held out of `train` and used as `test`.
# `kgroups` is used further down to pick the reference row of the GDP table.
kgroups = 2 if TEST else 3
if TEST:
    test = train[train.year == 2018].copy()
    train.drop(test.index, inplace=True)

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

In [None]:
# Preview the first five rows of the test frame.
test.head(5)

In [None]:
# Preview the first five rows of the GDP table.
gdp.head(5)

**Feature Engineering: GDP & Holidays**

In [None]:
# Drop the first four characters of every GDP column label.
gdp = gdp.rename(columns=lambda label: label[4:])
# Raise GDP to the power 1.21;
# see explanation in https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model/notebook
gdp = gdp ** 1.21
# Ratio of the reference row (position kgroups+1) to every row of the table.
reference_row = gdp.iloc[kgroups + 1]
scaler = reference_row / gdp
# Flatten into a {(row label, column label): factor} lookup.
gdp_map = scaler.stack().to_dict()

In [None]:
# Rescale sales by the GDP factor of each row's (year, country), then model log(sales).
# The lookup keys must carry train's own index: a default RangeIndex only lines up with
# `row_id` while no rows have been dropped, so it misaligns (wrong factors / NaN) once
# TEST mode has removed the hold-out year from `train`.
gdp_keys = pd.Series(list(zip(train.date.dt.year, train.country)), index=train.index)
train['num_sold'] = gdp_keys.map(gdp_map) * train.num_sold
train['num_sold'] = np.log(train.num_sold)

In [None]:
# Inspect the training frame after the target transformation.
train.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Per-column dtypes of the training frame.
train.dtypes

In [None]:
# Column dtypes, non-null counts and memory usage of the test frame.
test.info()

In [None]:
# Replace the row_id index with a fresh 0..n-1 index (the old index is discarded).
train = train.reset_index(drop=True)
train

In [None]:
def _last_wednesday_of_june(year):
    """Return the last Wednesday of June of ``year`` as a ``pd.Timestamp``."""
    june_30 = pd.Timestamp(year=year, month=6, day=30)
    # weekday(): Monday=0 ... Wednesday=2; step back to the closest Wednesday.
    return june_30 - pd.Timedelta(days=(june_30.weekday() - 2) % 7)


def _first_sunday_of_november(year):
    """Return the first Sunday of November of ``year`` as a ``pd.Timestamp``."""
    nov_1 = pd.Timestamp(year=year, month=11, day=1)
    # weekday(): Sunday=6; step forward to the closest Sunday.
    return nov_1 + pd.Timedelta(days=(6 - nov_1.weekday()) % 7)


def date_process(df):
    """Add calendar and holiday features to ``df`` in place and drop ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column (anything ``pd.to_datetime`` accepts) and a
        ``country`` column of strings. The frame is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Columns added, in this order: ``year``, ``wd56``, ``wd4``, ``dayofyear``,
    ``xmas_adjust1``, ``xmas_adjust2``, ``newyear_adjust1``, ``newyear_adjust2``,
    ``easter_adj``, ``days_from_wed_jun``, ``days_from_sun_nov``.

    The June/November anchor dates are computed from the calendar rather than
    looked up in a 2015-2019 table, so any year yields valid (non-NaN) features.
    For 2015-2019 the computed anchors equal the previously hard-coded dates.

    Because ``date`` is dropped at the end, calling the function a second time on
    the same frame raises ``KeyError``.
    """
    # Local import keeps the function usable on its own, independent of the
    # notebook-level alias for dateutil's easter module.
    from dateutil.easter import easter as easter_sunday

    df['date'] = pd.to_datetime(df['date'])
    dates = df['date']
    years = dates.dt.year
    weekday = dates.dt.weekday

    def anchor_dates(anchor_for_year):
        # Evaluate the anchor once per distinct year, then broadcast it to the rows.
        per_year = {int(y): pd.Timestamp(anchor_for_year(int(y))) for y in years.unique()}
        return years.map(per_year)

    df['year'] = years
    # Weekend / Friday flags are concatenated with the country name.
    df['wd56'] = (weekday >= 5).astype(str) + df.country
    df['wd4'] = (weekday == 4).astype(str) + df.country
    df['dayofyear'] = dates.dt.dayofyear
    # In leap years shift every day from Feb 29 onwards back by one so that the
    # same calendar date maps to the same day number in every year.
    df.loc[dates.dt.is_leap_year & (df.dayofyear >= 60), 'dayofyear'] -= 1

    # Christmas
    days_after_xmas = (dates - anchor_dates(lambda y: pd.Timestamp(year=y, month=12, day=25))).dt.days
    df['xmas_adjust1'] = days_after_xmas.clip(lower=1, upper=6)
    df['xmas_adjust2'] = days_after_xmas.clip(lower=-2, upper=20) * 1.0

    # New Year
    df['newyear_adjust1'] = df.dayofyear.clip(lower=0, upper=15)
    df['newyear_adjust2'] = df.dayofyear.clip(lower=0, upper=2)

    # Easter
    easter_adj = (dates - anchor_dates(easter_sunday)).dt.days.clip(lower=-3, upper=60).astype(float)
    # Offsets 12..38 are collapsed into the single value 12.
    df['easter_adj'] = easter_adj.mask(easter_adj.between(12, 38), 12.0)

    # Last Wednesday of June
    df['days_from_wed_jun'] = (dates - anchor_dates(_last_wednesday_of_june)).dt.days.clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    df['days_from_sun_nov'] = (dates - anchor_dates(_first_sunday_of_november)).dt.days.clip(-1, 9)

    df.drop(columns=['date'], inplace=True)

In [None]:
# Build the calendar/holiday features on both frames (each is modified in place).
for frame in (train, test):
    date_process(frame)

In [None]:
# Inspect the engineered features on the training frame.
train.head()

In [None]:
# Inspect the engineered features on the test frame.
test.head()

In [None]:
# Dtypes and non-null counts of the training frame after feature engineering.
train.info()

In [None]:
# Dtypes and non-null counts of the test frame after feature engineering.
test.info()

In [None]:
# Separate the target from the features; `X` is the same object as `train`,
# which loses its 'num_sold' column here.
X = train
y = X.pop('num_sold')

In [None]:
# Hold out 20% of the rows (random split, fixed seed) for a local validation check.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create the FLAML AutoML searcher; it is configured in the fit() call.
automl = AutoML()

In [None]:
# Search settings: regression task, MAPE as the optimisation metric,
# and a time budget of 680 (seconds) for the whole search.
automl_settings = {
    "task": "regression",
    "metric": 'mape',
    "time_budget": 680,
}
automl.fit(X_train, y_train, **automl_settings)

In [None]:
# Summarise the winning learner, its configuration, its loss and its training time.
best_loss_text = f'Best MAPE on validation data: {automl.best_loss:.4g}'
train_time_text = f'Training duration of best run: {automl.best_config_train_time:.4g} s'
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print(best_loss_text)
print(train_time_text)

In [None]:
# Search diagnostics, one per line: best iteration, best loss,
# time needed to find the best model, and the configuration history.
search_diagnostics = (
    automl.best_iteration,
    automl.best_loss,
    automl.time_to_find_best_model,
    automl.config_history,
)
for diagnostic in search_diagnostics:
    print(diagnostic)

In [None]:
# NOTE: the metric requested here is 'mape' and both predictions and targets are on
# the log scale (the target was log-transformed before training), so these numbers
# are not the competition SMAPE; the labels say what is actually computed.
print('train MAPE (log target)', '=', 100*sklearn_metric_loss_score('mape', automl.predict(X_train), y_train))
print(' test MAPE (log target)', '=', 100*sklearn_metric_loss_score('mape', automl.predict(X_test), y_test))

In [None]:
# The model predicts the log of the target, so exponentiate to undo the log.
log_predictions = automl.predict(test)
y_pred = np.exp(log_predictions)
y_pred

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the fitted estimator's feature importances.
# NOTE(review): `feature_name_` is not available on every estimator type - confirm
# against the learner FLAML selected.
fitted_estimator = automl.model.estimator
plt.barh(fitted_estimator.feature_name_, fitted_estimator.feature_importances_)

In [None]:
# Wrap the predictions in a single-column frame and check the last rows.
sub = pd.DataFrame({'num_sold': y_pred})
sub.tail()

In [None]:
# Fill the sample submission with the predictions and write the submission file.
sample = pd.read_csv("../input/tabular-playground-series-jan-2022/sample_submission.csv")
sample = sample.assign(num_sold=sub['num_sold'])
submission_columns = ['row_id', 'num_sold']
sample[submission_columns].to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as a final sanity check.
sample