# Libraries

In [None]:
#pip install scikit-learn  -U

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

#import lightgbm as lgb 
#import xgboost as xgb
#from catboost import CatBoostRegressor

from sklearn.linear_model import LinearRegression,HuberRegressor,Ridge,TweedieRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

import dateutil.easter as easter

import optuna
import math

In [None]:
#Holidays
# When True, the public-holiday CSV is loaded and merged into train/test as a `holiday` column.
HOLIDAYS = False     
# When True, a `to_holiday` column (days until the next holiday) is added to train/test.
NEXT_HOLIDAY = False  

# Only echoed in the run summary; no post-processing step is implemented in the visible code.
POST_PROCESSING = False
# Human-readable model label (referenced only by the disabled plotting cell).
MODEL_TYPE = "Ridge Regression"

# Cut-off date of the time-based validation split: rows with date <= VAL_SPLIT are
# used for fitting, later rows for validation.
VAL_SPLIT = "2017-12-31" #"2018-05-31"

In [None]:
# Passed to Ridge(max_iter=...) in fit_model and echoed in the run summary.
EPOCHS = 10000    
# NOTE(review): EARLY_STOPPING and DEVICE are not referenced by any code visible in this notebook.
EARLY_STOPPING = 30
DEVICE = "cpu"

# SCALER is the object scale_data() fits; SCALER_NAME is only a label for the printed
# summary, so the two must be kept in sync by hand when switching scalers.
SCALER_NAME = "MinMax"  #None MinMax Standard
SCALER = MinMaxScaler()  #MinMaxScaler StandardScaler

`obj` is the objective function of the algorithm, i.e. what it is trying to maximize or minimize; e.g. `"regression"` means it minimizes the squared residuals.

`metric` and `eval` are essentially the same. They are used for early stopping.

# Load Data

In [None]:
# Competition data; the first CSV column (row id) becomes the index.
train_df = pd.read_csv(
    "../input/tabular-playground-series-jan-2022/train.csv", index_col=0
)
test_df = pd.read_csv(
    "../input/tabular-playground-series-jan-2022/test.csv", index_col=0
)

# GDP table indexed by year, one column per country.
gdp_df = pd.read_csv(
    '../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv'
).set_index('year')

if HOLIDAYS:
    # Public holidays reduced to (date, country, holiday) where `holiday` is the
    # constant flag 1: the holiday name itself is discarded.
    holidays = (
        pd.read_csv(
            "../input/holidays-finland-norway-sweden-20152019/Holidays_Finland_Norway_Sweden_2015-2019.csv",
            usecols=["Date", "Country", "Name"],
        )
        .rename(columns={"Date": "date", "Country": "country", "Name": "holiday"})
        .assign(holiday=1)
        .astype({"holiday": "int32"})
    )
    holidays["date"] = pd.to_datetime(holidays["date"])

In [None]:
# Parse the raw date strings into datetime64 so that the .dt accessor and
# date comparisons work further down.
train_df = train_df.assign(date=pd.to_datetime(train_df["date"]))
test_df = test_df.assign(date=pd.to_datetime(test_df["date"]))

In [None]:
# Quick look at the first rows of the raw training data.
train_df.head()

# Functions 

Thanks to [ambrosm](https://www.kaggle.com/ambrosm)

https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model/notebook#More-feature-engineering-(advanced-model)

In [None]:
# Feature engineering
def engineer(df):
    """Build the base feature frame for a sales dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a datetime ``date`` column plus ``country``, ``store``
        and ``product`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with: the log of the GDP looked up in the notebook-level
        ``gdp_df`` (row = year, column = ``'GDP_' + country``), Friday and
        weekend flags, indicator columns for two countries, one store and two
        products (the remaining category of each is the implicit baseline),
        and first/second-order yearly Fourier terms, both plain and
        multiplied by the mug / hat indicators.
    """
    # Row-wise lookup of the GDP of the row's country in the row's year.
    log_gdp = np.log(
        df.apply(lambda row: gdp_df.loc[row.date.year, 'GDP_' + row.country], axis=1)
    )

    weekday = df.date.dt.weekday
    is_mug = df['product'] == 'Kaggle Mug'
    is_hat = df['product'] == 'Kaggle Hat'

    # Insertion order of this dict is the column order of the result.
    columns = {
        'gdp': log_gdp,
        'wd4': weekday == 4,    # Friday
        'wd56': weekday >= 5,   # Saturday and Sunday
        'Finland': df.country == 'Finland',
        'Norway': df.country == 'Norway',
        'KaggleRama': df.store == 'KaggleRama',
        'Kaggle Mug': is_mug,
        'Kaggle Hat': is_hat,
    }

    # Seasonal variations (Fourier series); the product-specific copies let
    # each product have its own seasonal pattern.
    dayofyear = df.date.dt.dayofyear
    for k in (1, 2):
        angle = dayofyear / 365 * 2 * math.pi * k
        sin_k = np.sin(angle)
        cos_k = np.cos(angle)
        columns[f'sin{k}'] = sin_k
        columns[f'cos{k}'] = cos_k
        columns[f'mug_sin{k}'] = sin_k * is_mug
        columns[f'mug_cos{k}'] = cos_k * is_mug
        columns[f'hat_sin{k}'] = sin_k * is_hat
        columns[f'hat_cos{k}'] = cos_k * is_hat

    return pd.DataFrame(columns)

In [None]:
# Feature engineering for holidays
def engineer_more(df):
    """Return the base features plus per-day holiday indicator columns.

    Starts from ``engineer(df)`` and appends, group by group, one boolean
    column per calendar day (or per day offset from a moving date such as
    Easter). Several groups are restricted to a single country. The whole
    frame is cast to ``float32`` on return, so the indicators end up as
    0.0 / 1.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``date`` column and a ``country`` column, in
        addition to the columns ``engineer`` reads.

    Returns
    -------
    pandas.DataFrame
        The combined feature frame, dtype ``float32``.
    """
    new_df = engineer(df)

    # End of year
    # dec24..dec31 for all countries, an extra Norway-only copy of the same days,
    # and country-specific January windows of different lengths.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d)
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"n-dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"f-jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in range(1, 14)}),
                        # NOTE: the Norway columns are named "jan{d}" (no "n-" prefix).
                        pd.DataFrame({f"jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in range(1, 10)}),
                        pd.DataFrame({f"s-jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden')
                                      for d in range(1, 15)})],
                       axis=1)
    
    # May
    # may1..may9 apply to every country, may19..may25 to Norway only; the two day
    # ranges do not overlap, so the shared "may{d}" prefix yields unique names.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"may{d}":
                                      (df.date.dt.month == 5) & (df.date.dt.day == d) 
                                      for d in list(range(1, 10))}), #  + list(range(17, 25))
                        pd.DataFrame({f"may{d}":
                                      (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in list(range(19, 26))})],
                       axis=1)
    
    # June and July
    # Only the Sweden window (june8..june13) is active; the Norway variants are disabled.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"june{d}":
                                      (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden')
                                      for d in list(range(8, 14))}),
                        #pd.DataFrame({f"june{d}":
                        #              (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Norway')
                        #              for d in list(range(22, 31))}),
                        #pd.DataFrame({f"july{d}":
                        #              (df.date.dt.month == 7) & (df.date.dt.day == d) & (df.country == 'Norway')
                        #              for d in list(range(1, 3))})],
                       ],
                       axis=1)
    
    # Last Wednesday of June
    # Hard-coded anchor date per year (only 2015-2019 are listed); indicators cover
    # day offsets -4..5 around it for every country except Norway.
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"wed_june{d}": 
                                      (df.date - wed_june_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                                      for d in list(range(-4, 6))})],
                       axis=1)
    
    # First Sunday of November
    # Same scheme as above: per-year anchor date, offsets 0..8, Norway excluded.
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"sun_nov{d}": 
                                      (df.date - sun_nov_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                                      for d in list(range(0, 9))})],
                       axis=1)
    
    # First half of December (Independence Day of Finland, 6th of December)
    # Finland-only dec6..dec13; does not collide with the dec24..dec31 columns above.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in list(range(6, 14))})],
                       axis=1)

    # Easter
    # The Easter date is computed per row from the row's year; indicators cover three
    # offset windows (in days) relative to it: -2..10, 40..47 and 50..58.
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"easter{d}": 
                                      (df.date - easter_date == np.timedelta64(d, "D"))
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})],
                       axis=1)
    
    # Booleans become 0.0 / 1.0; the float columns from engineer() are down-cast.
    return new_df.astype(np.float32)

# Engineered training frame; the target is attached afterwards so that it never
# becomes part of the feature list.
train = engineer_more(train_df)

train['num_sold'] = train_df.num_sold.astype(np.float32)
test = engineer_more(test_df)

# The model inputs are exactly the engineered columns. The list is frozen here,
# before date / store / product / country (and the optional holiday columns)
# are appended to the frames below.
features = list(test.columns)
print(features)
test['date'] = test_df.date
train['date'] = train_df.date

In [None]:
# Carry the raw categorical columns along so the per-split models can filter on
# them; they are not part of `features`.
train[["store","product","country"]]= train_df[["store","product","country"]]
test[["store","product","country"]]= test_df[["store","product","country"]]

In [None]:
def public_hols(df):
    """Attach the public-holiday flag to ``df``.

    Left-joins the notebook-level ``holidays`` table on ``date`` and
    ``country`` and replaces every missing value in the result with 0, so
    rows without a holiday get ``holiday == 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date`` and ``country`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly one row per row of ``df`` (in the same
        order) and the extra ``holiday`` column. The merge produces a fresh
        default integer index.
    """
    # Keep a single entry per (date, country): two holidays on the same day in
    # the same country would otherwise duplicate the matching rows of `df`.
    holiday_flags = holidays.drop_duplicates(subset=['date', 'country'])
    merged = pd.merge(df, holiday_flags, how='left', on=['date', 'country'])
    # Non-holiday rows come back from the left join as NaN.
    return merged.fillna(value=0)

In [None]:
# Optionally add the `holiday` flag to both frames.
# NOTE(review): `features` was built before this step, so the no-split model does
# not see the new column. Also, pd.merge returns a fresh default integer index, so
# afterwards test.index presumably no longer matches the index of the submission
# frame that the split functions write into. Confirm before enabling HOLIDAYS.
if HOLIDAYS:
    train = public_hols(train)
    test = public_hols(test)

In [None]:
def next_holiday(x):
    """Return the number of days from ``x`` to the next holiday.

    Looks 1 to 200 days ahead of ``x`` for a date present in the
    notebook-level ``holidays["date"]`` column. The country is not taken
    into account: a holiday in any country counts.

    Parameters
    ----------
    x : anything accepted by ``pd.Timestamp``
        The reference date.

    Returns
    -------
    int
        Offset in days (1..200) of the first holiday after ``x``, or 0 when
        there is none within 200 days.
    """
    start = pd.Timestamp(x)
    holiday_dates = holidays["date"]
    for offset in range(1, 201):
        if (holiday_dates == start + pd.DateOffset(days=offset)).any():
            return offset
    # Nothing within the look-ahead window.
    return 0

if NEXT_HOLIDAY:
    # The holiday table is only loaded when HOLIDAYS is True; fail with a clear
    # message instead of a NameError on `holidays`.
    if not HOLIDAYS:
        raise ValueError("NEXT_HOLIDAY requires HOLIDAYS = True so that the holiday table is loaded")
    holidays["date"] = pd.to_datetime(holidays["date"])
    # Days until the next holiday (0 when none within the look-ahead window).
    train["to_holiday"] = train["date"].apply(next_holiday)
    test["to_holiday"] = test["date"].apply(next_holiday)

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0..200).

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    elements where both values are zero contribute 0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of matching (or broadcastable) shape.
        They are compared positionally; a pandas index is ignored.

    Returns
    -------
    float
        Mean of the per-element errors.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Absolute values on both terms: a signed y_true could cancel against
    # |y_pred| and wrongly zero the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 from `out`, without emitting divide-by-zero warnings.
    diff = np.divide(abs_error, denominator,
                     out=np.zeros_like(denominator), where=denominator != 0)
    return np.mean(diff)

In [None]:
# Index labels of the rows on either side of the validation cut-off
# (rows dated exactly VAL_SPLIT belong to the fitting part).
prior_2017 = train.index[(train["date"] <= VAL_SPLIT).to_numpy()]
after_2017 = train.index[(train["date"] > VAL_SPLIT).to_numpy()]

# Split and Scale

In [None]:
# Full design matrix / target, used for the final fit on all training data.
X = train[features]
y = train["num_sold"]

# prior_2017 / after_2017 hold index *labels*, so select with .loc. The previous
# .iloc lookup was only correct while labels happened to equal row positions.
X_train = train.loc[prior_2017, features]
X_test = train.loc[after_2017, features]
y_train = train.loc[prior_2017, "num_sold"]
y_test = train.loc[after_2017, "num_sold"]

In [None]:
def scale_data(X_train, test, X_test= None):
    """Fit the configured scaler on ``X_train`` and apply it to the other sets.

    Uses the notebook-level ``SCALER`` object, which is re-fitted on every
    call.

    Parameters
    ----------
    X_train : array-like
        Data the scaler is fitted on (may be the full training set).
    test : array-like
        Data transformed with the fitted scaler.
    X_test : array-like, optional
        Validation data, transformed as well when given.

    Returns
    -------
    tuple
        ``(X_train_s, test_s)`` or, when ``X_test`` is given,
        ``(X_train_s, test_s, X_test_s)``.
    """
    scaler = SCALER

    # The scaler statistics come from X_train only.
    scaled = [scaler.fit_transform(X_train), scaler.transform(test)]
    if X_test is not None:
        scaled.append(scaler.transform(X_test))

    return tuple(scaled)

# Run model

In [None]:
# NOTE(review): this dict is not passed to any model in the visible code (fit_model
# builds a plain Ridge). The keys look like TweedieRegressor arguments; confirm
# whether it is a leftover from an earlier experiment.
params = {"power":1.0,
    "alpha":0.0,
    "fit_intercept":False,
    "link":'log', #‘auto’, ‘identity’, ‘log’}
    "tol":0.00000000001,
    "warm_start":False
         }

In [None]:
def fit_model(X,y,test, X_test = None,y_test= None):
    """Fit a Ridge regression on ``log1p(y)`` and predict on the original scale.

    Features are scaled with ``scale_data`` (fitted on ``X``) and the target
    is modelled as ``log1p(y)``; predictions are mapped back with ``expm1``.

    Parameters
    ----------
    X, y : array-like
        Features and target to fit on.
    test : array-like
        Test features. They are predicted only when no validation set is
        given; otherwise they are merely scaled alongside.
    X_test, y_test : array-like, optional
        Validation features and target. When ``X_test`` is given the function
        runs in validation mode.

    Returns
    -------
    tuple
        Validation mode: ``(val_predictions, model, smape)``.
        Otherwise: ``(test_predictions, model)``.
    """
    model = Ridge(max_iter=EPOCHS)

    if X_test is not None:
        # Validation mode: score the hold-out set, the scaled test set is unused.
        X_train_s, _, X_test_s = scale_data(X, test, X_test)
        model.fit(X_train_s, np.log1p(y))
        preds = np.expm1(model.predict(X_test_s))

        smape = SMAPE(y_test, preds)
        print("SMAPE:", smape)

        return preds, model, smape

    X_s, test_s = scale_data(X, test)
    model.fit(X_s, np.log1p(y))
    # expm1 is the inverse of the log1p target transform; plain exp would shift
    # every prediction up by one.
    preds = np.expm1(model.predict(test_s))

    return preds, model

In [None]:
# Hold-out validation: fit on rows up to VAL_SPLIT and score on the later rows.
val_predictions , model ,smape = fit_model(X_train,y_train,test[features] , X_test,y_test)

In [None]:
# Run summary: validation score together with the configuration that produced it.
print("SMAPE :",smape )
print(f"\n EPOCHS: {EPOCHS}")
print(f"\n SCALER: {SCALER_NAME}")
print(f"\n POST_PROCESSING: {POST_PROCESSING}")

# No-Split Train on full Dataset

In [None]:
# fit on full dataset
# No validation set is passed, so fit_model returns (test predictions, model).
onesplit_preds , model = fit_model(X,y,test[features])

In [None]:
# Display the test predictions of the single model trained on all data.
onesplit_preds

#  Multi Split

In [None]:
# pandas is already imported at the top of the notebook; this repeats the import.
import pandas as pd
# Silences SettingWithCopyWarning globally for the rest of the notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Submission template (first CSV column as index); each split experiment fills a copy of it.
sub = pd.read_csv("../input/tabular-playground-series-jan-2022/sample_submission.csv",index_col = 0)

In [None]:
def split_models(split_on, sub_df):
    """Train one model per distinct value of ``split_on`` and fill ``sub_df``.

    For every value of ``train[split_on]`` the matching rows of the
    notebook-level ``train`` / ``test`` frames are selected, a model is
    validated on the rows after ``VAL_SPLIT``, then refitted on all rows of
    the split and used to predict the matching test rows.

    Parameters
    ----------
    split_on : str
        Column to split on ("store", "product" or "country").
    sub_df : pandas.DataFrame
        Submission frame; ``num_sold`` is overwritten in place for the test
        rows of each split (addressed by the index labels of ``test``).

    Returns
    -------
    tuple
        ``(split_smape, sub_df, model)``: the mean validation SMAPE over the
        splits, the filled submission frame and the model of the last split
        (``None`` if there was no split).
    """
    id_columns = ["store", "product", "country"]
    n_splits = train[split_on].nunique()
    split_smape = 0
    model = None

    # split training on product/ store/ country
    for split in train[split_on].unique():
        print(f"\nPredicting for {split_on} {split}")

        # .drop() returns new frames: the filtered selections are not modified in
        # place, so this does not rely on SettingWithCopyWarning being silenced.
        train_split = train[train[split_on] == split].drop(id_columns, axis=1)
        test_split = test[test[split_on] == split].drop(id_columns + ["date"], axis=1)

        X_full = train_split.drop(["num_sold", "date"], axis=1)
        y_full = train_split["num_sold"]
        fit_rows = train_split["date"] <= VAL_SPLIT
        val_rows = train_split["date"] > VAL_SPLIT

        # run model for each split type
        val_predictions, model, smape = fit_model(
            X_full[fit_rows], y_full[fit_rows], test_split, X_full[val_rows], y_full[val_rows]
        )
        print(f"\n{split_on} smape:", smape)
        split_smape += smape / n_splits

        # train on Full dataset
        final_predictions, model = fit_model(X_full, y_full, test_split)
        sub_df.loc[test_split.index, "num_sold"] = final_predictions.round()

    print(f"\n Final smape:", split_smape)

    return split_smape, sub_df, model

In [None]:
# One model per store; predictions are written into a fresh copy of the template.
store_smape, sub_store, model = split_models("store", sub.copy(deep=True))

In [None]:
# Display the store-split submission frame.
sub_store

In [None]:
# One model per product. The score gets its own name so that it no longer
# overwrites the store-split score in `store_smape`.
product_smape, sub_product, model = split_models("product", sub.copy(deep=True))

In [None]:
# Display the product-split submission frame.
sub_product

In [None]:
# One model per country. The score gets its own name so that it no longer
# overwrites the store-split score in `store_smape`.
country_smape, sub_country, model = split_models("country", sub.copy(deep=True))

# Each prediction added to next model 

In [None]:
def split_models_recursive(split_on, sub_df):
    """Like a per-split model, but feed each split's output to the next ones.

    Splits are processed in order of appearance in ``train[split_on]``. After
    a split has been fitted, a new feature column named ``f'{idx}_{split}'``
    is added for the following splits: in the training data it holds the
    actual ``num_sold`` of that split's rows, in the test data the rounded
    predictions for that split's rows, and 0 everywhere else.

    The function works on copies of the notebook-level ``train`` / ``test``
    frames, so calling it does not add columns to them and repeated runs give
    the same result.

    Parameters
    ----------
    split_on : str
        Column to split on ("store", "product" or "country").
    sub_df : pandas.DataFrame
        Submission frame; ``num_sold`` is overwritten in place for the test
        rows of each split (addressed by the index labels of ``test``).

    Returns
    -------
    tuple
        ``(split_smape, sub_df, model)``: the mean validation SMAPE over the
        splits, the filled submission frame and the model of the last split
        (``None`` if there was no split).
    """
    id_columns = ["store", "product", "country"]

    # Local working copies that accumulate the chained feature columns.
    train_aug = train.copy()
    test_aug = test.copy()

    n_splits = train_aug[split_on].nunique()
    split_smape = 0
    model = None

    # split training on product/ store/ country
    for idx, split in enumerate(train_aug[split_on].unique()):
        print(f"\nPredicting for {split_on} {split}")

        train_rows = train_aug[train_aug[split_on] == split]
        test_rows = test_aug[test_aug[split_on] == split]

        print(train_rows.shape)
        print(test_rows.shape)

        train_split = train_rows.drop(id_columns, axis=1)
        test_split = test_rows.drop(id_columns + ["date"], axis=1)

        X_full = train_split.drop(["num_sold", "date"], axis=1)
        y_full = train_split["num_sold"]
        fit_rows = train_split["date"] <= VAL_SPLIT
        val_rows = train_split["date"] > VAL_SPLIT

        # run model for each split type
        val_predictions, model, smape = fit_model(
            X_full[fit_rows], y_full[fit_rows], test_split, X_full[val_rows], y_full[val_rows]
        )
        print(f"\n{split_on} smape:", smape)
        split_smape += smape / n_splits

        # train on Full dataset
        final_predictions, model = fit_model(X_full, y_full, test_split)
        sub_df.loc[test_split.index, "num_sold"] = final_predictions.round()

        # Expose this split's sales to the following splits. The same column name
        # is used in train and test, and the values are written on the rows of
        # the split that produced them (selected via `split_on`).
        chained_col = f'{idx}_{split}'
        train_aug[chained_col] = 0.0
        train_aug.loc[train_split.index, chained_col] = y_full
        test_aug[chained_col] = 0.0
        test_aug.loc[test_split.index, chained_col] = final_predictions.round()

    print(f"\n Final smape:", split_smape)

    return split_smape, sub_df, model

#store_smape, sub_country,model = split_models_recursive("store", sub.copy(deep=True))

# All Split 

In [None]:
import itertools

# Every (store, product, country) combination, store varying slowest and
# country fastest.
all_splits = [
    (store, product, country)
    for store in ('KaggleMart', 'KaggleRama')
    for product in ('Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker')
    for country in ('Finland', 'Norway', 'Sweden')
]

In [None]:
def split_models_ALL(split_on, sub_df):
    """Train one model per (store, product, country) combination.

    Parameters
    ----------
    split_on : iterable of (store, product, country) tuples
        The combinations to model.
    sub_df : pandas.DataFrame
        Submission frame; ``num_sold`` is overwritten in place for the test
        rows of each combination (addressed by the index labels of ``test``).

    Returns
    -------
    tuple
        ``(split_smape, sub_df, split_dict)``: the mean validation SMAPE over
        the given combinations, the filled submission frame and a dict
        mapping each combination to its validation SMAPE.
    """
    id_columns = ["store", "product", "country"]
    # Materialise once so the average is taken over the combinations actually
    # passed in, not over the notebook-level `all_splits`.
    splits = list(split_on)

    split_smape = 0
    split_dict = {}

    # split training on product/ store/ country
    for split in splits:
        store, product, country = split[0], split[1], split[2]
        print(f"\nPredicting for store: {store}, product: {product}, country: {country} ")

        # .drop() returns new frames, so the filtered selections are not modified in place.
        train_split = train[
            (train["store"] == store) & (train["product"] == product) & (train["country"] == country)
        ].drop(id_columns, axis=1)
        test_split = test[
            (test["store"] == store) & (test["product"] == product) & (test["country"] == country)
        ].drop(id_columns + ["date"], axis=1)

        X_full = train_split.drop(["num_sold", "date"], axis=1)
        y_full = train_split["num_sold"]
        fit_rows = train_split["date"] <= VAL_SPLIT
        val_rows = train_split["date"] > VAL_SPLIT

        # run model for each split type
        val_predictions, model, smape = fit_model(
            X_full[fit_rows], y_full[fit_rows], test_split, X_full[val_rows], y_full[val_rows]
        )

        split_smape += smape / len(splits)
        split_dict[split] = smape

        # train on Full dataset
        final_predictions, model = fit_model(X_full, y_full, test_split)
        sub_df.loc[test_split.index, "num_sold"] = final_predictions.round()

    print(f"\n final all_split smape:", split_smape)

    return split_smape, sub_df, split_dict

In [None]:
# One model per store/product/country combination, written into a fresh copy of the template.
smape_all, sub_all, split_dict = split_models_ALL(all_splits, sub.copy(deep=True))

In [None]:
# Validation SMAPE per (store, product, country) combination.
split_dict

In [None]:
# Display the all-split submission frame.
sub_all

# Post Processing & Submission 

In [None]:
# Re-read the submission template for the no-split predictions.
sub = pd.read_csv("../input/tabular-playground-series-jan-2022/sample_submission.csv",index_col = 0)

In [None]:
# Positional assignment: assumes onesplit_preds is in the same row order as the
# sample submission. TODO confirm.
sub["num_sold"] = onesplit_preds.round()

In [None]:
# One submission file per experiment. The no-split file now carries the same
# .csv extension as the others.
sub.to_csv("submission_noSplit.csv")
sub_store.to_csv("submission_store.csv")
sub_product.to_csv("submission_product.csv")
sub_country.to_csv("submission_country.csv")
sub_all.to_csv("submission_all.csv")

# Training Visualization

In [None]:
# Country-split predictions over the test dates (several rows share each date).
# NOTE(review): newer seaborn releases deprecate `ci=` in favour of `errorbar=`;
# confirm against the installed version.
sns.lineplot(data = sub_country, x= test["date"] , y = "num_sold", label ="Country split prediction" ,ci=None)

In [None]:
# Training actuals followed by the predictions of every split strategy, drawn on one axis.
plt.figure(figsize=(20,10))

sns.lineplot(data= train, x= "date", y= "num_sold" ,label="Train Actual",ci=None)
sns.lineplot(data = sub_store,x = test["date"] , y = "num_sold", label ="Store split prediction" ,ci=None)
sns.lineplot(data = sub_product, x= test["date"] , y = "num_sold", label ="Product split prediction" ,ci=None)
sns.lineplot(data = sub_country, x= test["date"] , y = "num_sold", label ="Country split prediction" ,ci=None)
sns.lineplot(data = sub_all, x= test["date"] , y = "num_sold", label ="All  split prediction" ,ci=None)

plt.show()

In [None]:
# Disabled cell: the plotting code below is wrapped in a string literal, so it is not executed.
# NOTE(review): it references `y_test_index`, which is not defined anywhere in the
# visible notebook, and it would overwrite `test`. Check before re-enabling.
'''#for visual only
test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv",index_col = 0)
test["date"] = pd.to_datetime(test["date"])

fig,ax = plt.subplots(2,1, figsize=(25,20),sharey= True)

diff = y_test - val_predictions
sns.lineplot(ax=ax[0], y= y_test, x= y_test_index, label="Train Actual",ci=None)
sns.lineplot(ax=ax[0], x = y_test_index , y = val_predictions, label ="Validation Prediction" ,ci=None)
sns.lineplot(ax=ax[0],data =sub,x = test_df["date"], y = "num_sold",label="Final Prediction" ,ci=None) 

ax[0].set_title(f"Actual and Predicted Sales for {MODEL_TYPE}")

sns.lineplot(ax=ax[1], data = diff, label ="Residuals" )
ax[1].set_title(f"Residuals for {MODEL_TYPE} for 2018")

plt.show()'''

In [None]:
# Training actuals followed by the no-split predictions held in `sub`.
plt.figure(figsize=(25,10))

sns.lineplot(x =train_df["date"], y= train_df["num_sold"] ,label="Train Actual",ci=None)
sns.lineplot(data =sub,x = test_df["date"], y = "num_sold",label="Final Prediction" ,ci=None) 
plt.title("Actual and Predicted Sales")

plt.show()