# To Do
1. Add fake 2014 data with GDP
1. Run a model for each product
1. Recursive model
1. Add Further Feature engineering - ambrosm https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model/
1. SARIMAX model

* **Outliers**? - check for linear and Trees

* Catboost minmax scaling 

# Libraries

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

import lightgbm as lgb 
import xgboost as xgb
from sklearn.linear_model import LinearRegression,HuberRegressor,SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

import optuna
import math

In [None]:
# When True, engineer() adds one-hot indicator columns and Fourier terms and
# drops the raw country/store/product columns.
LINEAR_DATE_AUG = False

# When True, an Optuna search runs and its best parameters train the final model.
OPTUNA = True
NUM_TRIALS = 400  # number of Optuna trials

## Add data
# When True, rows dated before 2016 are copied, shifted back one year and
# prepended to the training data.
ADD_2014 = False 

# Holidays
HOLIDAYS = True   # merge a public-holiday flag; good for all except catboost?
NEXT_HOLIDAY = True  # add the "to_holiday" feature; good for lightgbm

# When True, the final predictions are multiplied by a fixed factor before submission.
POST_PROCESSING = False
MODEL_TYPE = "lightgbm" # lightgbm catboost; in the code shown here it is only used in plot titles

# Rows dated on or before this go to the training split, later rows to validation.
VAL_SPLIT = "2017-12-31" #"2018-05-31"

In [None]:
# Passed to lgb.train as num_boost_round.
EPOCHS = 10000     #Catboost best is 100 epochs - lightgbm is 1000
# Passed to lgb.early_stopping.
EARLY_STOPPING = 30

DEVICE = "cpu"  # not referenced by the code visible in this notebook
BOOSTING =  'gbdt'  # "goss" 'dart'  'gbdt'

SCALER_NAME = "MinMax"  #None MinMax; label only, printed in the run summary
SCALER = MinMaxScaler()  # the scaler instance scale_data() actually uses

`objective` is the objective function of the algorithm, i.e. what it is trying to maximize or minimize; e.g. "regression" means it minimizes squared residuals.

`metric` and `eval` are essentially the same. They are used for early stopping.

# Load Data

In [None]:
# Competition data; the first CSV column becomes the index.
train = pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv",index_col = 0)
test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv",index_col = 0)

# GDP table indexed by year; get_gdp() reads its "GDP_<country>" columns.
gdp_df = pd.read_csv('../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
gdp_df.set_index('year', inplace=True)

if HOLIDAYS:
    holidays = pd.read_csv("../input/holidays-finland-norway-sweden-20152019/Holidays_Finland_Norway_Sweden_2015-2019.csv",usecols = ["Date","Country","Name"]                      )
    # Use the same column names as train/test so the frames can be merged on them.
    holidays.rename(columns = {"Date":"date","Country":"country","Name":"holiday"},inplace= True)
    # The holiday name is discarded: the column becomes a constant 1 flag.
    holidays["holiday"]= 1
    holidays["holiday"]= holidays["holiday"].astype("int32")
    holidays["date"] = pd.to_datetime(holidays["date"])

In [None]:
# Parse the "date" column into datetimes; the .dt accessors and date
# comparisons used later rely on this.
train["date"] = pd.to_datetime(train["date"])
test["date"] = pd.to_datetime(test["date"])

In [None]:
if ADD_2014:
    # Build a synthetic extra year: take every row dated before 2016, move it
    # back one year and scale the sales down by 2 %.
    # .copy() makes the selection an independent frame, so the column
    # assignments below do not raise pandas' SettingWithCopyWarning.
    train_2014 = train[train["date"] < "2016-01-01"].copy()
    train_2014["date"] = train_2014["date"] - pd.DateOffset(years=1)
    train_2014["num_sold"] = train_2014["num_sold"] * 0.98
    # Synthetic rows go first; the index is rebuilt as 0..n-1.
    train = pd.concat([train_2014, train], axis=0, ignore_index=True)

In [None]:
train.head()

# Functions 

In [None]:
def public_hols(df, holidays_df=None):
    """Return a copy of ``df`` with a ``holiday`` flag column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date`` and ``country`` columns.
    holidays_df : pandas.DataFrame, optional
        Holiday table with ``date``, ``country`` and ``holiday`` columns.
        Defaults to the module-level ``holidays`` frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``df``; ``holiday`` holds the
        table's flag where (date, country) matches and 0 elsewhere.
    """
    source = holidays if holidays_df is None else holidays_df
    keys = ["date", "country"]

    # One row per (date, country): several holidays on the same day would
    # otherwise duplicate data rows in the left merge.
    flags = source[keys + ["holiday"]].drop_duplicates(subset=keys)

    # Drop a flag left over from an earlier call so that re-running does not
    # produce holiday_x / holiday_y columns.
    base = df.drop(columns="holiday", errors="ignore")

    merged = pd.merge(base, flags, how="left", on=keys)
    # Only the merged-in flag is filled; missing values in other columns are
    # left untouched.
    merged["holiday"] = merged["holiday"].fillna(0)
    return merged

# Add the public-holiday flag to both data sets (public_hols returns new frames).
if HOLIDAYS:
    train = public_hols(train)
    test = public_hols(test)

In [None]:
def get_gdp(row):
    """Look up the GDP value for one row's country and year.

    ``row`` must expose ``country`` and ``date`` attributes; the value is
    read from ``gdp_df`` at row label ``date.year`` and column
    ``"GDP_<country>"``.
    """
    gdp_column = 'GDP_' + row.country
    year = row.date.year
    return gdp_df.loc[year, gdp_column]

LightGBM doesn't like linear changes

In [None]:
def engineer(df):
    """Add GDP and calendar features to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``date`` column and a ``country`` column. With
        ``LINEAR_DATE_AUG`` enabled it also needs ``store`` and ``product``.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``gdp``, ``day``, ``dayofweek``, ``month``,
        ``year``, ``inverse_dayofyear``, ``quarter`` and ``daysinmonth``
        added. With ``LINEAR_DATE_AUG`` the country/store/product columns
        are replaced by boolean indicators and Fourier terms are added.
    """
    # GDP from file (author's note: improves Huber & Tweedie & catboost).
    df["gdp"] = df.apply(get_gdp, axis=1)

    dates = df["date"].dt
    df["day"] = dates.day
    df["dayofweek"] = dates.dayofweek
    df["month"] = dates.month
    df["year"] = dates.year

    # Author's notes on tree models: a plain 'dayofyear' feature can cause
    # noise; the inverse below is good for all except catboost.
    # Days remaining in the year. The year length comes from the date itself
    # so every leap year is handled, not only 2016.
    days_in_year = 365 + dates.is_leap_year.astype(int)
    df["inverse_dayofyear"] = days_in_year - dates.dayofyear
    # Good for lightgbm & Huber, bad for Tweedie & catboost.
    df["quarter"] = "Q" + dates.quarter.astype(str)
    # Bad for all except Lightgbm.
    df["daysinmonth"] = dates.days_in_month

    # Author's note: catboost and lightgbm don't like explicit
    # Friday / weekend indicator columns, so none are added.

    if LINEAR_DATE_AUG:
        # Manual indicator columns; the remaining category of each feature
        # is left as the implicit baseline.
        for country in ['Finland', 'Norway']:
            df[country] = df.country == country
        df['KaggleRama'] = df.store == 'KaggleRama'
        for product in ['Kaggle Mug', 'Kaggle Sticker']:
            df[product] = df['product'] == product

        df.drop(["country", "store", "product"], axis=1, inplace=True)

        # Seasonal variations (Fourier series), plus per-product interactions.
        dayofyear = df.date.dt.dayofyear
        for k in range(1, 20):
            angle = dayofyear / 365 * 2 * math.pi * k
            df[f'sin{k}'] = np.sin(angle)
            df[f'cos{k}'] = np.cos(angle)
            df[f'mug_sin{k}'] = df[f'sin{k}'] * df['Kaggle Mug']
            df[f'mug_cos{k}'] = df[f'cos{k}'] * df['Kaggle Mug']
            df[f'sticker_sin{k}'] = df[f'sin{k}'] * df['Kaggle Sticker']
            df[f'sticker_cos{k}'] = df[f'cos{k}'] * df['Kaggle Sticker']

    return df

In [None]:
# engineer() modifies the frame it is given and also returns it.
train = engineer(train)
test = engineer(test)

# Columns that are one-hot encoded with pd.get_dummies further down.
categorical_feats = [
    "country","store","product",
                     "quarter", 
                    ]

In [None]:
def next_holiday(x, holiday_dates=None, max_days=200):
    """Return the number of days from ``x`` to the next holiday.

    Parameters
    ----------
    x : date-like
        Anything ``pd.Timestamp`` accepts.
    holiday_dates : iterable of date-like, optional
        Dates to search. Defaults to ``holidays["date"]`` (all countries
        together, as before).
    max_days : int, optional
        How many days ahead to look (default 200).

    Returns
    -------
    int
        The smallest offset in ``1..max_days`` whose date is a holiday, or
        0 when there is none in that window. The day ``x`` itself is never
        counted.
    """
    dates = holidays["date"] if holiday_dates is None else holiday_dates
    # A set gives constant-time membership tests instead of scanning the
    # whole holiday column for every candidate day.
    known = set(pd.to_datetime(pd.Series(dates)))

    start = pd.Timestamp(x)
    for offset in range(1, max_days + 1):
        if start + pd.DateOffset(days=offset) in known:
            return offset
    return 0

# NOTE(review): `holidays` is only loaded when HOLIDAYS is True, so enabling
# NEXT_HOLIDAY on its own would fail here - confirm the flags are meant to be
# used together.
if NEXT_HOLIDAY:
    holidays["date"] = pd.to_datetime(holidays["date"])
    # Days until the next holiday, computed row by row for both data sets.
    for frame in (train, test):
        frame["to_holiday"] = frame["date"].apply(next_holiday)

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (lists, arrays or Series) of matching
        shape; they are compared position by position.

    Returns
    -------
    float
        Mean of ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``, where
        a point with both values equal to zero contributes 0.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    # Absolute values on both terms, so a negative actual cannot shrink or
    # cancel the denominator.
    denominator = (np.abs(actual) + np.abs(forecast)) / 200.0
    abs_error = np.abs(actual - forecast)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with (no divide-by-zero warnings).
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff)

In [None]:
def scale_data(X_train, X_test, test):
    """Scale the three feature sets with the module-level ``SCALER``.

    The scaler is fitted on ``X_train`` only; ``X_test`` and ``test`` are
    transformed with that fit. Returns the scaled versions in the same
    order as the arguments.
    """
    scaled_train = SCALER.fit_transform(X_train)
    scaled_val, scaled_test = (
        SCALER.transform(part) for part in (X_test, test)
    )
    return scaled_train, scaled_val, scaled_test

In [None]:
train.head()

### One HotEncoder

In [None]:
# One-hot encode the categorical columns, separately for train and test.
# NOTE(review): the two frames only end up with matching dummy columns if
# both contain the same category values - confirm for new data.
train = pd.get_dummies(train,columns= categorical_feats)
test = pd.get_dummies(test,columns= categorical_feats)

In [None]:
train.head()

In [None]:
# Index labels of the rows on/before and after the validation split date.
# They are later passed to .iloc (positional), which lines up only while
# train still carries a 0..n-1 index at this point - TODO confirm when
# HOLIDAYS is False and the index comes from the CSV.
prior_2017 = train[train["date"]<=VAL_SPLIT].index
after_2017 = train[train["date"]>VAL_SPLIT].index

In [None]:
# Move "date" from a column to the index so it is not used as a feature;
# the index is still available for plotting.
train.index = train["date"]
train.drop("date",axis=1,inplace=True)

test.index = test["date"]
test.drop("date",axis=1,inplace=True)

# Model Creation

# Split and Scale

In [None]:
# Full feature matrix and target; not referenced again in the code visible
# in this notebook.
X = train.drop("num_sold", axis=1)
y= train["num_sold"]

In [None]:
# Time-based split: rows up to VAL_SPLIT train the model, later rows validate it.
# prior_2017 / after_2017 are used as positions here.
X_train = train.iloc[prior_2017,:].drop("num_sold", axis=1)
X_test = train.iloc[after_2017,:].drop("num_sold", axis=1)
y_train= train.iloc[prior_2017,:]["num_sold"]
y_test= train.iloc[after_2017,:]["num_sold"]

In [None]:
X_train.head(2)

In [None]:
# Replace the three feature frames with the scaler's outputs (the scaler is
# fitted on X_train only).
X_train,X_test,test =scale_data(X_train,X_test,test)

In [None]:
X_train[0]

# Optuna

In [None]:
def objective_lgb(trial):
    """Optuna objective: train LightGBM with sampled settings, return SMAPE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the loss, the early-stopping metric and the
        tree hyper-parameters.

    Returns
    -------
    float
        SMAPE of the trained model on the validation split
        (``X_test`` / ``y_test``); the study minimizes it.
    """
    loss_choices = ['poisson', 'mape', 'rmse', "mae"]

    params = {
        # Fixed for every trial.
        "num_threads": -1,
        "verbose": -1,
        "boosting_type": BOOSTING,
        # Sampled by Optuna.
        "objective": trial.suggest_categorical("objective", loss_choices),
        # suggest_float replaces the deprecated suggest_uniform; same name,
        # bounds and uniform sampling.
        "learning_rate": trial.suggest_float('learning_rate', 0.001, 0.10),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        # usually less than 2**max_depth
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_bin': trial.suggest_int('max_bin', 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_data_in_leaf": trial.suggest_int('min_data_in_leaf', 10, 200),
    }

    # The metric reported on the validation set (and watched by early
    # stopping) is sampled independently of the training loss.
    metric_optuna = trial.suggest_categorical("metric", loss_choices)
    params["metric"] = metric_optuna

    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)

    # An optuna.integration.LightGBMPruningCallback(trial, metric_optuna)
    # could be appended to the callbacks to prune weak trials; it is not
    # enabled here.
    model = lgb.train(
        params=params,
        train_set=train_data,
        num_boost_round=EPOCHS,
        valid_sets=[test_data],
        callbacks=[lgb.early_stopping(EARLY_STOPPING)],
    )

    test_predictions = model.predict(X_test)
    smape = SMAPE(y_test, test_predictions)

    print("SMAPE:", smape)

    return smape

In [None]:
# Run the hyper-parameter search; objective_lgb returns the validation
# SMAPE, which the study minimizes.
if OPTUNA:
    print("RUNNINING OPTUNA LIGHTGBM")
    study = optuna.create_study(direction="minimize")
    study.optimize(objective_lgb, n_trials=NUM_TRIALS)
    # Best trial; read by the reporting and parameter-selection cells below.
    trial = study.best_trial

In [None]:
# Summarise the search: trial count, best trial number, its SMAPE and the
# parameters Optuna sampled for it.
if OPTUNA:
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial num :",trial.number)
    print(" SMAPE Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

# Run model

In [None]:
# Objective / metric options explored: 'poisson', 'mape', 'rmse', "tweedie"

# Settings objective_lgb() fixes for every trial. They are not part of
# trial.params, so they are added back here to train the final model with
# the same configuration that was tuned (this matters when BOOSTING is not
# LightGBM's default).
fixed_params = {
    "num_threads": -1,
    "verbose": -1,
    "boosting_type": BOOSTING,
}

if OPTUNA:
    print("Using Optuna params")
    params = {**fixed_params, **trial.params}

else:
    # OPTUNA 400 7.79556964946429
    params = {
        **fixed_params,
        "objective": "poisson",
        "learning_rate": 0.007502236406176916,
        "lambda_l1": 4.901678633449036e-08,
        "lambda_l2": 0.04542006655651054,
        "num_leaves": 176,
        "max_bin": 748,
        "max_depth": 10,
        "metric": "mape",
    }

# Earlier parameter set kept for reference (optuna score of 7.886738068386228):
#     "objective": "poisson",
#     "metric": "mape",
#     "learning_rate": 0.0856871611722979,
#     "lambda_l1": 6.006877273786205e-07,
#     "lambda_l2": 4.784086439520516e-06,
#     "num_leaves": 178,
#     "max_bin": 542,
#     "max_depth": 10

In [None]:
def fit_model(X_train,y_train,X_test,y_test):
    """Train a LightGBM model with the module-level ``params``.

    The model is fitted on (X_train, y_train) for up to ``EPOCHS`` rounds,
    with early stopping monitored on (X_test, y_test). The validation SMAPE
    is printed.

    Returns
    -------
    tuple
        ``(validation predictions, trained model)``.
    """
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_test, label=y_test)

    stopper = lgb.early_stopping(EARLY_STOPPING)
    booster = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=EPOCHS,
        valid_sets=[dvalid],
        callbacks=[stopper],
    )

    val_preds = booster.predict(X_test)
    print("SMAPE:", SMAPE(y_test, val_preds))

    return val_preds, booster

In [None]:
# Train on the pre-split rows; test_predictions are the predictions for the
# validation rows (X_test), not for the competition test set.
test_predictions, model = fit_model(X_train,y_train,X_test,y_test)

In [None]:
# Print the validation score together with the settings that produced it,
# so the output can be pasted into the "Previous Runs" notes.
print("SMAPE :",SMAPE(y_test,test_predictions) )
print(f"\n EPOCHS: {EPOCHS}")
print(f"\n SCALER: {SCALER_NAME}")
print(f"\n PARAMS: { params}")
print(f"\n Holidays : {HOLIDAYS}")
print(f"\n Next Holiday : {NEXT_HOLIDAY}")
print(f"\n Linear Date Augmentation : {LINEAR_DATE_AUG}")
print(f"\n POST_PROCESSING: {POST_PROCESSING}")

# Earlier result: 8.10587538123248 - no linear - holidays + next holiday

# Final Train

In [None]:
# Predict the competition test set with the model fitted above.
# NOTE(review): no refit on the full training data happens here, so the rows
# after VAL_SPLIT are never used for training - confirm this is intended.
final_predictions = model.predict(test)

In [None]:
final_predictions

# Post Processing & Submission 

In [None]:
# Sample submission used as the template; its first column becomes the index.
sub = pd.read_csv("../input/tabular-playground-series-jan-2022/sample_submission.csv",index_col = 0)

In [None]:
# Fill the submission's num_sold column, optionally scaled up by a fixed factor.
if POST_PROCESSING:
    # from previous run we are under predicting, lets scale the values upwards
    print("Scaling predictions ")
    print("preds_prior:", final_predictions)
    
    # 1.143 is a hand-picked correction factor.
    sub["num_sold"] = final_predictions*1.143
    
    print("preds after:", np.array(sub["num_sold"]))
else:
    sub["num_sold"] = final_predictions

In [None]:
# Write the submission file, index column included.
sub.to_csv("submission.csv")

In [None]:
sub.head()

# Training Visualization

In [None]:
sub.head()

In [None]:
# For visuals only: `test` was replaced by the scaler output earlier, so the
# raw file is read again to get the dates for the x-axis.
test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv",index_col = 0)
test["date"] = pd.to_datetime(test["date"])

fig,ax = plt.subplots(2,1, figsize=(25,20),sharey= True)

# Residuals on the validation rows.
diff = y_test - test_predictions
# Top panel: validation actuals (labelled "Train Actual"), validation
# predictions and the test-set predictions from the submission frame.
sns.lineplot(ax=ax[0], data= y_test, label="Train Actual",ci=None)
sns.lineplot(ax=ax[0], data = y_test,x = y_test.index , y = test_predictions, label ="Validation Prediction" ,ci=None)
sns.lineplot(ax=ax[0],data =sub, x= test["date"], y = "num_sold",label="Final Prediction" ,ci=None) 

ax[0].set_title(f"Actual and Predicted Sales for {MODEL_TYPE}")

# Bottom panel: the residuals computed above.
sns.lineplot(ax=ax[1], data = diff, label ="Residuals" )
ax[1].set_title(f"Residuals for {MODEL_TYPE} for 2018")

plt.show()

In [None]:
# Whole training history (num_sold over the date index) followed by the
# submitted test-set predictions.
plt.figure(figsize=(25,10))

sns.lineplot(data= train["num_sold"] ,label="Train Actual",ci=None)
sns.lineplot(data =sub, x= test["date"], y = "num_sold",label="Final Prediction" ,ci=None) 
plt.title("Actual and Predicted Sales")

plt.show()

## Previous  Runs 

#### Best - 8.10587538123248  - no linear - holidays + next holiday  

SMAPE : 7.883328735411218

 EPOCHS: 10000

 SCALER: MinMax

 PARAMS: {'objective': 'poisson', 'learning_rate': 0.007502236406176916, 'lambda_l1': 4.901678633449036e-08, 'lambda_l2': 0.04542006655651054, 'num_leaves': 176, 'max_bin': 748, 'max_depth': 10, 'metric': 'mape'}

 Holidays : True

 Next Holiday : True

 Linear Date Augmentation : False

 POST_PROCESSING: False
 
 
## Date Aug

# Copy of the engineer() feature function as recorded with the run above;
# the code matches the definition used earlier in this notebook.
def engineer(df):
    """Add GDP and calendar features to ``df`` in place and return it.

    Expects a datetime ``date`` column and a ``country`` column; ``store``
    and ``product`` are also needed when ``LINEAR_DATE_AUG`` is enabled.
    """
    #get GDP from file 
    df["gdp"] = df.apply(get_gdp, axis=1)   #improves Huber & Tweedie & catboost
    
    df["day"] = df["date"].dt.day
    df["dayofweek"] = df["date"].dt.dayofweek
    df["month"] = df["date"].dt.month
    df["year"] = df["date"].dt.year
    
    #play around with if Tree model - each varies 
    #df['dayofyear'] = df['date'].dt.dayofyear                ### This can cause noise 
    # Days remaining in a 365-day year ...
    df['inverse_dayofyear'] = 365 - df['date'].dt.dayofyear    # good for all  except catboost
    # ... plus one for 2016 only, the single leap year handled here.
    df.loc[df["year"] == 2016 , "inverse_dayofyear"] = df.loc[df["year"] == 2016 , "inverse_dayofyear"]+1     #Leap year in 2016
    df['quarter'] = 'Q' + df['date'].dt.quarter.astype(str)      # Good for lightgbm & Huber, bad for Tweedie & catboost
    df['daysinmonth'] = df['date'].dt.days_in_month           ## Bad for all except Lightgbm
     
    # catboost and lightgbm dont like this   
    #df["Friday"] = df["dayofweek"] ==4
    #df["Sat_sun"] = (df["dayofweek"] ==5) |(df["dayofweek"] ==6)
    
    if LINEAR_DATE_AUG:
        # Boolean indicator columns replace the raw categorical columns.
        for country in ['Finland', 'Norway']:
            df[country] = df.country == country
        df['KaggleRama'] = df.store == 'KaggleRama'
        for product in ['Kaggle Mug', 'Kaggle Sticker']:
            df[product] = df['product'] == product
            
        df.drop(["country","store","product"], axis =1, inplace = True)

        # Seasonal variations (Fourier series)
        dayofyear = df.date.dt.dayofyear
        for k in range(1, 20):
            df[f'sin{k}'] = np.sin(dayofyear / 365 * 2 * math.pi * k)
            df[f'cos{k}'] = np.cos(dayofyear / 365 * 2 * math.pi * k)
            # Per-product interactions of the seasonal terms.
            df[f'mug_sin{k}'] = df[f'sin{k}'] * df['Kaggle Mug']
            df[f'mug_cos{k}'] = df[f'cos{k}'] * df['Kaggle Mug']
            df[f'sticker_sin{k}'] = df[f'sin{k}'] * df['Kaggle Sticker']
            df[f'sticker_cos{k}'] = df[f'cos{k}'] * df['Kaggle Sticker']

    return df