## Experiments :: Regression Models

In [58]:
%load_ext autoreload
%autoreload 2

from datetime import date, timedelta
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
import pandas as pd
from prophet.make_holidays import make_holidays_df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Constants

In [59]:
PROJECT_ROOT = Path("__file__").resolve().parents[1]

DATA_DPATH = PROJECT_ROOT / "data"
assert DATA_DPATH.exists()

HORIZON = 30
SPLIT_DATE = date(2020, 2, 29)

## Data Loading 

In [60]:
train_fpath = DATA_DPATH / "datasets" / "train.csv"
train_df = pd.read_csv(train_fpath, index_col=0)
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])

test_fpath = DATA_DPATH / "datasets" / "test.csv"
test_df = pd.read_csv(test_fpath, index_col=0)
test_df["timestamp"] = pd.to_datetime(test_df["timestamp"])

train_df.shape, test_df.shape

((790, 2), (30, 2))

## Feature Preparation 

In [61]:
df = pd.concat((train_df, test_df)).set_index("timestamp")

df.head()

Unnamed: 0_level_0,target
timestamp,Unnamed: 1_level_1
2018-01-01,196.335144
2018-01-02,342.874069
2018-01-03,321.301506
2018-01-04,362.192585
2018-01-05,239.929


### Day-Time Features

In [62]:
df["dayofmonth"] = df.index.hour.tolist()
df["dayofweek"] = df.index.dayofweek.tolist()
df["quarter"] = df.index.quarter.tolist()
df["month"] = df.index.month.tolist()
df["dayofyear"] = df.index.dayofyear.tolist()
df["weekofyear"] = df.index.isocalendar().week.tolist()

df["is_weekend"] = 0
df.loc[df["dayofweek"].isin([5, 6]), "is_weekend"] = 1

df.head()

Unnamed: 0_level_0,target,dayofmonth,dayofweek,quarter,month,dayofyear,weekofyear,is_weekend
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01,196.335144,0,0,1,1,1,1,0
2018-01-02,342.874069,0,1,1,1,2,1,0
2018-01-03,321.301506,0,2,1,1,3,1,0
2018-01-04,362.192585,0,3,1,1,4,1,0
2018-01-05,239.929,0,4,1,1,5,1,0


## Holiday Feature

In [63]:
holidays_df = make_holidays_df(year_list=[2018, 2019, 2020], country="RU")

holidays_df["is_holiday"] = 1
holidays_df = holidays_df.rename(columns={"ds": "timestamp"})
holidays_df = holidays_df.set_index("timestamp")

holidays_df.head()

Unnamed: 0_level_0,holiday,is_holiday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,New Year Holidays,1
2018-01-02,New Year Holidays,1
2018-01-03,New Year Holidays,1
2018-01-04,New Year Holidays,1
2018-01-05,New Year Holidays,1


In [64]:
df = df.merge(holidays_df[["is_holiday"]], left_index=True, right_index=True, how="outer")
df["is_holiday"] = df["is_holiday"].fillna(0)
df["holiday_prior_1"] = df["is_holiday"].shift(-1)
df["holiday_prior_2"] = df["is_holiday"].shift(-2)
df["holiday_lag_1"] = df["is_holiday"].shift(1)
df["holiday_lag_2"] = df["is_holiday"].shift(2)
df = df.dropna(subset=["target"])

df.head()

Unnamed: 0_level_0,target,dayofmonth,dayofweek,quarter,month,dayofyear,weekofyear,is_weekend,is_holiday,holiday_prior_1,holiday_prior_2,holiday_lag_1,holiday_lag_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-01,196.335144,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,,
2018-01-02,342.874069,0.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,
2018-01-03,321.301506,0.0,2.0,1.0,1.0,3.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2018-01-04,362.192585,0.0,3.0,1.0,1.0,4.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2018-01-05,239.929,0.0,4.0,1.0,1.0,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


## Lag Features

In [65]:
LAGS = [7, 10, 15]

for lag in LAGS: 
    for lag_value in range(HORIZON, HORIZON + lag):
        feature_name = f"lag_{lag_value}"
        df[feature_name] = df["target"].shift(lag)

        df[f"{feature_name}_rolling_mean_30"] = df[feature_name].rolling(30).mean()
        df[f"{feature_name}_rolling_mean_7"] = df[feature_name].rolling(7).mean()


df = df.dropna()

df.head()

Unnamed: 0_level_0,target,dayofmonth,dayofweek,quarter,month,dayofyear,weekofyear,is_weekend,is_holiday,holiday_prior_1,...,lag_41_rolling_mean_7,lag_42,lag_42_rolling_mean_30,lag_42_rolling_mean_7,lag_43,lag_43_rolling_mean_30,lag_43_rolling_mean_7,lag_44,lag_44_rolling_mean_30,lag_44_rolling_mean_7
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-02-14,773.223933,0.0,2.0,1.0,2.0,45.0,7.0,0.0,0.0,0.0,...,291.560688,344.138785,344.138785,291.560688,344.138785,344.138785,291.560688,344.138785,344.138785,291.560688
2018-02-15,329.845516,0.0,3.0,1.0,2.0,46.0,7.0,0.0,0.0,0.0,...,319.230987,344.138785,349.065573,319.230987,344.138785,349.065573,319.230987,344.138785,349.065573,319.230987
2018-02-16,201.519331,0.0,4.0,1.0,2.0,47.0,7.0,0.0,0.0,0.0,...,334.031624,349.065573,349.271957,334.031624,349.065573,349.271957,334.031624,349.065573,349.271957,334.031624
2018-02-17,213.50621,0.0,5.0,1.0,2.0,48.0,7.0,1.0,0.0,0.0,...,355.44226,349.271957,350.204305,355.44226,349.271957,350.204305,355.44226,349.271957,350.204305,355.44226
2018-02-18,314.438632,0.0,6.0,1.0,2.0,49.0,7.0,1.0,0.0,0.0,...,368.678748,350.204305,349.804696,368.678748,350.204305,349.804696,368.678748,350.204305,349.804696,368.678748


In [66]:
train_df = df.loc[:SPLIT_DATE]
test_df = df.loc[(SPLIT_DATE + timedelta(days=1)):]

X_train = train_df.drop(columns=["target"])
y_train = train_df[["target"]]

X_test = test_df.drop(columns=["target"])
y_test = test_df[["target"]]

train_df.shape, test_df.shape

((746, 58), (30, 58))

## Model Training :: LinearRegression

In [67]:
model = LinearRegression()
model.fit(X_train, y_train)

## Predictions :: LinearRegression

In [68]:
pred = model.predict(X_test)

In [69]:
pred_df = test_df[["target"]].copy().rename(columns={"target": "y_test"})
pred_df["y_pred"] = pred 
pred_df.head()

Unnamed: 0_level_0,y_test,y_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-01,372.717726,636.218989
2020-03-02,267.364632,703.309159
2020-03-03,185.421321,624.121694
2020-03-04,58.17667,575.213619
2020-03-05,243.762992,740.022011


In [70]:
pred_dpath = DATA_DPATH / "predictions"
pred_dpath.mkdir(parents=True, exist_ok=True)

pred_df.to_csv(pred_dpath / "linear_regression_predictions.csv")

## Model Training :: LGBM 

In [71]:
model = LGBMRegressor(random_state=42)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11529
[LightGBM] [Info] Number of data points in the train set: 746, number of used features: 56
[LightGBM] [Info] Start training from score 638.324941


## Predictions :: LGBM

In [72]:
pred = model.predict(X_test)

In [73]:
pred_df = test_df[["target"]].copy().rename(columns={"target": "y_test"})
pred_df["y_pred"] = pred 
pred_df.head()

Unnamed: 0_level_0,y_test,y_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-01,372.717726,680.749982
2020-03-02,267.364632,380.058841
2020-03-03,185.421321,544.152559
2020-03-04,58.17667,522.814863
2020-03-05,243.762992,416.090111


In [74]:
pred_dpath = DATA_DPATH / "predictions"
pred_dpath.mkdir(parents=True, exist_ok=True)

pred_df.to_csv(pred_dpath / "lgbm_predictions.csv")