## Experiments :: Regression Models

In [1]:
%load_ext autoreload
%autoreload 2

from datetime import date, timedelta
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
import pandas as pd
from prophet.make_holidays import make_holidays_df

## Constants

In [2]:
# Project root is the parent of the kernel's working directory.
# The previous `Path("__file__")` wrapped a *string literal*, not the
# `__file__` dunder, so it was only a roundabout spelling of exactly this.
PROJECT_ROOT = Path.cwd().resolve().parent

DATA_DPATH = PROJECT_ROOT / "data"
assert DATA_DPATH.exists(), f"Data directory not found: {DATA_DPATH}"

# Forecast horizon; the lag features below start at this offset.
HORIZON = 30
# Last date (inclusive) of the training period; test rows start the day after.
SPLIT_DATE = date(2020, 2, 29)

## Data Loading 

In [None]:
def _load_dataset(csv_fpath):
    """Read one dataset CSV (first column as index) and parse its `timestamp` column."""
    frame = pd.read_csv(csv_fpath, index_col=0)
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    return frame


datasets_dpath = DATA_DPATH / "datasets"
train_fpath = datasets_dpath / "train.csv"
test_fpath = datasets_dpath / "test.csv"

train_df = _load_dataset(train_fpath)
test_df = _load_dataset(test_fpath)

train_df.shape, test_df.shape

## Feature Preparation 

In [None]:
# Stack train and test rows into one frame indexed by timestamp; the feature
# cells below operate on this combined frame and the split is re-applied later.
combined_df = pd.concat([train_df, test_df], axis=0)
df = combined_df.set_index("timestamp")

df.head()

### Day-Time Features

In [None]:
# Calendar features derived from the DatetimeIndex.
# `.tolist()` hands pandas plain Python values, so assignment is positional.
df["dayofmonth"] = df.index.day.tolist()  # fixed: was `.hour`, which is not the day of month
df["dayofweek"] = df.index.dayofweek.tolist()
df["quarter"] = df.index.quarter.tolist()
df["month"] = df.index.month.tolist()
df["dayofyear"] = df.index.dayofyear.tolist()
df["weekofyear"] = df.index.isocalendar().week.tolist()

# pandas counts Monday as 0, so 5 and 6 are Saturday and Sunday.
df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype("int64")

df.head()

### Holiday Feature

In [None]:
# Build the holiday calendar for the covered years, flag every listed date,
# and index it by `timestamp` so it lines up with the main frame's index.
holidays_df = (
    make_holidays_df(year_list=[2018, 2019, 2020], country="RU")
    .assign(is_holiday=1)
    .rename(columns={"ds": "timestamp"})
    .set_index("timestamp")
)

holidays_df.head()

In [None]:
# Flag holidays by calendar date rather than by row position.
# The previous outer merge appended holiday dates lying outside the data range
# as extra rows, so the positional `shift` on the edge rows read the flag of a
# far-away holiday row instead of the adjacent calendar day. Holiday dates
# listed more than once would also have duplicated observation rows.
holiday_dates = holidays_df.index

# Column name -> day offset relative to the row's own timestamp:
# "prior_k" looks k days ahead (a holiday is coming), "lag_k" looks k days back.
holiday_offsets = {
    "is_holiday": 0,
    "holiday_prior_1": 1,
    "holiday_prior_2": 2,
    "holiday_lag_1": -1,
    "holiday_lag_2": -2,
}
for column, day_offset in holiday_offsets.items():
    shifted_days = df.index + pd.Timedelta(days=day_offset)
    # Floats (1.0 / 0.0) keep the dtype the merge + fillna version produced.
    df[column] = shifted_days.isin(holiday_dates).astype(float)

# Rows without a target are unusable; the sort keeps the chronological order
# that the outer merge used to return (stable, so ties keep their order).
df = df.dropna(subset=["target"]).sort_index(kind="mergesort")

df.head()

### Lag Features

In [None]:
LAGS = [7, 10, 15]

# Shift by `lag_value` itself. The previous code shifted by the outer-loop
# `lag` (7 / 10 / 15), so every `lag_<n>` column ended up holding the same
# series and looked back fewer steps than HORIZON.
# A single pass over HORIZON .. HORIZON + max(LAGS) - 1 yields the same set of
# column names as the former nested loop, whose ranges overlapped.
for lag_value in range(HORIZON, HORIZON + max(LAGS)):
    feature_name = f"lag_{lag_value}"
    df[feature_name] = df["target"].shift(lag_value)

    # Rolling means are taken over the already-lagged series.
    df[f"{feature_name}_rolling_mean_30"] = df[feature_name].rolling(30).mean()
    df[f"{feature_name}_rolling_mean_7"] = df[feature_name].rolling(7).mean()

# Shifts and rolling windows leave NaNs at the start of the series; drop those rows.
df = df.dropna()

df.head()

In [None]:
# Chronological split: training rows run up to SPLIT_DATE (inclusive) and the
# test rows start on the following day.
test_start = SPLIT_DATE + timedelta(days=1)
train_df = df.loc[:SPLIT_DATE]
test_df = df.loc[test_start:]

# Features are every column except the target; targets stay one-column frames.
X_train, y_train = train_df.drop(columns="target"), train_df[["target"]]
X_test, y_test = test_df.drop(columns="target"), test_df[["target"]]

train_df.shape, test_df.shape

## Model Training :: LinearRegression

In [None]:
# Fit the linear baseline on the prepared training features; `fit` returns the
# estimator itself, so `model` is the fitted instance and the cell displays it.
model = LinearRegression().fit(X=X_train, y=y_train)
model

## Predictions :: LinearRegression

In [11]:
# Predict on the held-out test features with the LinearRegression fitted above.
pred = model.predict(X_test)

In [None]:
# Side-by-side frame of actuals (`y_test`) and predictions (`y_pred`),
# sharing the test period's timestamp index.
pred_df = (
    test_df[["target"]]
    .rename(columns={"target": "y_test"})
    .assign(y_pred=pred)
)
pred_df.head()

In [13]:
# Persist the predictions next to the other data artefacts, creating the
# output folder on first use.
pred_dpath = DATA_DPATH.joinpath("predictions")
pred_dpath.mkdir(exist_ok=True, parents=True)

output_fpath = pred_dpath / "linear_regression_predictions.csv"
pred_df.to_csv(output_fpath)

## Model Training :: LGBM 

In [None]:
# Fit the gradient-boosting model with a fixed seed for repeatable runs.
# Note that this rebinds `model`, replacing the LinearRegression instance.
model = LGBMRegressor(random_state=42).fit(X=X_train, y=y_train)
model

## Predictions :: LGBM

In [15]:
# `model` is now the LGBMRegressor fitted above (the name is reused from the
# LinearRegression section); predict on the same held-out test features.
pred = model.predict(X_test)

In [None]:
# Side-by-side frame of actuals (`y_test`) and LGBM predictions (`y_pred`),
# sharing the test period's timestamp index.
pred_df = (
    test_df[["target"]]
    .rename(columns={"target": "y_test"})
    .assign(y_pred=pred)
)
pred_df.head()

In [17]:
# Persist the LGBM predictions alongside the other prediction files,
# creating the output folder if it does not exist yet.
pred_dpath = DATA_DPATH.joinpath("predictions")
pred_dpath.mkdir(exist_ok=True, parents=True)

output_fpath = pred_dpath / "lgbm_predictions.csv"
pred_df.to_csv(output_fpath)