# Data Preprocessing

In [None]:
%load_ext autoreload
%autoreload 2

from datetime import date
from pathlib import Path

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including diagnostics from pandas and etna — consider narrowing the filter
# to specific categories or modules.
warnings.filterwarnings("ignore")

from etna.datasets.tsdataset import TSDataset
from etna.transforms import MedianOutliersTransform
from etna.transforms import TimeSeriesImputerTransform
import pandas as pd 

## Constants

In [4]:
# A notebook has no real `__file__`, so the project root is derived from the
# kernel's working directory: it is taken to be the directory that contains
# the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent

DATA_DPATH = PROJECT_ROOT / "data"
# Fail loudly with the offending path; a bare `assert` carries no message and
# is stripped when Python runs with -O.
if not DATA_DPATH.exists():
    raise FileNotFoundError(f"Data directory not found: {DATA_DPATH}")

## Functions

## Data Loading 

In [None]:
# Read the CSV (first column is the index), parse the timestamp column into
# datetimes, and expose the "quantity" column under the name "target".
df_fpath = DATA_DPATH / "preprocessed_data" / "resampled_product.csv"

df = (
    pd.read_csv(df_fpath, index_col=0)
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .rename(columns={"quantity": "target"})
)

df.shape

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

## Train/Test Split

In [None]:
# Last calendar day (inclusive) that belongs to the training set.
split_date = date(2020, 2, 29)

# Extract the calendar dates once and reuse them for both masks.
row_dates = df["timestamp"].dt.date

# Take explicit copies: columns added to either frame later must not be
# written into a filtered slice of `df` (pandas flags that with
# SettingWithCopyWarning).
train_df = df[row_dates <= split_date].copy()
test_df = df[row_dates > split_date].copy()

train_df.shape, test_df.shape

## Preprocessing 

In [None]:
# The training frame holds a single series, so a constant segment id is
# attached for TSDataset. `assign` builds a new frame rather than writing a
# column into the filtered slice produced by the train/test split.
train_df = train_df.assign(segment=10)
ts_train_df = TSDataset(train_df, freq="D")

# --- Outlier Processing --- #
outliers_remover = MedianOutliersTransform(in_column="target", window_size=30)
ts_train_df.fit_transform([outliers_remover])

# `outliers_timestamps` maps each segment to the timestamps flagged in it.
outliers_by_segment = outliers_remover.outliers_timestamps
print("number of series with outliers:", len(outliers_by_segment))
print("total number of outliers:", sum(len(values) for values in outliers_by_segment.values()))

# --- Null Filling --- #
imputer = TimeSeriesImputerTransform(in_column="target", strategy="running_mean", window=30)
ts_train_df.fit_transform([imputer])

# Back to a flat frame: drop rows whose target is still missing after
# imputation, then remove the helper segment column again.
preprocessed_train_df = (
    ts_train_df.to_pandas(flatten=True)
    .dropna(subset=["target"])
    .drop(columns=["segment"])
)

preprocessed_train_df.shape

In [None]:
# Sanity check for the null filling: the largest gap between consecutive
# timestamps in the preprocessed training data should be 1 day.
df_check = preprocessed_train_df.assign(
    date_tt_shifted=lambda frame: frame["timestamp"].shift()
)
# The first row has no predecessor, so its shifted value is missing.
df_check = df_check.dropna(subset=["date_tt_shifted"])
largest_gap = df_check["timestamp"].sub(df_check["date_tt_shifted"]).max()
print(largest_gap)

## Data Caching

In [12]:
# Persist the preprocessed training frame and the untouched test frame as CSV
# files under <data>/datasets, creating the directory when needed.
dataset_dpath = DATA_DPATH / "datasets"
dataset_dpath.mkdir(parents=True, exist_ok=True)

train_fpath = dataset_dpath / "train.csv"
test_fpath = dataset_dpath / "test.csv"

preprocessed_train_df.to_csv(train_fpath)
test_df.to_csv(test_fpath)