In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from math import nan

from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.pipeline import Pipeline 

pd.set_option('display.max_columns', None)

In [2]:
class DataImputation(TransformerMixin, BaseEstimator):
    """Impute missing values in non-object columns with their fitted median.

    The medians are computed in ``fit`` and reused unchanged in ``transform``,
    so the frame passed to ``transform`` does not influence the fill values.

    Attributes:
        features: Column names of the frame most recently passed to ``fit``
            or returned by ``transform``.
        num_cols: Names of the columns whose dtype is not ``object``.
        dict_num: Mapping from each name in ``num_cols`` to its median.
    """

    def __init__(self):
        """Initializes the attributes with empty placeholders."""
        self.features = []
        self.num_cols = []
        self.dict_num = {}

    def fit(self, X, y=None):
        """Learn the per-column medians used for imputation.

        Args:
            X: pandas DataFrame to learn the medians from.
            y: Unused; accepted so the transformer can sit in a pipeline.

        Returns:
            The fitted transformer (``self``).
        """
        self.num_cols = list(X.select_dtypes(exclude='object').columns)
        self.dict_num = {col: X[col].median() for col in self.num_cols}
        # transform returns every input column, so the output names are
        # already known here; recording them makes get_feature_names_out
        # usable directly after fit.
        self.features = X.columns.values
        return self

    def transform(self, X, y=None):
        """Fill the missing values of the fitted columns.

        No columns are added or removed; ``X`` itself is not modified.

        Args:
            X: pandas DataFrame containing at least the columns seen in ``fit``.
            y: Unused; accepted so the transformer can sit in a pipeline.

        Returns:
            A copy of ``X`` in which missing entries of each fitted column are
            replaced by that column's fitted median.
        """
        df = X.copy()
        for col in self.num_cols:
            missing = df[col].isna()
            # Skip the assignment entirely when nothing is missing.
            if missing.any():
                df.loc[missing, col] = self.dict_num[col]

        self.features = df.columns.values
        return df

    def get_feature_names_out(self, input_features=None):
        """Returns the names of the features.

        Args:
            input_features: Unused; accepted for signature compatibility.
        """
        return self.features

In [36]:
class FeatureEngineer(TransformerMixin, BaseEstimator):
    """Generates lagged copies of selected columns.

    Args:
        dict_lag: Mapping from a column name to an iterable of lags.
            A lag of 0 copies the column under its original name; a positive
            lag ``k`` produces ``<column>_lag_by_<k>``, the column shifted
            down by ``k`` rows. Negative lags are rejected.

    Attributes:
        features: Names of the generated columns, in creation order.
        dict_lag: The mapping passed to the constructor.
    """

    def __init__(self, dict_lag):
        """Initializes the attributes."""
        self.features = []
        self.dict_lag = dict_lag

    @staticmethod
    def _column_name(col, lag):
        """Return the output column name for ``col`` at ``lag``.

        Raises:
            ValueError: If ``lag`` is negative. Such a lag would otherwise be
                treated like lag 0 without any notice.
        """
        if lag < 0:
            raise ValueError(
                f"Negative lag {lag!r} for column {col!r} is not supported; "
                "use 0 for the unshifted column or a positive lag."
            )
        return col + '_lag_by_' + str(lag) if lag > 0 else col

    def fit(self, X, y=None):
        """Validate the lag configuration and record the output column names.

        Nothing is learned from the data; ``X`` is not read.

        Args:
            X: pandas DataFrame (unused).
            y: Unused; accepted so the transformer can sit in a pipeline.

        Returns:
            The transformer itself (``self``).

        Raises:
            ValueError: If ``dict_lag`` contains a negative lag.
        """
        names = []
        for col, lags in self.dict_lag.items():
            for lag in lags:
                name = self._column_name(col, lag)
                # A repeated name overwrites the same column in transform,
                # so it must appear only once here as well.
                if name not in names:
                    names.append(name)
        self.features = pd.Index(names, dtype=object).values
        return self

    def transform(self, X, y=None):
        """Generate the configured columns.

        Shifting happens within ``X`` only, so the first ``k`` rows of a
        ``lag_by_k`` column are NaN.

        Args:
            X: pandas DataFrame containing every column named in ``dict_lag``.
            y: Unused; accepted so the transformer can sit in a pipeline.

        Returns:
            A new DataFrame holding only the generated columns. Its index has
            the values of ``X.index`` and is named ``'date'``.

        Raises:
            ValueError: If ``dict_lag`` contains a negative lag.
        """
        # Start from a column-less frame that carries X's index under the
        # name 'date'.
        df = pd.DataFrame.from_dict({'date': X.index})
        df = df.set_index('date')
        for col, lags in self.dict_lag.items():
            for lag in lags:
                name = self._column_name(col, lag)
                if lag > 0:
                    df[name] = X.loc[:, col].shift(lag)
                else:
                    df[name] = X.loc[:, col]

        self.features = df.columns.values
        return df

    def get_feature_names_out(self, input_features=None):
        """Returns the names of the features.

        Args:
            input_features: Unused; accepted for signature compatibility.
        """
        return self.features

In [5]:
# Load the combined data set; its index values are stored in a separate CSV.
path = '../data/processed/combined_data.csv'
df = pd.read_csv(path)

# The index file is read without a header row and its column is labelled
# 'Datum'. The first parsed row is then discarded (presumably the file's own
# header line -- TODO confirm).
index_list = pd.read_csv(
    '../data/processed/index_list_combined_data.csv',
    names=['Datum'],
)
index_list = index_list.drop(index_list.index[0])

# Use the parsed YYYY-MM-DD dates as the index of df.
df.index = pd.to_datetime(index_list['Datum'], format='%Y-%m-%d')

In [32]:
# Number of leading rows that form the training set; the rest is the test set.
number_train = 70

# Columns are selected by position in df. Each one maps to the lags to
# generate for it (0 keeps the column unshifted).
dict_lag = {
    df.columns[76]: [1, 2, 3],
    df.columns[93]: [0],
}

# Keep only the configured columns, then split by row position.
df_selected_col = df[list(dict_lag.keys())]
df_train = df_selected_col.iloc[:number_train]
df_test = df_selected_col.iloc[number_train:]
df_train

Unnamed: 0_level_0,ErzPr_Schokoladen und andere Süßwaren,VPI_Schokoladen
Datum,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-31,97.0,99.2
2019-02-28,97.0,99.9
2019-03-31,96.9,101.1
2019-04-30,97.0,99.8
2019-05-31,97.0,101.0
...,...,...
2024-06-30,151.6,142.3
2024-07-31,148.5,141.5
2024-08-31,150.2,140.3
2024-09-30,147.5,142.7


In [30]:
# Scratch check: a column-less frame that carries only df_train's index,
# renamed to 'date'.
df_neu = pd.DataFrame.from_dict({'date': df_train.index}).set_index('date')
df_neu

2019-01-31
2019-02-28
2019-03-31
2019-04-30
2019-05-31
...
2024-06-30
2024-07-31
2024-08-31
2024-09-30
2024-10-31


In [37]:
# Imputation runs before lag generation, so the NaNs that shifting introduces
# in the first rows are not filled.
pipeline = Pipeline(
    steps=[
        ('DaIm', DataImputation()),
        ('FE', FeatureEngineer(dict_lag)),
    ]
)

# Fit on the training rows only, then apply the fitted steps to the test rows.
df_train_transf = pipeline.fit_transform(df_train)
df_test_transf = pipeline.transform(df_test)
df_train_transf



Unnamed: 0_level_0,ErzPr_Schokoladen und andere Süßwaren_lag_by_1,ErzPr_Schokoladen und andere Süßwaren_lag_by_2,ErzPr_Schokoladen und andere Süßwaren_lag_by_3,VPI_Schokoladen
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-31,,,,99.2
2019-02-28,97.0,,,99.9
2019-03-31,97.0,97.0,,101.1
2019-04-30,96.9,97.0,97.0,99.8
2019-05-31,97.0,96.9,97.0,101.0
...,...,...,...,...
2024-06-30,147.0,146.2,138.2,142.3
2024-07-31,151.6,147.0,146.2,141.5
2024-08-31,148.5,151.6,147.0,140.3
2024-09-30,150.2,148.5,151.6,142.7


In [19]:
# Display the transformed training frame.
# NOTE(review): this cell's execution count (19) is lower than that of the
# cell that builds df_train_transf (37), so the saved output may be stale --
# re-run to confirm.
df_train_transf

Unnamed: 0_level_0,ErzPr_Schokoladen und andere Süßwaren,VPI_Schokoladen,ErzPr_Schokoladen und andere Süßwaren_lag_by_1,ErzPr_Schokoladen und andere Süßwaren_lag_by_2,ErzPr_Schokoladen und andere Süßwaren_lag_by_3
Datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-31,97.0,99.2,,,
2019-02-28,97.0,99.9,97.0,,
2019-03-31,96.9,101.1,97.0,97.0,
2019-04-30,97.0,99.8,96.9,97.0,97.0
2019-05-31,97.0,101.0,97.0,96.9,97.0
...,...,...,...,...,...
2024-06-30,151.6,142.3,147.0,146.2,138.2
2024-07-31,148.5,141.5,151.6,147.0,146.2
2024-08-31,150.2,140.3,148.5,151.6,147.0
2024-09-30,147.5,142.7,150.2,148.5,151.6
