### Objective
The objective of this notebook is to perform outlier treatment on the dependent variable (y). Outlier detection and treatment are performed on each time series using the algorithm specified in the config file. The following algorithms have been implemented:
* Hampel Filter
* IQR method with outlier capping

In [0]:
import yaml
import glob
import numpy as np
import pandas as pd
from datetime import datetime
from numba import njit
import os
import dotsi

In [0]:
%run ../0_Config.ipynb

In [0]:
# Results of this notebook are written under <output_dir>/Data_Processing/Outlier_treatment
output_directory = app_config['output_dir_path']
root_dir, category = "Data_Processing", "Outlier_treatment"
algo_path = os.path.join(output_directory, root_dir, category)
if not os.path.exists(algo_path):
    # First run for this output location: create the folder tree and echo where it is
    os.makedirs(algo_path)
    print(algo_path)

#### Hampel filter - Implementation

In [0]:
@njit()
def hampel_filter_forloop_numba(
    input_series: np.ndarray, window_size: int, n_sigmas: int = 3
) -> np.ndarray:
    """Replace outliers in a 1-D array with the local median (Hampel filter).

    For every point that has ``window_size`` neighbours on both sides, the
    median and the scaled median absolute deviation (MAD) of the surrounding
    window are computed. A point lying more than ``n_sigmas`` scaled MADs
    away from the window median is replaced by that median.

    Parameters
    ----------
    input_series : numpy.ndarray
        One-dimensional array containing the actual values. It is not
        modified; a copy is returned.
    window_size : int
        Number of neighbours taken on EACH side of the point, so every window
        holds ``2 * window_size + 1`` values.
    n_sigmas : int, optional
        Number of scaled MADs beyond which a point is treated as an outlier,
        by default 3

    Returns
    -------
    numpy.ndarray
        Copy of the input with the detected outliers replaced by the median
        of their window. The first and last ``window_size`` points are never
        examined and are returned unchanged.

    Notes
    -----
    NaNs inside a window are ignored by ``np.nanmedian``. Replacement values
    are written into a copy that keeps the dtype of ``input_series``.
    """
    n = len(input_series)
    new_series = input_series.copy()
    k = 1.4826  # scale factor for Gaussian distribution

    for i in range(window_size, n - window_size):
        # Centred window: `window_size` points before, the point itself and
        # `window_size` points after (the slice end is exclusive, hence +1).
        window = input_series[(i - window_size):(i + window_size + 1)]
        x0 = np.nanmedian(window)
        S0 = k * np.nanmedian(np.abs(window - x0))
        if np.abs(input_series[i] - x0) > n_sigmas * S0:
            new_series[i] = x0

    return new_series
  
def hampel_filter_UDF(df: pd.DataFrame) -> pd.DataFrame:
    """Apply the Hampel filter to the dependent-variable column of one time series.

    The window size, the number of sigmas and the name of the dependent
    variable are read from the module-level "broadcast" objects
    (``broadcast_window_size``, ``broadcast_sigmas`` and ``broadcast_dv``),
    which must be defined before this function is called.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing raw data obtained from user

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` whose dependent-variable column holds the values
        returned by ``hampel_filter_forloop_numba``. ``df`` itself is left
        untouched.
    """
    window_size = int(broadcast_window_size.value)
    no_of_sigma = int(broadcast_sigmas.value)
    y = broadcast_dv.value
    treated = hampel_filter_forloop_numba(df[y].to_numpy(), window_size, no_of_sigma)
    # Write into a copy and return it: `DataFrame.assign` is not in-place, and
    # the keyword form `assign(y=...)` would target a column literally named
    # "y" rather than the configured dependent variable.
    result = df.copy()
    result[y] = treated
    return result

#### Inter-Quartile-Range - Implementation

In [0]:
def _midpoint_percentile(values, q):
    """Return the ``q``-th percentile of ``values`` using midpoint interpolation."""
    try:
        return np.percentile(values, q, method="midpoint")
    except TypeError:
        # NumPy releases older than 1.22 only know this option as `interpolation`.
        return np.percentile(values, q, interpolation="midpoint")


def inter_quartile_range_numba(
    input_series: np.ndarray, lower_quantile_value: float, upper_quantile_value: float
) -> np.ndarray:
    """Cap IQR outliers of a series at the requested lower/upper percentiles.

    Values below ``Q1 - 1.5 * IQR`` are replaced by the
    ``lower_quantile_value``-th percentile of the series and values above
    ``Q3 + 1.5 * IQR`` by the ``upper_quantile_value``-th percentile. All
    percentiles use midpoint interpolation.

    Parameters
    ----------
    input_series : array-like
        The column in the dataframe containing actual values
    lower_quantile_value : float
        Percentile used as the replacement for low outliers. It is passed
        straight to ``np.percentile`` and is therefore read on the 0-100 scale.
    upper_quantile_value : float
        Percentile used as the replacement for high outliers, also on the
        0-100 scale.

    Returns
    -------
    numpy.ndarray
        Array of the same length as the input with the outliers replaced by
        the corresponding cap
    """
    lower_cap = _midpoint_percentile(input_series, lower_quantile_value)
    upper_cap = _midpoint_percentile(input_series, upper_quantile_value)
    Q1 = _midpoint_percentile(input_series, 25)
    Q3 = _midpoint_percentile(input_series, 75)
    IQR = Q3 - Q1
    low_lim = Q1 - 1.5 * IQR
    upp_lim = Q3 + 1.5 * IQR
    # Low outliers are capped first; the second pass runs on the already
    # capped values, exactly like the sequential np.where calls it replaces.
    capped = np.where(input_series < low_lim, lower_cap, input_series)
    capped = np.where(capped > upp_lim, upper_cap, capped)
    return capped
  
def inter_quartile_range_UDF(df: pd.DataFrame) -> pd.DataFrame:
    """Apply IQR outlier capping to the dependent-variable column of one time series.

    The two capping percentiles and the name of the dependent variable are
    read from the module-level "broadcast" objects
    (``broadcast_min_quantile``, ``broadcast_max_quantile`` and
    ``broadcast_dv``), which must be defined before this function is called.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing raw data obtained from user

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` whose dependent-variable column holds the values
        returned by ``inter_quartile_range_numba``. ``df`` itself is left
        untouched.
    """
    lower_cap = float(broadcast_min_quantile.value)
    upper_cap = float(broadcast_max_quantile.value)
    y = broadcast_dv.value
    treated = inter_quartile_range_numba(df[y].to_numpy(), lower_cap, upper_cap)
    # Write into a copy and return it: `DataFrame.assign` is not in-place, and
    # the keyword form `assign(y=...)` would target a column literally named
    # "y" rather than the configured dependent variable.
    result = df.copy()
    result[y] = treated
    return result

In [0]:
def data_loading(path):
    """Load a .csv/.parquet file, or every such file of a directory, into one dataframe.

    Parameters
    ----------
    path : str
        Path of a single ``.csv`` / ``.parquet`` file, or of a directory
        holding such files. For a directory, all ``.csv`` files are read when
        at least one is present; otherwise all ``.parquet`` files are read.
        Files are matched on their exact extension and read in sorted name
        order.

    Returns
    -------
    pd.DataFrame
        Content of the file, or the row-wise concatenation (with a fresh
        index) of all matching files of the directory

    Raises
    ------
    AssertionError
        If the path does not exist, or if no ``.csv`` / ``.parquet`` file is
        found. Raised explicitly so the check survives ``python -O``.
    """
    unsupported_msg = "Only .csv or .parquet file types are supported"
    readers = {
        ".csv": pd.read_csv,
        ".parquet": lambda file_path: pd.read_parquet(file_path, engine='pyarrow'),
    }

    if os.path.isdir(path):
        # Sorted so the row order of the result does not depend on the file system.
        file_names = sorted(os.listdir(path))
        extensions = {os.path.splitext(file_name)[1] for file_name in file_names}
        # CSV takes precedence over parquet when a directory holds both.
        for extension in (".csv", ".parquet"):
            if extension in extensions:
                read = readers[extension]
                frames = [
                    read(os.path.join(path, file_name))
                    for file_name in file_names
                    # Compare the real extension; a substring test would also
                    # pick up names such as "data.csv.bak".
                    if os.path.splitext(file_name)[1] == extension
                ]
                return pd.concat(frames, ignore_index=True)
        raise AssertionError(unsupported_msg)

    if os.path.isfile(path):
        read = readers.get(os.path.splitext(path)[1])
        if read is None:
            raise AssertionError(unsupported_msg)
        return read(path)

    raise AssertionError('Path specified is not correct')

#### Load data

In [0]:
# Raw input, read from the location given in the config
df = data_loading(app_config["input_file_path"])

# Column(s) that identify one time series
modeling_granularity_conf = app_config["modeling_granularity"]

# Dependent-variable name, wrapped so the UDFs can read it through `.value`
broadcast_dv = dotsi.Dict({"value": app_config['dependent_variable']})

# Granularity columns are handled as strings from here on
df[modeling_granularity_conf] = df[modeling_granularity_conf].astype(str)

# Outlier-treatment method (hampel_filter or inter_quartile_range) and its parameters
outlier_treatment_conf = app_config["data_processing"]["outlier_treatment"]
algorithm = outlier_treatment_conf["algorithm"]
algo_params = outlier_treatment_conf[algorithm]

# Default result: the untreated data
results_s = df.copy()

#### Calling Outlier detection and treatment by the chosen algorithm

In [0]:
# `== True` is kept on purpose: only a config value equal to True switches the
# treatment on, so a truthy string such as "False" does not.
if app_config["data_processing"]["outlier_treatment_needed"] == True:
    # Pick the per-series treatment and publish its parameters for the UDF.
    treatment_udf = None
    if algorithm == "hampel_filter":
        broadcast_window_size = dotsi.Dict({"value": algo_params['window_size']})
        broadcast_sigmas = dotsi.Dict({"value": algo_params['no_of_sigmas']})
        treatment_udf = hampel_filter_UDF
    elif algorithm == "inter_quartile_range":
        broadcast_min_quantile = dotsi.Dict({"value": algo_params['minimum_quantile']})
        broadcast_max_quantile = dotsi.Dict({"value": algo_params['maximum_quantile']})
        treatment_udf = inter_quartile_range_UDF

    # Any other algorithm name leaves `results_s` as the untreated copy.
    if treatment_udf is not None:
        # Accept a single column name as well as a list of column names.
        if isinstance(modeling_granularity_conf, str):
            granularity_cols = [modeling_granularity_conf]
        else:
            granularity_cols = list(modeling_granularity_conf)

        # Concatenated key column, kept so the exported file has the same
        # columns as before.
        df['gran_tempp'] = df[granularity_cols].astype(str).sum(axis=1)

        # Group on the real columns rather than on the concatenated key:
        # without a separator, ("ab", "c") and ("a", "bc") both collapse to
        # "abc" and would be treated as one series. `sort=False` keeps the
        # groups in order of first appearance.
        treated_groups = [
            treatment_udf(group)
            for _, group in df.groupby(granularity_cols, sort=False)
        ]
        # A single concat replaces the repeated concat inside the loop; an
        # input without rows still produces an empty frame.
        results_s = pd.concat(treated_groups) if treated_groups else pd.DataFrame()

#### Exporting Outlier treatment results

In [0]:
# The run timestamp (one-second resolution) is embedded in the file name
run_stamp = datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
results_s.to_csv(f"{algo_path}/Outlier_treatment_results ({run_stamp}).csv", index=False)