### Objective
This notebook performs missing value treatment using the techniques specified in the config file. The user can mix techniques by listing the relevant column names under each technique's section of the config file.

In [0]:
import yaml
import glob
import numpy as np
import pandas as pd
from distutils.command.config import config
from datetime import datetime
import os
import dotsi

In [0]:
%run ../0_Config.ipynb

In [0]:
# Resolve the folder that holds this step's results and create it on first use
output_directory = app_config['output_dir_path']
root_dir, category = "Data_Processing", "Missing_value_treatment"
algo_path = os.path.join(output_directory, root_dir, category)
algo_dir_missing = not os.path.exists(algo_path)
if algo_dir_missing:
    os.makedirs(algo_path)
    # The path is echoed only when the folder has just been created
    print(algo_path)

In [0]:
def mean_across_years(
    df: pd.DataFrame,
    date_col: str,
    numeric_cols: list,
    modeling_granularity: list,
    time_granularity: str = "weekly",
) -> pd.DataFrame:
    """Impute missing values with the mean of the same calendar period across years.

    For every combination of the modeling granularity columns and the calendar
    period (ISO week of year for "weekly", day + month for "daily") the mean of
    each numeric column is computed, and missing entries are replaced by it.
    The input dataframe is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe which contains values for all the variables
    date_col : str
        The column in the df dataframe which contains date values
    numeric_cols : list
        The list of columns containing numeric values
    modeling_granularity : list
        The list of columns containing modeling granularity metrics
    time_granularity : str, optional
        The time granularity at which the dataset is grouped, by default "weekly". Possible values - 'weekly','daily'

    Returns
    -------
    pd.DataFrame
        A new dataframe where missing values of ``numeric_cols`` are filled with the
        grouped mean. The date column is returned cast to ``str``. Columns named
        "week_of_year" (weekly) or "Day"/"Month" (daily) are used as temporary helpers
        and are not part of the result.

    Raises
    ------
    ValueError
        if ``time_granularity`` is not 'weekly' or 'daily'
    ValueError
        if it fails to convert date column to datetime datatype
    """
    if time_granularity not in ("weekly", "daily"):
        # Fail early with a clear message instead of an unbound-variable error further down
        raise ValueError(
            "Unsupported time_granularity %r. Possible values - 'weekly','daily'" % (time_granularity,)
        )

    # Work on a copy so helper columns and the datetime conversion never leak into the caller's frame
    df = df.copy()
    try:
        df[date_col] = pd.to_datetime(df[date_col])
    except (ValueError, TypeError, OverflowError) as err:
        raise ValueError("Date column is not datetime. Failed to convert.") from err

    if time_granularity == "weekly":
        df["week_of_year"] = df[date_col].dt.isocalendar().week
        model_level_col = ["week_of_year"]
    else:
        df["Day"] = df[date_col].dt.day
        df["Month"] = df[date_col].dt.month
        model_level_col = ["Day", "Month"]

    group_keys = list(modeling_granularity) + model_level_col
    numeric_cols = list(numeric_cols)

    # Per-group means, renamed with a "New_" prefix so they can sit next to the raw columns after the merge
    df_mean = (
        df[group_keys + numeric_cols]
        .groupby(group_keys)
        .mean()
        .rename(columns={x: "New_" + x for x in numeric_cols})
        .reset_index()
    )
    df_combine = df.merge(df_mean, on=group_keys, how="left")

    for x in numeric_cols:
        # Keep observed values, take the group mean only where the value is missing
        df_combine[x] = np.where(df_combine[x].isna(), df_combine["New_" + x], df_combine[x])
        df_combine = df_combine.drop(["New_" + x], axis=1)
    df_combine = df_combine.drop(model_level_col, axis=1)
    df_combine[date_col] = df_combine[date_col].astype(str)

    return df_combine

In [0]:
def impute_missing_data(
    df: pd.DataFrame,
    col_name: list,
    imputation_type: str,
    arbitrary_value: int = 0,
    window: int = None,
    modeling_granularity: list = None,
    time_granularity: str = None,
    date_col: str = None,
) -> pd.DataFrame:
    """Function to impute or fill the missing data with values based on the imputation type given

    The input dataframe is copied; the caller's frame is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        The raw dataset which contains the value for all variables
    col_name : list
        The list of names of the columns containing numerical values
    imputation_type : str
        The type of imputation based on which the missing values are filled. One of
        "Mean", "Median", "Scalar", "Backward_fill", "Forward_fill", "Linear_Interpolation",
        "Spline_Interpolation", "Mode", "Rolling_Mean", "Rolling_Median", "Mean_Across_Years"
    arbitrary_value : int, optional
        The value which is used to fill missing values when imputation type is scalar, by default 0
    window : int, optional
        The size of the window used in the rolling mean and median methods, by default None
    modeling_granularity : list, optional
        The list of names of columns of modeling granularity values, by default None (treated as an empty list)
    time_granularity : str, optional
        The time granularity values required for mean across years imputation type, by default None
    date_col : str, optional
        The name of the column containing date values, by default None

    Returns
    -------
    pd.DataFrame
        a copy of the input dataset after the missing values are imputed based on the given imputation type

    Raises
    ------
    ValueError
        if the window size is not provided for the rolling mean type
    ValueError
        if the window size is not provided for the rolling median type
    ValueError
        if the imputation type is not one of them in this function
    """
    # A fresh list per call; a list literal as the default would be shared between calls
    if modeling_granularity is None:
        modeling_granularity = []

    cleaned_data = df.copy()
    target = cleaned_data[col_name]

    if imputation_type == "Mean":
        cleaned_data[col_name] = target.fillna(target.mean())
    elif imputation_type == "Median":
        cleaned_data[col_name] = target.fillna(target.median())
    elif imputation_type == "Scalar":
        cleaned_data[col_name] = target.fillna(arbitrary_value)
    elif imputation_type == "Backward_fill":
        # The forward fill afterwards covers trailing gaps a backward fill cannot reach
        cleaned_data[col_name] = target.bfill().ffill()
    elif imputation_type == "Forward_fill":
        # The backward fill afterwards covers leading gaps a forward fill cannot reach
        cleaned_data[col_name] = target.ffill().bfill()
    elif imputation_type == "Linear_Interpolation":
        cleaned_data[col_name] = target.interpolate(method='linear').ffill().bfill()
    elif imputation_type == "Spline_Interpolation":
        # NOTE(review): this branch has always produced a linear interpolation - the former
        # `option='spline'` keyword is not an `interpolate` parameter, so the default
        # method='linear' was used. Kept linear so existing results do not change; a real
        # spline would need method='spline' plus an `order` (and scipy) - confirm intent.
        cleaned_data[col_name] = target.interpolate(method='linear').ffill().bfill()
    elif imputation_type == "Mode":
        # mode() can return several rows on ties; the first row is used
        cleaned_data[col_name] = target.fillna(target.mode().iloc[0])
    elif imputation_type == "Rolling_Mean":
        if window is None:
            raise ValueError("Window Size not provided for rolling mean.")
        rolled = target.rolling(window, min_periods=1).mean()
        cleaned_data[col_name] = np.where(target.isna(), rolled, target)
        # Windows without any observed value stay missing, so fill them from neighbours
        cleaned_data[col_name] = cleaned_data[col_name].ffill().bfill()
    elif imputation_type == "Rolling_Median":
        if window is None:
            raise ValueError("Window Size not provided for rolling median.")
        rolled = target.rolling(window, min_periods=1).median()
        cleaned_data[col_name] = np.where(target.isna(), rolled, target)
        cleaned_data[col_name] = cleaned_data[col_name].ffill().bfill()
    elif imputation_type == "Mean_Across_Years":
        cleaned_data = mean_across_years(cleaned_data, date_col, col_name, modeling_granularity, time_granularity)
        # Groups without any observed value stay missing, so fill them from neighbours
        cleaned_data[col_name] = cleaned_data[col_name].ffill().bfill()
    else:
        raise ValueError("Incorrect imputation type")
    return cleaned_data

In [0]:
def missing_value_treatment_UDF(df: pd.DataFrame) -> pd.DataFrame:
    """Function utilizing broadcasted information from the config file to treat the missing values in the input dataset

    Reads the module-level ``broadcast_algo_params``, ``broadcast_granularity`` and
    ``broadcast_date_col`` objects (each exposing ``.value``) and applies every configured
    technique that lists at least one column, in the order the techniques appear in the config.
    Config entries whose name is not one of the supported techniques are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        the raw dataset which contains the value for all variables

    Returns
    -------
    pd.DataFrame
        Returns a treated copy of the input dataframe; the input itself is left untouched
    """
    algo_params = broadcast_algo_params.value
    modeling_granularity = broadcast_granularity.value
    # Use the configured date column; a hard-coded "ds" only works when the config happens to use that name
    date_col = broadcast_date_col.value

    # Only techniques with at least one column configured are applied
    req_params = {algo: params for algo, params in algo_params.items() if len(params['cols']) > 0}

    supported_algos = {
        'Rolling_Mean', 'Rolling_Median', 'Scalar', 'Forward_fill', 'Backward_fill',
        'Linear_Interpolation', 'Spline_Interpolation', 'Mean', 'Median', 'Mode',
        'Mean_Across_Years',
    }
    # These two techniques never consult the 'zero_as_missing_value' flag
    algos_without_zero_flag = {'Forward_fill', 'Mode'}

    # Copy once so replacing zeros never writes into a slice of the caller's frame
    df = df.copy()

    for algo, params in req_params.items():
        if algo not in supported_algos:
            continue

        # Technique-specific keyword arguments for impute_missing_data
        extra_kwargs = {}
        if algo in ('Rolling_Mean', 'Rolling_Median'):
            extra_kwargs['window'] = int(params['window'])
        elif algo == 'Scalar':
            extra_kwargs['arbitrary_value'] = int(params['value'])
        elif algo == 'Mean_Across_Years':
            extra_kwargs['modeling_granularity'] = modeling_granularity
            extra_kwargs['time_granularity'] = params['time_granularity']
            extra_kwargs['date_col'] = date_col

        cols = params['cols']
        # Equality with True (not truthiness) is kept on purpose so config values behave as before
        if algo not in algos_without_zero_flag and params['zero_as_missing_value'] == True:
            df[cols] = df[cols].replace(0, np.nan)

        df = impute_missing_data(df, cols, algo, **extra_kwargs)
    return df

In [0]:
def data_loading(path):
    """Load a dataset from a single file or from every matching file in a directory.

    Parameters
    ----------
    path : str
        Path to a ``.csv``/``.parquet`` file, or to a directory containing such files.
        For a directory, CSV files take precedence: parquet files are read only when no
        file with a ``.csv`` extension is present. Files are read in sorted name order
        and stacked row-wise with a fresh index.

    Returns
    -------
    pd.DataFrame
        The loaded data

    Raises
    ------
    AssertionError
        if the path does not exist, or no ``.csv``/``.parquet`` file is found. Raised
        explicitly (not via ``assert``) so the check survives ``python -O``.
    """
    unsupported_msg = "Only .csv or .parquet file types are supported"

    if os.path.isdir(path):
        # Sorted so the row order of the result does not depend on the file system
        all_files = sorted(os.listdir(path))
        all_files_ext = {os.path.splitext(file_name)[1] for file_name in all_files}

        if ".csv" in all_files_ext:
            frames = [
                pd.read_csv(os.path.join(path, file_name))
                for file_name in all_files
                if '.csv' in file_name
            ]
        elif ".parquet" in all_files_ext:
            frames = [
                pd.read_parquet(os.path.join(path, file_name), engine='pyarrow')
                for file_name in all_files
                if '.parquet' in file_name
            ]
        else:
            raise AssertionError(unsupported_msg)
        # One concat at the end instead of growing a frame file by file
        return pd.concat(frames, ignore_index=True)

    if os.path.isfile(path):
        file_type = os.path.splitext(path)[1]
        if file_type == ".csv":
            return pd.read_csv(path)
        if file_type == ".parquet":
            return pd.read_parquet(path, engine='pyarrow')
        raise AssertionError(unsupported_msg)

    raise AssertionError('Path specified is not correct')

In [0]:
# Columns that identify one modeled series
modeling_granularity_conf = app_config["modeling_granularity"]

# Names of the date column and of the dependent variable
ds_config = app_config["date_var"]
dv_config = app_config["dependent_variable"]

# Wrap the settings in objects exposing `.value`, which is how the treatment function reads them
broadcast_algo_params = dotsi.Dict({"value": app_config['data_processing']['missing_value_treatment']})
broadcast_granularity = dotsi.Dict({"value": modeling_granularity_conf})
broadcast_date_col = dotsi.Dict({"value": ds_config})

#### Loading the latest Outlier_treatment file
##### Please update the reading path with the required data path if "Outlier treatment" was not run

In [0]:
# Pick the input: the most recent outlier-treatment output when that step is enabled, the raw input otherwise
if app_config["data_processing"]["outlier_treatment_needed"] == True:
    outlier_dir = app_config['output_dir_path'] + "/Data_Processing/Outlier_treatment"

    # Result files carry their creation timestamp in parentheses; strip the extension to parse it
    all_files = list(os.listdir(outlier_dir))
    outlier_op_files = [
        file.replace(".csv", "")
        for file in all_files
        if "Outlier_treatment_results (" in file
    ]
    version_dates = []
    for stem in outlier_op_files:
        stamp = stem.split('(')[1].replace(')', '')
        version_dates.append(datetime.strptime(stamp, '%Y-%m-%d-%H-%M-%S'))

    # Latest timestamp, formatted back to text so it can be matched against the file names
    max_date = max(version_dates).strftime('%Y-%m-%d-%H-%M-%S')
    req_file_name = [stem for stem in outlier_op_files if max_date in stem]
    outlier_op_file_path = os.path.join(outlier_dir, req_file_name[0] + ".csv")

    df = pd.read_csv(outlier_op_file_path)
else:
    df = data_loading(app_config["input_file_path"])

# Granularity columns used by the feature engineering steps
fe_config = app_config["data_processing"]['feature_engineering']
si_gran = fe_config['calculation_based']["higher_level_si_creation"]["granularity"]
grann = fe_config['prophet_based']["higher_level_si_trend_creation"]["granularity"]

# Every granularity column is handled as text, the date column as datetime
gran_all = list(set(modeling_granularity_conf + si_gran + grann))
df[gran_all] = df[gran_all].astype(str)
df[ds_config] = pd.to_datetime(df[ds_config], format=app_config["date_format_pandas"])

#### Merging granular level trend results if considered running "Granular Prophet Trend creation"

In [0]:
# Attach the granular (modeling-level) Prophet trend when that feature is enabled
if app_config["data_processing"]["feature_engineering"]['prophet_based']['consider_granular_prophet_trend'] == True:
    trend_dir = app_config['output_dir_path'] + "/Data_Processing/modeling_level_trend"

    # Result files carry their creation timestamp in parentheses; strip the extension to parse it
    all_files = list(os.listdir(trend_dir))
    trend_op_files = [
        file.replace(".csv", "")
        for file in all_files
        if "granular_level_trend_results (" in file
    ]
    version_dates = []
    for stem in trend_op_files:
        stamp = stem.split('(')[1].replace(')', '')
        version_dates.append(datetime.strptime(stamp, '%Y-%m-%d-%H-%M-%S'))

    # Latest timestamp, formatted back to text so it can be matched against the file names
    max_date = max(version_dates).strftime('%Y-%m-%d-%H-%M-%S')
    req_file_name = [stem for stem in trend_op_files if max_date in stem]
    trend_op_file_path = os.path.join(trend_dir, req_file_name[0] + ".csv")
    print(trend_op_file_path)

    # Keep only the rows flagged as successful
    trend_data = pd.read_csv(trend_op_file_path)
    trend_data = trend_data[trend_data["status"] == "success"]

    # Align key dtypes with the main frame before merging
    trend_data['ds'] = pd.to_datetime(trend_data['ds'])
    granularity_cols = app_config["modeling_granularity"]
    trend_data[granularity_cols] = trend_data[granularity_cols].astype(str)

    trend_data = trend_data.rename(columns={'trend': "granular_trend", 'ds': ds_config})

    # Left merge keeps every input row; rows without a trend value get 0
    trend_subset = trend_data[granularity_cols + [ds_config, 'granular_trend']]
    df = df.merge(trend_subset, on=[ds_config] + granularity_cols, how="left")
    df['granular_trend'] = df['granular_trend'].fillna(0)

#### Merging higher level trend and si results if considered running "Prophet Trend & SI creation"

In [0]:
# Attach the higher-level Prophet trend and seasonality components when that feature is enabled
if app_config["data_processing"]['feature_engineering']['prophet_based']["consider_prophet_trend_si"] == True:
    trend_si_dir = app_config['output_dir_path'] + "/Data_Processing/higher_level_trend_si"

    # Result files carry their creation timestamp in parentheses; strip the extension to parse it
    all_files = list(os.listdir(trend_si_dir))
    trend_op_files = [
        file.replace(".csv", "")
        for file in all_files
        if "higher_level_trend_si_results (" in file
    ]
    version_dates = []
    for stem in trend_op_files:
        stamp = stem.split('(')[1].replace(')', '')
        version_dates.append(datetime.strptime(stamp, '%Y-%m-%d-%H-%M-%S'))

    # Latest timestamp, formatted back to text so it can be matched against the file names
    max_date = max(version_dates).strftime('%Y-%m-%d-%H-%M-%S')
    req_file_name = [stem for stem in trend_op_files if max_date in stem]
    trend_op_file_path = os.path.join(trend_si_dir, req_file_name[0] + ".csv")
    print(trend_op_file_path)

    # Keep only the rows flagged as successful
    trend_data = pd.read_csv(trend_op_file_path)
    trend_data = trend_data[trend_data["status"] == "success"]

    # Align key dtypes with the main frame before merging
    trend_data['ds'] = pd.to_datetime(trend_data['ds'])
    trend_data[grann] = trend_data[grann].astype(str)

    column_renames = {
        'trend': "higher_level_trend",
        'yearly': 'prophet_si_yearly',
        'ds': ds_config,
        'weekly': 'prophet_si_weekly',
        'daily': 'prophet_si_daily',
    }
    trend_data = trend_data.rename(columns=column_renames)

    # Left merge keeps every input row; rows without a match get 0 in all four feature columns
    prophet_cols = ['higher_level_trend', 'prophet_si_yearly', 'prophet_si_weekly', 'prophet_si_daily']
    df = df.merge(trend_data[grann + [ds_config] + prophet_cols], on=[ds_config] + grann, how="left")
    df[prophet_cols] = df[prophet_cols].fillna(0)

#### Merging calculated seasonality index results if considered running "Calculated SI creation"

In [0]:
# Attach the calculated seasonality indices when that feature is enabled
if(app_config["data_processing"]['feature_engineering']['calculation_based']["consider_calulated_si"] == True):
    # Reading the latest input file based on timestamp
    all_files = [file for file in os.listdir(app_config['output_dir_path']+"/Data_Processing/Seasonality_Index")]
    si_op_files = [file for file in all_files if "SI_results (" in file]
    si_op_files = [file.replace(".csv","") for file in si_op_files]
    version_dates = [datetime.strptime(x.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S') for x in si_op_files]
    max_date = max(version_dates)
    max_date = max_date.strftime('%Y-%m-%d-%H-%M-%S')
    req_file_name = [x for x in si_op_files if max_date in x]
    si_op_file_path = os.path.join(app_config['output_dir_path']+"/Data_Processing/Seasonality_Index",req_file_name[0] + ".csv")
    print(si_op_file_path)
    # Reading the data, keeping only the rows flagged as successful
    si_data = pd.read_csv(si_op_file_path)
    si_data = si_data[si_data["status"] == "success"]

    # type conversions
    si_data[si_gran] = si_data[si_gran].astype(str)

    si_data_count = si_data.shape[0]
    if(si_data_count!=0):
        # renaming the columns
        si_data.rename(columns = {'ds':ds_config}, inplace = True)

        # One row per distinct date, in date order with a clean 0..n-1 index. The rows left by
        # drop_duplicates() keep the labels of the main frame, which are neither guaranteed to be
        # contiguous nor sorted, so the running week counter below must not rely on them.
        total_weeks_df = df[[ds_config]].drop_duplicates().sort_values(ds_config).reset_index(drop=True)

        # Frequency is inferred from the three most recent dates
        history_dates = pd.to_datetime(pd.Series(total_weeks_df[ds_config].unique(), name='ds')).sort_values()
        frequency = pd.infer_freq(history_dates.tail(3))
        if frequency is None:
            # infer_freq returns None when it cannot decide; fail with a clear message
            # instead of a TypeError on the substring checks below
            raise ValueError("Could not infer the date frequency from the last three dates of the data.")

        total_weeks_df['Month'] = total_weeks_df[ds_config].dt.month
        total_weeks_df['Qtr'] = total_weeks_df[ds_config].dt.quarter
        total_weeks_df["Year"] = total_weeks_df[ds_config].dt.year

        if(frequency == 'D'):
            total_weeks_df['Day'] = total_weeks_df[ds_config].dt.dayofyear
            total_weeks_df['Week'] = total_weeks_df[ds_config].dt.strftime( '%U').astype(int)
        elif('W-' in frequency):
            # Running count of the weekly observations within each year (1 for the first one)
            total_weeks_df['Week'] = total_weeks_df.groupby('Year').cumcount() + 1

        if(frequency == 'D'):
            # The first merge is on the calendar key only: it brings the granularity columns
            # into total_weeks_df, so the following merges can also match on them
            si_data_daily = si_data[['Day','cal_si_daily'] + si_gran].drop_duplicates()
            total_weeks_df = total_weeks_df.merge(si_data_daily,on = ['Day'],how='left')

            si_data_weekly = si_data[['Week','cal_si_weekly'] + si_gran].drop_duplicates()
            total_weeks_df = total_weeks_df.merge(si_data_weekly,on = ['Week'] + si_gran,how='left')

        elif('W-' in frequency):
            # Same idea for weekly data: this merge introduces the granularity columns
            si_data_weekly = si_data[['Week','cal_si_weekly'] + si_gran].drop_duplicates()
            total_weeks_df = total_weeks_df.merge(si_data_weekly,on = ['Week'],how='left')

        # monthly seasonality
        si_data_montly = si_data[['Month','cal_si_monthly'] + si_gran].drop_duplicates()
        total_weeks_df = total_weeks_df.merge(si_data_montly,on = ['Month'] + si_gran,how='left')

        # quarterly seasonality
        si_data_qtrly = si_data[['Qtr','cal_si_qtrly'] + si_gran].drop_duplicates()
        total_weeks_df = total_weeks_df.merge(si_data_qtrly,on = ['Qtr'] + si_gran,how='left')

        # daily/weekly indices exist only for the matching frequency
        si_cols = [var for var in ['cal_si_daily','cal_si_weekly'] if var in total_weeks_df.columns]  + ['cal_si_monthly','cal_si_qtrly']
        req_cols = si_gran + [ds_config] + si_cols

        # merging the seasonality indices with raw data; rows without a match get 0
        df = df.merge(total_weeks_df[req_cols],on = [ds_config] + si_gran, how = "left")
        df[si_cols] = df[si_cols].fillna(0)
    else:
        # No successful SI rows: add all four seasonality columns as 0
        df[['cal_si_daily', 'cal_si_weekly', 'cal_si_monthly', 'cal_si_qtrly']] = 0

#### Calling missing value treatment functions

In [0]:
# Concatenated granularity key; kept because it is part of the exported results
df['gran_tempp'] = df[modeling_granularity_conf].astype(str).sum(axis=1)
unique_pdts = df['gran_tempp'].unique()

# Treat each series separately. Grouping on the granularity columns themselves (not on the
# concatenated key) keeps series apart even when their concatenated keys coincide,
# e.g. ("a", "bc") and ("ab", "c"). sort=False keeps the order of first appearance.
treated_frames = [
    missing_value_treatment_UDF(series_df)
    for _, series_df in df.groupby(modeling_granularity_conf, sort=False, dropna=False)
]
# A single concat at the end; an empty input yields an empty frame
results_s = pd.concat(treated_frames) if treated_frames else pd.DataFrame()

#### Exporting Missing value treatment results

In [0]:
# Timestamp the output file so every run produces a new version
export_stamp = datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
export_path = algo_path + "/Missing_value_treatment_results (" + export_stamp + ").csv"
results_s.to_csv(export_path, index=False)

#### Sample examples for reference

In [0]:
# Small demo frame with missing values in each column; note that this rebinds `df`
df = pd.DataFrame({'A':[np.nan,1,2,3,np.nan,5],'B':[11,12,22,43,np.nan,65],'C':[0,1,2,np.nan,4,5]})
df

Unnamed: 0,A,B,C
0,,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,
4,,,4.0
5,5.0,65.0,5.0


In [0]:
# Mode: missing values take the first mode of their column
impute_missing_data(df, ['A','B','C'], 'Mode')

Unnamed: 0,A,B,C
0,1.0,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,0.0
4,1.0,11.0,4.0
5,5.0,65.0,5.0


In [0]:
# Mean: missing values take the column mean (`window` is not used by this imputation type)
impute_missing_data(df, ['A','B','C'], 'Mean', window = 4)

Unnamed: 0,A,B,C
0,2.75,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,2.4
4,2.75,30.6,4.0
5,5.0,65.0,5.0


In [0]:
# Median: missing values take the column median (`window` is not used by this imputation type)
impute_missing_data(df, ['A','B','C'], 'Median', window = 4)

Unnamed: 0,A,B,C
0,2.5,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,2.0
4,2.5,22.0,4.0
5,5.0,65.0,5.0


In [0]:
# Scalar: missing values take `arbitrary_value`, which defaults to 0
impute_missing_data(df, ['A','B','C'], 'Scalar')

Unnamed: 0,A,B,C
0,0.0,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,0.0
4,0.0,0.0,4.0
5,5.0,65.0,5.0


In [0]:
# Backward fill, followed by a forward fill for anything still missing
impute_missing_data(df, ['A','B','C'], 'Backward_fill')

Unnamed: 0,A,B,C
0,1.0,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,4.0
4,5.0,65.0,4.0
5,5.0,65.0,5.0


In [0]:
# Forward fill, followed by a backward fill for anything still missing (e.g. a leading gap)
impute_missing_data(df, ['A','B','C'], 'Forward_fill')

Unnamed: 0,A,B,C
0,1.0,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,2.0
4,3.0,43.0,4.0
5,5.0,65.0,5.0


In [0]:
# Linear interpolation, then forward/backward fill for gaps at the edges
impute_missing_data(df, ['A','B','C'], 'Linear_Interpolation')

Unnamed: 0,A,B,C
0,1.0,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,3.0
4,4.0,54.0,4.0
5,5.0,65.0,5.0


In [0]:
# 'Spline_Interpolation'; NOTE(review): the output below is identical to the linear interpolation result
impute_missing_data(df, ['A','B','C'], 'Spline_Interpolation')

Unnamed: 0,A,B,C
0,1.0,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,3.0
4,4.0,54.0,4.0
5,5.0,65.0,5.0


In [0]:
# Rolling mean over a window of 4 rows (min_periods=1), then forward/backward fill for what is left
impute_missing_data(df, ['A','B','C'], 'Rolling_Mean', window = 4)

Unnamed: 0,A,B,C
0,1.0,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,1.0
4,2.0,25.666667,4.0
5,5.0,65.0,5.0


In [0]:
# Rolling median over a window of 4 rows (min_periods=1), then forward/backward fill for what is left
impute_missing_data(df, ['A','B','C'], 'Rolling_Median', window = 4)

Unnamed: 0,A,B,C
0,1.0,11.0,0.0
1,1.0,12.0,1.0
2,2.0,22.0,2.0
3,3.0,43.0,1.0
4,2.0,22.0,4.0
5,5.0,65.0,5.0
