# Imputing Missing Values - Baseline Model
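Imputes every missing value with the closest (nearest) observed value in the same column; a column with no observed values at all is filled with 0.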

In [1]:
# import necessary packages
import pandas as pd
import numpy as np
import os

In [59]:
# function to fill missing values
def impute_missing_vals(df, attributes):

    """
    function that imputes missing values.

    Each column listed in ``attributes`` is handled on its own:

    * no missing values      -> left untouched
    * every value is missing -> filled with 0
    * one observed value     -> that value is spread over the whole column
                                (ffill fills the tail, bfill fills the head)
    * otherwise              -> filled by the closest values first (nearest
                                interpolation), then ffill / bfill for
                                anything still missing in the tail / head

    The input dataframe is not modified; the work is done on a copy.

    @param df: dataframe that has missing values to be
               imputed
           attributes: list of String, attributes (columns) of the
                       dataframe to impute
    @return df_clean: copy of df in which the listed attributes have
                      no missing values

    """
    # copy df
    df_clean = df.copy()
    n_rows = len(df_clean)

    for att in attributes:
        column = df_clean[att]
        # count the missing entries once per column
        n_missing = column.isnull().sum()

        if n_missing == 0:
            # nothing to impute; skipping also keeps complete columns
            # (including non-numeric ones) away from interpolate()
            continue

        if n_missing == n_rows:
            # no observed value to copy from
            filled = column.fillna(0)
        elif n_missing == n_rows - 1:
            # exactly one observed value: propagate it in both directions
            # (presumably special-cased because nearest interpolation needs
            # more than one observed point -- TODO confirm)
            filled = column.ffill().bfill()
        else:
            filled = column.interpolate(method='nearest', limit_direction='both')
            # cover whatever the interpolation left missing at the edges
            filled = filled.ffill().bfill()

        df_clean[att] = filled

    return df_clean

In [44]:
# folder with the raw per-patient files, and the file names in it
# in sorted order
train_dir = '../data/training_setA/'
patients = sorted(os.listdir(train_dir))

In [None]:
# impute missing values and create clean dfs for all patients
for p in patients:

    # read in patient data (pipe-separated file)
    df = pd.read_csv(os.path.join(train_dir, p), sep="|")
    # every column except the last one is imputed
    attributes = df.columns[:-1]

    # impute missing values
    df_clean = impute_missing_vals(df, attributes)

    # drop Unit1 and Unit2 (per the original notes: about half of their
    # values are missing, so these two features carry little information)
    # and EtCO2 (per the original notes: all values missing)
    df_clean = df_clean.drop(['Unit1', 'Unit2', 'EtCO2'], axis=1)

    # choose the output folder from the split the patient belongs to
    # NOTE(review): train_path and valid_path are not defined in the visible
    # part of this notebook -- they have to be set by an earlier cell
    if p in train_path:
        save_path = '../data/train_baseline/'
    elif p in valid_path:
        save_path = '../data/valid_baseline/'
    else:
        save_path = '../data/test_baseline/'

    # save new patient data
    # NOTE(review): to_csv writes the row index as an extra unnamed first
    # column by default; pass index=False if downstream readers do not
    # expect it -- confirm before changing
    df_clean.to_csv(save_path + p, sep='|')

    print(p)