# Missing Values Imputation Function Using ML


## Steps:
1. Import Libraries
2. Load The Dataset
3. Find The Column With Missing Values And Store In An Object
4. Find The Column Based On Data Type
   1. Numeric
   2. Categorical
   3. Boolean
5. Define The Function To Impute The Missing Values
6. Apply The Function To Our Dataset With Missing Values
7. Check The Missing Values After Imputation

In [311]:
# 1. Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import accuracy_score,precision_score,mean_absolute_error,mean_squared_error,r2_score

# enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


In [312]:
# 2. Load the dataset from a CSV file; the path is relative to the current working directory
df = pd.read_csv('./heart_disease_uci.csv')
# Show the first rows as a quick check of the parsed columns
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [313]:
# 3. Count the missing values per column, keep only the columns that have any,
#    and store them (largest count first) in an object
missing_data_cols = (
    df.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
missing_data_cols

ca          611
thal        486
slope       309
fbs          90
oldpeak      62
trestbps     59
thalch       55
exang        55
chol         30
restecg       2
dtype: int64

In [314]:
# Show the column labels of the DataFrame
df.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [315]:
# 4. Split the column names by dtype
# Columns stored with the object dtype are treated as categorical
cat_cols = list(df.select_dtypes(include="object").columns)

# Columns stored as float64 or int64 are treated as numeric
num_cols = list(df.select_dtypes(include=["float64", "int64"]).columns)

# Report both groups
print('Categorical Columns:', cat_cols)
print('Numeric Columns:', num_cols)


Categorical Columns: ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
Numeric Columns: ['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num']


In [316]:
# 5. define function to impute missing values in categorical or object columns

def impute_cat_cols(df, passed_col):
    """Fill the missing values of a categorical column in place.

    A random-forest classifier is trained on the rows where ``passed_col`` is
    present, using every other column of ``df`` as a feature, and its
    predictions are written into the rows where ``passed_col`` is missing.

    Args:
        df: DataFrame to modify. ``passed_col`` is converted to the
            ``category`` dtype as a side effect.
        passed_col: Name of the categorical column to impute.

    Returns:
        None. ``df`` is updated in place.

    Raises:
        ValueError: If every value of ``passed_col`` is missing, so there is
            nothing to learn from.
    """
    # Ensure passed_col is categorical
    df[passed_col] = df[passed_col].astype('category')

    missing_mask = df[passed_col].isnull()
    if not missing_mask.any():
        # Nothing to impute; predicting on an empty frame would fail.
        return
    if missing_mask.all():
        raise ValueError(
            f"Column {passed_col!r} has no observed values to learn from"
        )

    # Encode text-like features once over ALL rows, with one encoder per
    # column, so training rows and prediction rows share the same codes.
    features = df.drop(columns=[passed_col])
    for col in features.columns:
        if features[col].dtype.name in ('category', 'object'):
            features[col] = LabelEncoder().fit_transform(features[col].astype(str))

    # NOTE(review): missing values in other numeric feature columns are passed
    # to the model unchanged; whether the estimator accepts them depends on the
    # installed scikit-learn version - confirm or impute those columns first.

    # Use the integer category codes as the target; missing entries (code -1)
    # are excluded by the mask.
    categories = df[passed_col].cat.categories
    target_codes = df[passed_col].cat.codes

    # Fit the model
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(features[~missing_mask], target_codes[~missing_mask])

    # Predict codes for the missing rows and translate them back to the
    # original category values before assigning.
    predicted_codes = rf_classifier.predict(features[missing_mask])
    df.loc[missing_mask, passed_col] = list(categories.take(predicted_codes))

# # Example usage:
# impute_cat_cols(df, 'restecg')


In [317]:
# Impute every categorical column that has missing values, one column at a time
for column in ('fbs', 'restecg', 'exang', 'slope', 'thal'):
    impute_cat_cols(df, column)


# Now check how many missing values remain in each column
print(df.isnull().sum())

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs           0
restecg       0
thalch       55
exang         0
oldpeak      62
slope         0
ca          611
thal          0
num           0
dtype: int64


In [318]:
# 5. define function to impute missing values in numerical columns

def impute_num_cols(df, passed_col):
    """Fill the missing values of a numeric column in place.

    A random-forest regressor is trained on the rows where ``passed_col`` is
    present, using every other column of ``df`` as a feature, and its
    predictions are written into the rows where ``passed_col`` is missing.

    Args:
        df: DataFrame to modify.
        passed_col: Name of the numeric column to impute.

    Returns:
        None. ``df`` is updated in place.

    Raises:
        ValueError: If every value of ``passed_col`` is missing, so there is
            nothing to learn from.
    """
    # Step 1: Locate the rows that need a prediction
    missing_mask = df[passed_col].isnull()
    if not missing_mask.any():
        # Nothing to impute; predicting on an empty frame would fail.
        return
    if missing_mask.all():
        raise ValueError(
            f"Column {passed_col!r} has no observed values to learn from"
        )

    # Step 2: Build the feature set and encode text-like columns once over ALL
    # rows, with one encoder per column, so training rows and prediction rows
    # share the same codes.
    features = df.drop(columns=[passed_col])
    for col in features.columns:
        if features[col].dtype.name in ('category', 'object'):
            features[col] = LabelEncoder().fit_transform(features[col].astype(str))

    # NOTE(review): missing values in other numeric feature columns are passed
    # to the model unchanged; whether the estimator accepts them depends on the
    # installed scikit-learn version - confirm or impute those columns first.

    # Step 3: Fit a RandomForestRegressor on the rows with a known target
    rf_regressor = RandomForestRegressor(random_state=42)
    rf_regressor.fit(features[~missing_mask], df.loc[~missing_mask, passed_col])

    # Step 4: Predict the missing rows and write the values back
    df.loc[missing_mask, passed_col] = rf_regressor.predict(features[missing_mask])

# Example usage:
# impute_num_cols(df, 'age')


In [319]:
# Show the first rows of the DataFrame
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [320]:
# Impute every numeric column that has missing values, one column at a time
for column in ('trestbps', 'chol', 'thalch', 'oldpeak', 'ca'):
    impute_num_cols(df, column)


# Now check how many missing values remain in each column
print(df.isnull().sum())

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


---
Sir Ammar Function

In [None]:
# define the functions to impute the missing values in a given categorical or continuous column

def impute_categorical_missing_data(passed_col):
    """Predict the missing entries of a categorical column of the global ``df``.

    A random-forest classifier is trained on the rows where ``passed_col`` is
    present (80/20 train/test split, accuracy printed) and then used to
    predict the rows where it is missing. The global ``df`` is not modified;
    assign the returned Series back to the column to apply the imputation.

    Args:
        passed_col: Name of the column in ``df`` to impute.

    Returns:
        The ``passed_col`` Series with missing entries replaced by
        predictions. Rows that were originally present come first; index
        labels are preserved.
    """
    missing_mask = df[passed_col].isnull()

    # Encode text-like features once over ALL rows, with one encoder per
    # column, so training rows and prediction rows share the same codes.
    features = df.drop(passed_col, axis=1)
    for col in features.columns:
        if features[col].dtype.name in ('object', 'category'):
            features[col] = LabelEncoder().fit_transform(features[col].astype(str))

    # Fill the gaps that remain in the other feature columns, one column at a
    # time, fitting on the full column so every row is filled the same way.
    # NOTE(review): the imputer only ever sees a single column here, so it has
    # no other features to learn from - confirm this is intended.
    iterative_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=42), add_indicator=True)
    for col in features.columns:
        if features[col].isnull().sum() > 0:
            col_with_missing_values = features[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            # Column 0 is the imputed data; add_indicator appends a mask column.
            features[col] = imputed_values[:, 0]

    X = features[~missing_mask]
    y = df.loc[~missing_mask, passed_col]

    # Boolean targets are trained as 0/1 and converted back after prediction.
    # Detect them from the data instead of relying on an external column list.
    is_bool_target = pd.api.types.infer_dtype(y.to_numpy(dtype=object), skipna=True) == 'boolean'
    if is_bool_target:
        # astype(int) always maps False -> 0 and True -> 1, even when only one
        # of the two values is present.
        y = y.astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Seeded so repeated runs give the same imputation.
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)

    y_pred = rf_classifier.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)

    print("The feature '"+ passed_col+ "' has been imputed with", round((acc_score * 100), 2), "accuracy\n")

    df_not_null = df[~missing_mask]
    # Work on a copy so predictions are not written into a slice of df.
    df_null = df[missing_mask].copy()

    if len(df_null) > 0:
        df_null[passed_col] = rf_classifier.predict(features[missing_mask])
        if is_bool_target:
            df_null[passed_col] = df_null[passed_col].map({0: False, 1: True})

    df_combined = pd.concat([df_not_null, df_null])

    return df_combined[passed_col]

def impute_continuous_missing_data(passed_col):
    """Predict the missing entries of a continuous column of the global ``df``.

    A random-forest regressor is trained on the rows where ``passed_col`` is
    present (80/20 train/test split; MAE, RMSE and R2 printed) and then used
    to predict the rows where it is missing. The global ``df`` is not
    modified; assign the returned Series back to the column to apply the
    imputation.

    Args:
        passed_col: Name of the column in ``df`` to impute.

    Returns:
        The ``passed_col`` Series with missing entries replaced by
        predictions. Rows that were originally present come first; index
        labels are preserved.
    """
    missing_mask = df[passed_col].isnull()

    # Encode text-like features once over ALL rows, with one encoder per
    # column, so training rows and prediction rows share the same codes.
    features = df.drop(passed_col, axis=1)
    for col in features.columns:
        if features[col].dtype.name in ('object', 'category'):
            features[col] = LabelEncoder().fit_transform(features[col].astype(str))

    # Fill the gaps that remain in the other feature columns, one column at a
    # time, fitting on the full column so every row is filled the same way.
    # NOTE(review): the imputer only ever sees a single column here, so it has
    # no other features to learn from - confirm this is intended.
    iterative_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=42), add_indicator=True)
    for col in features.columns:
        if features[col].isnull().sum() > 0:
            col_with_missing_values = features[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            # Column 0 is the imputed data; add_indicator appends a mask column.
            features[col] = imputed_values[:, 0]

    X = features[~missing_mask]
    y = df.loc[~missing_mask, passed_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Seeded so repeated runs give the same imputation.
    rf_regressor = RandomForestRegressor(random_state=42)
    rf_regressor.fit(X_train, y_train)

    y_pred = rf_regressor.predict(X_test)

    print("MAE =", mean_absolute_error(y_test, y_pred), "\n")
    # Take the square root explicitly instead of passing squared=False, which
    # is not accepted by every scikit-learn version.
    print("RMSE =", np.sqrt(mean_squared_error(y_test, y_pred)), "\n")
    print("R2 =", r2_score(y_test, y_pred), "\n")

    df_not_null = df[~missing_mask]
    # Work on a copy so predictions are not written into a slice of df.
    df_null = df[missing_mask].copy()

    if len(df_null) > 0:
        df_null[passed_col] = rf_regressor.predict(features[missing_mask])

    df_combined = pd.concat([df_not_null, df_null])

    return df_combined[passed_col]