In [None]:
  #Automate missing value imputation techniques

In [10]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Sample dataset with missing values
data = {
    'Age': [25, 30, np.nan, 35, 40, np.nan, 50, 60, 65, np.nan],
    'Salary': [50000, 54000, 57000, np.nan, 65000, 69000, np.nan, 80000, 85000, np.nan],
    'Gender': ['Male', 'Female', 'Male', np.nan, 'Female', 'Female', np.nan, 'Male', 'Male', 'Female']
}
df = pd.DataFrame(data)

In [12]:
# Display the initial dataset
print("Initial Dataset with Missing Values:")
display(df)

Initial Dataset with Missing Values:


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,,57000.0,Male
3,35.0,,
4,40.0,65000.0,Female
5,,69000.0,Female
6,50.0,,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,,,Female


In [13]:
# Function to calculate evaluation metrics
def evaluate_imputation(true_values, imputed_values):
    mae = mean_absolute_error(true_values, imputed_values)
    rmse = mean_squared_error(true_values, imputed_values, squared=False)
    return mae, rmse


In [14]:
# Function to calculate evaluation metrics
def evaluate_imputation(true_values, imputed_values):
    mae = mean_absolute_error(true_values, imputed_values)
    rmse = mean_squared_error(true_values, imputed_values, squared=False)
    return mae, rmse

# Define the advanced imputation pipeline
class AdvancedImputationPipeline:
    def __init__(self, df):
        self.df = df.copy()
        self.imputed_dfs = {}
    
    def impute_simple(self, column, strategy='mean'):
        imputer = SimpleImputer(strategy=strategy)
        self.df[column] = imputer.fit_transform(self.df[[column]])
        return self.df
    
    def impute_knn(self, column, n_neighbors=3):
        imputer = KNNImputer(n_neighbors=n_neighbors)
        self.df[column] = imputer.fit_transform(self.df[[column]])
        return self.df
    
    def impute_iterative(self, column):
        imputer = IterativeImputer(max_iter=10, random_state=0)
        self.df[column] = imputer.fit_transform(self.df[[column]])
        return self.df
    
    def impute_random_forest(self, column):
        temp_df = self.df.dropna(subset=[column]).copy()  # Only rows without missing values in the target column
        X = temp_df.drop(columns=[column])
        y = temp_df[column]
        
        rf = RandomForestRegressor(random_state=0)
        rf.fit(X, y)
        
        missing_df = self.df[self.df[column].isna()]
        if not missing_df.empty:
            X_missing = missing_df.drop(columns=[column])
            imputed_values = rf.predict(X_missing)
            self.df.loc[self.df[column].isna(), column] = imputed_values
        return self.df
    
    def run_imputation(self):
        # Apply Simple Imputer (Mean, Median, Mode)
        for strategy in ['mean', 'median', 'most_frequent']:
            temp_df = self.df.copy()
            for column in temp_df.select_dtypes(include=['float64', 'int64']).columns:
                temp_df = self.impute_simple(column, strategy)
            self.imputed_dfs[f'simple_{strategy}'] = temp_df
        
        # Apply KNN Imputer
        temp_df = self.df.copy()
        for column in temp_df.select_dtypes(include=['float64', 'int64']).columns:
            temp_df = self.impute_knn(column)
        self.imputed_dfs['knn'] = temp_df
        
        # Apply Iterative Imputer
        temp_df = self.df.copy()
        for column in temp_df.select_dtypes(include=['float64', 'int64']).columns:
            temp_df = self.impute_iterative(column)
        self.imputed_dfs['iterative'] = temp_df
        
        # Apply Random Forest Imputer for numeric columns
        temp_df = self.df.copy()
        for column in temp_df.select_dtypes(include=['float64', 'int64']).columns:
            temp_df = self.impute_random_forest(column)
        self.imputed_dfs['random_forest'] = temp_df
        
        return self.imputed_dfs

In [17]:
# Instantiate the pipeline and run imputation
pipeline = AdvancedImputationPipeline(df)
imputed_dfs = pipeline.run_imputation()

# Evaluate the results for numeric columns
print("\nEvaluation Metrics for Numeric Imputation:")
for method, imputed_df in imputed_dfs.items():
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        true_values = df[column].dropna()
        imputed_values = imputed_df[column].loc[true_values.index]
        mae, rmse = evaluate_imputation(true_values, imputed_values)
        print(f"Method: {method}, Column: {column}, MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# Display the imputed datasets
print("\nImputed Datasets:")
for method, imputed_df in imputed_dfs.items():
    print(f"\nMethod: {method}")
    display(imputed_df)



Evaluation Metrics for Numeric Imputation:
Method: simple_mean, Column: Age, MAE: 0.00, RMSE: 0.00
Method: simple_mean, Column: Salary, MAE: 0.00, RMSE: 0.00
Method: simple_median, Column: Age, MAE: 0.00, RMSE: 0.00
Method: simple_median, Column: Salary, MAE: 0.00, RMSE: 0.00
Method: simple_most_frequent, Column: Age, MAE: 0.00, RMSE: 0.00
Method: simple_most_frequent, Column: Salary, MAE: 0.00, RMSE: 0.00
Method: knn, Column: Age, MAE: 0.00, RMSE: 0.00
Method: knn, Column: Salary, MAE: 0.00, RMSE: 0.00
Method: iterative, Column: Age, MAE: 0.00, RMSE: 0.00
Method: iterative, Column: Salary, MAE: 0.00, RMSE: 0.00
Method: random_forest, Column: Age, MAE: 0.00, RMSE: 0.00
Method: random_forest, Column: Salary, MAE: 0.00, RMSE: 0.00

Imputed Datasets:

Method: simple_mean


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: simple_median


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: simple_most_frequent


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: knn


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: iterative


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: random_forest


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female


In [16]:
from sklearn.preprocessing import OneHotEncoder

class AdvancedImputationPipeline:
    def __init__(self, df):
        self.df = df.copy()
        self.imputed_dfs = {}
    
    def impute_simple(self, column, strategy='mean'):
        imputer = SimpleImputer(strategy=strategy)
        self.df[column] = imputer.fit_transform(self.df[[column]])
        return self.df
    
    def impute_knn(self, column, n_neighbors=3):
        imputer = KNNImputer(n_neighbors=n_neighbors)
        self.df[column] = imputer.fit_transform(self.df[[column]])
        return self.df
    
    def impute_iterative(self, column):
        imputer = IterativeImputer(max_iter=10, random_state=0)
        self.df[column] = imputer.fit_transform(self.df[[column]])
        return self.df
    
    def impute_random_forest(self, column):
        # One-Hot Encode categorical features
        temp_df = pd.get_dummies(self.df, drop_first=True)
        
        # Only keep rows without missing values in the target column
        non_missing_df = temp_df.dropna(subset=[column])
        X = non_missing_df.drop(columns=[column])
        y = non_missing_df[column]
        
        # Train Random Forest Regressor
        rf = RandomForestRegressor(random_state=0)
        rf.fit(X, y)
        
        # Predict and fill missing values
        missing_df = temp_df[temp_df[column].isna()]
        if not missing_df.empty:
            X_missing = missing_df.drop(columns=[column])
            imputed_values = rf.predict(X_missing)
            self.df.loc[self.df[column].isna(), column] = imputed_values
        return self.df
    
    def run_imputation(self):
        # Apply Simple Imputer (Mean, Median, Mode)
        for strategy in ['mean', 'median', 'most_frequent']:
            temp_df = self.df.copy()
            for column in temp_df.select_dtypes(include=['float64', 'int64']).columns:
                temp_df = self.impute_simple(column, strategy)
            self.imputed_dfs[f'simple_{strategy}'] = temp_df
        
        # Apply KNN Imputer
        temp_df = self.df.copy()
        for column in temp_df.select_dtypes(include=['float64', 'int64']).columns:
            temp_df = self.impute_knn(column)
        self.imputed_dfs['knn'] = temp_df
        
        # Apply Iterative Imputer
        temp_df = self.df.copy()
        for column in temp_df.select_dtypes(include=['float64', 'int64']).columns:
            temp_df = self.impute_iterative(column)
        self.imputed_dfs['iterative'] = temp_df
        
        # Apply Random Forest Imputer for numeric columns
        temp_df = self.df.copy()
        for column in temp_df.select_dtypes(include=['float64', 'int64']).columns:
            temp_df = self.impute_random_forest(column)
        self.imputed_dfs['random_forest'] = temp_df
        
        return self.imputed_dfs

# Instantiate the pipeline and run imputation
pipeline = AdvancedImputationPipeline(df)
imputed_dfs = pipeline.run_imputation()

# Display the imputed datasets
print("\nImputed Datasets:")
for method, imputed_df in imputed_dfs.items():
    print(f"\nMethod: {method}")
    display(imputed_df)



Imputed Datasets:

Method: simple_mean


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: simple_median


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: simple_most_frequent


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: knn


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: iterative


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female



Method: random_forest


Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,54000.0,Female
2,43.571429,57000.0,Male
3,35.0,65714.285714,
4,40.0,65000.0,Female
5,43.571429,69000.0,Female
6,50.0,65714.285714,
7,60.0,80000.0,Male
8,65.0,85000.0,Male
9,43.571429,65714.285714,Female
