In [None]:
# Question: Predictive Imputation Using Machine Learning
# Description: Use a simple predictive model to impute missing values in a column.



In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression

def predictive_imputation(df, column_to_impute, features):
    """
    Imputes missing values in a specified column using a predictive model.

    A LinearRegression model is fitted on the rows where the target column
    and every feature column are present. It is then used to predict the
    target for the rows where the target is missing. Missing feature values
    in the rows being predicted are filled with the per-feature mean computed
    over the rows whose target is present.

    Args:
        df (pd.DataFrame): The input DataFrame. It is never modified.
        column_to_impute (str): The name of the column with missing values.
        features (list): A list of column names to use as features for prediction.

    Returns:
        pd.DataFrame: A new DataFrame with missing values in the specified
                      column imputed. When no usable training rows exist, a
                      warning is printed and an unmodified copy of the input
                      is returned, so the result is always independent of
                      the caller's DataFrame.
    """
    df_imputed = df.copy()

    # Compute the mask once; it drives both the train/predict split and the
    # final assignment, so the two can never disagree.
    missing_mask = df_imputed[column_to_impute].isnull()
    missing_target = df_imputed[missing_mask]
    complete_target = df_imputed[~missing_mask]

    if complete_target.empty:
        print(f"Warning: No complete data available to train the model for '{column_to_impute}'. Returning original DataFrame.")
        # Return the copy (not `df`) so every code path hands back a new object.
        return df_imputed

    # Only rows with every feature present can be used for training
    complete_data_for_model = complete_target.dropna(subset=features)

    if complete_data_for_model.empty:
        print(f"Warning: No complete data in the features for '{column_to_impute}'. Returning original DataFrame.")
        return df_imputed

    # Nothing to impute: skip fitting a model whose predictions would be unused
    if missing_target.empty:
        return df_imputed

    # Prepare features and target variable for training using only complete rows
    X_train = complete_data_for_model[features]
    y_train = complete_data_for_model[column_to_impute]

    # Initialize and train a simple predictive model (Linear Regression for numeric data)
    model = LinearRegression()
    model.fit(X_train, y_train)

    # The model cannot accept NaNs, so fill feature gaps in the rows to be
    # predicted with the feature means of the rows whose target is present.
    X_predict = missing_target[features].fillna(complete_target[features].mean())

    # Write the predictions back into exactly the rows that were missing
    df_imputed.loc[missing_mask, column_to_impute] = model.predict(X_predict)

    return df_imputed

if __name__ == '__main__':
    # Small demo frame with gaps scattered through both features and the target
    data = {
        'Feature1': [1, 2, None, 4, 5, None, 7, 8],
        'Feature2': [2.1, 3.5, None, 5.2, 6.8, 7.1, None, 9.3],
        'TargetColumn': [10, 20, 30, None, 50, 60, None, 80],
    }
    df = pd.DataFrame(data)
    print(
        "Original DataFrame:",
        df,
        "\nMissing values before imputation:",
        df.isnull().sum(),
        sep="\n",
    )

    # First demo: fill the gaps in the target from the two feature columns
    column_to_impute = 'TargetColumn'
    features = ['Feature1', 'Feature2']
    df_imputed = predictive_imputation(df.copy(), column_to_impute, features)
    print(
        "\nDataFrame after predictive imputation of 'TargetColumn':",
        df_imputed,
        "\nMissing values after imputation of 'TargetColumn':",
        df_imputed.isnull().sum(),
        sep="\n",
    )

    # Second demo: swap roles and impute a feature column instead
    column_to_impute_2 = 'Feature2'
    features_2 = ['Feature1', 'TargetColumn']
    df_imputed_2 = predictive_imputation(df.copy(), column_to_impute_2, features_2)
    print(
        f"\nDataFrame after predictive imputation of '{column_to_impute_2}':",
        df_imputed_2,
        f"\nMissing values after imputation of '{column_to_impute_2}':",
        df_imputed_2.isnull().sum(),
        sep="\n",
    )

Original DataFrame:
   Feature1  Feature2  TargetColumn
0       1.0       2.1          10.0
1       2.0       3.5          20.0
2       NaN       NaN          30.0
3       4.0       5.2           NaN
4       5.0       6.8          50.0
5       NaN       7.1          60.0
6       7.0       NaN           NaN
7       8.0       9.3          80.0

Missing values before imputation:
Feature1        2
Feature2        2
TargetColumn    2
dtype: int64

DataFrame after predictive imputation of 'TargetColumn':
   Feature1  Feature2  TargetColumn
0       1.0       2.1          10.0
1       2.0       3.5          20.0
2       NaN       NaN          30.0
3       4.0       5.2          40.0
4       5.0       6.8          50.0
5       NaN       7.1          60.0
6       7.0       NaN          70.0
7       8.0       9.3          80.0

Missing values after imputation of 'TargetColumn':
Feature1        2
Feature2        2
TargetColumn    0
dtype: int64

DataFrame after predictive imputation of 'Feature2':