In [1]:
# Question: Predictive Imputation Using Machine Learning
# Description: Use a simple predictive model to impute missing values in a column.



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def predictive_imputation(df, column_to_impute, features):
    """
    Imputes missing values in a specified column using a Linear Regression
    model fitted on other columns of the DataFrame.

    Only rows whose feature values are all present take part in modelling:
    the model is trained on rows where the target and every feature are
    observed, and predictions are written only into rows where the target is
    missing and every feature is observed. Rows missing both the target and at
    least one feature are left as NaN, because LinearRegression does not
    accept NaN inputs.

    Status messages are printed to stdout. When there are more training rows
    than rows with a missing target, a second model is fitted on an 80/20
    split of the training rows and its mean squared error is printed as well.

    Args:
        df (pd.DataFrame): The DataFrame containing the missing values.
        column_to_impute (str): The name of the column with missing values to impute.
        features (list): A list of column names to use as features for prediction.

    Returns:
        pd.DataFrame: A copy of ``df`` with the imputable missing values in
        ``column_to_impute`` filled in. The copy is returned unchanged when
        there is nothing to impute, when ``features`` is empty or names a
        column that does not exist, or when no row is usable for training or
        for prediction. The input DataFrame is never modified.

    Raises:
        KeyError: If ``column_to_impute`` is not a column of ``df``.
    """

    # Work on a copy so the caller's DataFrame is never modified
    df_imputed = df.copy()

    # Boolean mask of the rows whose target value needs to be filled in
    target_missing = df_imputed[column_to_impute].isnull()

    if not target_missing.any():
        print(f"No missing values found in '{column_to_impute}'. No imputation needed.")
        return df_imputed

    not_enough_message = (
        f"Not enough data or missing features to train the imputation model for '{column_to_impute}'."
    )

    # An empty feature list would pass the membership check vacuously and then
    # fail inside the model, so it is rejected together with unknown columns.
    features = list(features)
    features_available = len(features) > 0 and all(
        feature in df_imputed.columns for feature in features
    )
    if target_missing.all() or not features_available:
        print(not_enough_message)
        return df_imputed

    # LinearRegression rejects NaN inputs, so only rows with every feature
    # present can be used for either fitting or predicting.
    features_complete = df_imputed[features].notnull().all(axis=1)
    train_mask = ~target_missing & features_complete
    fill_mask = target_missing & features_complete

    if not train_mask.any():
        print(not_enough_message)
        return df_imputed

    if not fill_mask.any():
        # Also reached when the target column itself is listed in `features`,
        # since every row to impute then has a NaN feature.
        print(f"No rows with missing '{column_to_impute}' have complete feature values. No imputation performed.")
        return df_imputed

    # Prepare features (X) and target (y) for training
    X_train = df_imputed.loc[train_mask, features]
    y_train = df_imputed.loc[train_mask, column_to_impute]
    X_predict = df_imputed.loc[fill_mask, features]

    # Train a simple Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict the missing values
    predicted_values = model.predict(X_predict)

    # Assign through the boolean mask (not index labels) so the write stays
    # positional even when the DataFrame has duplicate index labels.
    df_imputed.loc[fill_mask, column_to_impute] = predicted_values

    print(f"Missing values in '{column_to_impute}' have been imputed using a Linear Regression model.")

    skipped_rows = int((target_missing & ~features_complete).sum())
    if skipped_rows:
        print(f"{skipped_rows} row(s) with missing '{column_to_impute}' were left unimputed because of missing feature values.")

    # (Optional) Evaluate the model on a held-out set if you have enough data.
    # Having more training rows than missing rows guarantees at least two
    # samples, which train_test_split needs to form both partitions.
    if len(X_train) > int(target_missing.sum()):
        X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )
        model_eval = LinearRegression()
        model_eval.fit(X_train_eval, y_train_eval)
        y_pred_eval = model_eval.predict(X_test_eval)
        mse = mean_squared_error(y_test_eval, y_pred_eval)
        print(f"(Optional) Mean Squared Error of the imputation model: {mse:.2f}")

    return df_imputed

if __name__ == '__main__':
    # Demo: a small frame in which 'Age' has two gaps and the other columns are complete.
    data = {
        'Age': [25, 30, None, 40, None, 35, 28],
        'Income': [50000, 60000, 75000, 80000, 65000, 70000, 55000],
        'EducationLevel': [12, 16, 14, 18, 12, 16, 15],
        'Spending': [2000, 2500, 3000, 3500, 2800, 3200, 2200],
    }
    df = pd.DataFrame(data)

    # Target column and the predictor columns used to estimate it.
    column_to_impute = 'Age'
    features_to_use = ['Income', 'EducationLevel', 'Spending']

    # A copy is handed over, so `df` keeps its NaNs for the comparison below.
    df_imputed = predictive_imputation(df.copy(), column_to_impute, features_to_use)

    # Print the frame before and after imputation, each under its own heading.
    print("\nOriginal DataFrame:", df, sep="\n")
    print("\nDataFrame with Imputed Values:", df_imputed, sep="\n")

Missing values in 'Age' have been imputed using a Linear Regression model.
(Optional) Mean Squared Error of the imputation model: 0.25

Original DataFrame:
    Age  Income  EducationLevel  Spending
0  25.0   50000              12      2000
1  30.0   60000              16      2500
2   NaN   75000              14      3000
3  40.0   80000              18      3500
4   NaN   65000              12      2800
5  35.0   70000              16      3200
6  28.0   55000              15      2200

DataFrame with Imputed Values:
         Age  Income  EducationLevel  Spending
0  25.000000   50000              12      2000
1  30.000000   60000              16      2500
2  37.313725   75000              14      3000
3  40.000000   80000              18      3500
4  32.215686   65000              12      2800
5  35.000000   70000              16      3200
6  28.000000   55000              15      2200
