In [None]:
# Activity 1: Handling Missing Data

# Task A: Dropping vs Imputation

# 1. Dropping Missing Data:
# - Load a dataset (e.g., a CSV file with some missing values, such as employees.csv).
# - Inspect the dataset for missing values using a Python library (e.g., Pandas).
# - Drop rows with missing data and save the result.






# 2. Imputation using Mean:
# - Use the same dataset.
# - Fill missing numerical values with the column mean.
# - Save and display the modified data.









# 3. Imputation using Median and Mode:
# - For numerical columns, replace missing values with the median.
# - For categorical columns, use the mode.
# - Display the updated dataset.







In [None]:
# Task B: Predictive Imputation

# 4. ML-based Imputation with Simple Imputer:
# - Use SimpleImputer from sklearn to fill missing values.
# - Choose a strategy (e.g., mean) and apply it to the dataset.





# 5. Imputation using a Regression Model:
# - Use a regression model to predict missing values.
# - Train the model on complete cases and fill the missing data.




# 6. K-Nearest Neighbors Imputation:
# - Use KNNImputer from sklearn.
# - Impute missing data based on neighbors' information.






In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np

def perform_predictive_imputation(df, numeric_cols, n_neighbors=5):
    """
    Impute missing values in the given numeric columns with three strategies.

    Each strategy works on its own copy of ``df``; the input frame is never
    modified. Progress messages are printed for every column.

    Strategies:
        4. ``SimpleImputer(strategy='mean')`` applied column by column.
        5. Linear regression: each column with gaps is predicted from the other
           numeric columns (gaps in those feature columns are mean-filled first).
           Columns are processed in order, so later columns see the values
           already imputed for earlier ones.
        6. ``KNNImputer`` fitted jointly on all numeric columns, so neighbours
           are found using the other columns' values.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numeric_cols (list): A list of numeric column names to impute.
        n_neighbors (int): Number of neighbours used by the KNN imputer.
            Defaults to 5.

    Returns:
        tuple: ``(df_imputed, df_regression_imputed, df_knn_imputed)`` - three
        DataFrames holding the SimpleImputer, regression and KNN results.
    """
    df_imputed = df.copy()

    print("\n--- 4. ML-based Imputation with SimpleImputer ---")
    for col in numeric_cols:
        if not df_imputed[col].isnull().any():
            print(f"No missing values in '{col}', SimpleImputer (mean) not applied.")
            continue
        if not df_imputed[col].notnull().any():
            # The imputer drops columns without observed values, which would
            # break the write-back below.
            print(f"Cannot impute '{col}' using mean as it has no observed values.")
            continue
        imputer_mean = SimpleImputer(strategy='mean')
        df_imputed[[col]] = imputer_mean.fit_transform(df_imputed[[col]])
        print(f"Missing values in '{col}' imputed using mean.")

    df_regression_imputed = df.copy()
    print("\n--- 5. Imputation using a Regression Model ---")
    for col_to_impute in numeric_cols:
        missing_mask = df_regression_imputed[col_to_impute].isnull()
        if not missing_mask.any():
            print(f"No missing values in '{col_to_impute}', Regression Imputation not applied.")
            continue

        features = [col for col in numeric_cols if col != col_to_impute]
        if not features:
            print(f"Cannot perform regression imputation for '{col_to_impute}' as there are no other numeric columns as features.")
            continue

        # Rows where the target is known train the model; the rest get predicted.
        train_data = df_regression_imputed[~missing_mask]
        test_data = df_regression_imputed[missing_mask]

        if train_data.empty or test_data.empty:
            print(f"Not enough complete or missing data in '{col_to_impute}' for regression imputation.")
            continue

        # A feature that is entirely missing in the training rows would be
        # dropped by the mean imputer and misalign the column labels below.
        features = [col for col in features if train_data[col].notnull().any()]
        if not features:
            print(f"Cannot perform regression imputation for '{col_to_impute}' as no feature column has observed values in the training rows.")
            continue

        # Mean-fill gaps in the feature columns; fit on the training rows only
        # and reuse the fitted imputer for the rows to be predicted.
        feature_imputer = SimpleImputer(strategy='mean')
        X_train = pd.DataFrame(
            feature_imputer.fit_transform(train_data[features]),
            columns=features,
            index=train_data.index,
        )
        y_train = train_data[col_to_impute]
        X_test = pd.DataFrame(
            feature_imputer.transform(test_data[features]),
            columns=features,
            index=test_data.index,
        )

        model = LinearRegression()
        model.fit(X_train, y_train)
        predicted_values = model.predict(X_test)

        # Predictions are in the same row order as the masked rows.
        df_regression_imputed.loc[missing_mask, col_to_impute] = predicted_values
        print(f"Missing values in '{col_to_impute}' imputed using Linear Regression.")

    df_knn_imputed = df.copy()
    print("\n--- 6. K-Nearest Neighbors Imputation ---")
    # Fit KNN on all usable numeric columns at once: fitting on a single column
    # leaves no observed feature to measure distance on for the rows that need
    # imputing, so the result would collapse to the column mean.
    knn_cols = [col for col in numeric_cols if df_knn_imputed[col].notnull().any()]
    cols_with_gaps = [col for col in knn_cols if df_knn_imputed[col].isnull().any()]
    if cols_with_gaps:
        knn_imputer = KNNImputer(n_neighbors=n_neighbors)
        knn_values = knn_imputer.fit_transform(df_knn_imputed[knn_cols])
        # Write back only the columns that had gaps so complete columns keep
        # their original dtype.
        for col in cols_with_gaps:
            df_knn_imputed[col] = knn_values[:, knn_cols.index(col)]

    for col in numeric_cols:
        if col in cols_with_gaps:
            print(f"Missing values in '{col}' imputed using KNN (n_neighbors={n_neighbors}).")
        elif col not in knn_cols:
            print(f"Cannot impute '{col}' using KNN as it has no observed values.")
        else:
            print(f"No missing values in '{col}', KNNImputer not applied.")

    return df_imputed, df_regression_imputed, df_knn_imputed

# Demo: a small frame with gaps in every numeric column, run through all three
# imputation strategies so their results can be compared side by side.
data = {
    'col1': [1, 2, np.nan, 4, np.nan],
    'col2': [np.nan, 7, np.nan, 9, np.nan],
    'col3': ['a', 'b', 'c', 'd', 'e'],
    'col4': [10.5, np.nan, 12.3, 14.7, 15.0],
}
df = pd.DataFrame(data)

# Only the numeric columns are candidates for imputation ('col3' is text).
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()

(
    df_simple_imputed,
    df_regression_imputed,
    df_knn_imputed,
) = perform_predictive_imputation(df.copy(), numeric_columns)

print("\n--- Imputed DataFrames ---")
for heading, frame in (
    ("\nDataFrame after SimpleImputer:", df_simple_imputed),
    ("\nDataFrame after Regression Imputation:", df_regression_imputed),
    ("\nDataFrame after KNN Imputation:", df_knn_imputed),
):
    print(heading)
    print(frame)


--- 4. ML-based Imputation with SimpleImputer ---
Missing values in 'col1' imputed using mean.
Missing values in 'col2' imputed using mean.
Missing values in 'col4' imputed using mean.

--- 5. Imputation using a Regression Model ---
Missing values in 'col1' imputed using Linear Regression.
Missing values in 'col2' imputed using Linear Regression.
Missing values in 'col4' imputed using Linear Regression.

--- 6. K-Nearest Neighbors Imputation ---
Missing values in 'col1' imputed using KNN (n_neighbors=5).
Missing values in 'col2' imputed using KNN (n_neighbors=5).
Missing values in 'col4' imputed using KNN (n_neighbors=5).

--- Imputed DataFrames ---

DataFrame after SimpleImputer:
       col1  col2 col3    col4
0  1.000000   8.0    a  10.500
1  2.000000   7.0    b  13.125
2  2.333333   8.0    c  12.300
3  4.000000   9.0    d  14.700
4  2.333333   8.0    e  15.000

DataFrame after Regression Imputation:
       col1      col2 col3       col4
0  1.000000  6.000000    a  10.500000
1  2.00