# Install Libraries

In [1]:
pip install pandas seaborn matplotlib scikit-learn numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd

# Step 1: Build a small demo table; np.nan marks every missing entry
data = dict(
    Age=[25, np.nan, 30, 22, np.nan, 35, 40],
    Income=[5000, 6000, np.nan, 4500, 5500, np.nan, 8000],
    City=['Tehran', 'Shiraz', 'Isfahan', np.nan, 'Tabriz', 'Tehran', 'Mashhad'],
)
df = pd.DataFrame(data)

numeric_cols = ['Age', 'Income']

# Step 2: Record which numeric cells were missing (1 = missing, 0 = present).
# These flags are derived from the untouched columns, i.e. before any filling.
missing_flags = {
    f'{col}_Missing_Flag': df[col].isna().astype(int)
    for col in numeric_cols
}

# Step 3: Prepare median-filled versions of the numeric columns (optional step)
median_filled = {
    col: df[col].fillna(df[col].median())
    for col in numeric_cols
}

# Step 4: Apply everything in one pass:
#   - 'City' gets an explicit 'Missing' category in place of NaN,
#   - the flag columns are appended after the existing columns,
#   - 'Age' and 'Income' are replaced in place by their filled versions.
df = df.assign(
    City=df['City'].fillna('Missing'),
    **missing_flags,
    **median_filled,
)

# Print the final dataset
print(df)


    Age  Income     City  Age_Missing_Flag  Income_Missing_Flag
0  25.0  5000.0   Tehran                 0                    0
1  30.0  6000.0   Shiraz                 1                    0
2  30.0  5500.0  Isfahan                 0                    1
3  22.0  4500.0  Missing                 0                    0
4  30.0  5500.0   Tabriz                 1                    0
5  35.0  5500.0   Tehran                 0                    1
6  40.0  8000.0  Mashhad                 0                    0


In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # Required for IterativeImputer
from sklearn.impute import IterativeImputer

# Step 1: Assemble an all-numeric demo table; np.nan marks every missing entry
data = dict(
    Age=[25, np.nan, 30, 22, np.nan, 35, 40],
    Income=[5000, 6000, np.nan, 4500, 5500, np.nan, 8000],
    Spending_Score=[65, 80, 70, np.nan, 60, 75, np.nan],
)
df = pd.DataFrame(data)

# Step 2: KNN imputation using the 2 nearest neighbours.
# fit_transform's result is wrapped back into a DataFrame that reuses the
# original column labels.
knn_imputer = KNNImputer(n_neighbors=2)
knn_values = knn_imputer.fit_transform(df)
df_knn_imputed = pd.DataFrame(knn_values, columns=df.columns)

# Step 3: Multivariate (iterative, regression-based) imputation.
# random_state is fixed so repeated runs use the same seed.
iter_imputer = IterativeImputer(max_iter=10, random_state=42)
iter_values = iter_imputer.fit_transform(df)
df_iter_imputed = pd.DataFrame(iter_values, columns=df.columns)

# Step 4: Print the original frame followed by both imputed versions
report_sections = (
    ("Original DataFrame with Missing Values:\n", df),
    ("\nKNN Imputed DataFrame:\n", df_knn_imputed),
    ("\nMultivariate (Iterative) Imputed DataFrame:\n", df_iter_imputed),
)
for heading, frame in report_sections:
    print(heading, frame)


Original DataFrame with Missing Values:
     Age  Income  Spending_Score
0  25.0  5000.0            65.0
1   NaN  6000.0            80.0
2  30.0     NaN            70.0
3  22.0  4500.0             NaN
4   NaN  5500.0            60.0
5  35.0     NaN            75.0
6  40.0  8000.0             NaN

KNN Imputed DataFrame:
     Age  Income  Spending_Score
0  25.0  5000.0            65.0
1  32.5  6000.0            80.0
2  30.0  4750.0            70.0
3  22.0  4500.0            72.5
4  32.5  5500.0            60.0
5  35.0  7000.0            75.0
6  40.0  8000.0            72.5

Multivariate (Iterative) Imputed DataFrame:
          Age       Income  Spending_Score
0  25.000000  5000.000000       65.000000
1  29.854751  6000.000000       80.000000
2  30.000000  6028.496285       70.000000
3  22.000000  4500.000000       63.519291
4  27.310712  5500.000000       60.000000
5  35.000000  7009.262620       75.000000
6  40.000000  8000.000000       79.634126
