<a href="https://colab.research.google.com/github/shubham-madhukar/shubham/blob/main/data_imputation_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mean/Median/Mode Imputation


In [1]:
import pandas as pd
import numpy as np

# Toy patient table: two of the five blood-pressure readings are missing.
data = {
    'PatientID': [1, 2, 3, 4, 5],
    'BloodPressure': [120, 130, np.nan, 140, np.nan]
}

# Build the frame and add one imputed copy of BloodPressure per statistic.
# Each lambda receives the frame, so the statistic is always computed from the
# original (un-imputed) BloodPressure column.
df = pd.DataFrame(data).assign(
    # Gaps replaced by the arithmetic mean of the observed readings
    BloodPressure_mean=lambda frame: frame['BloodPressure'].fillna(
        frame['BloodPressure'].mean()
    ),
    # Gaps replaced by the median of the observed readings
    BloodPressure_median=lambda frame: frame['BloodPressure'].fillna(
        frame['BloodPressure'].median()
    ),
    # Gaps replaced by the first value returned by mode()
    BloodPressure_mode=lambda frame: frame['BloodPressure'].fillna(
        frame['BloodPressure'].mode()[0]
    ),
)

print(df)


   PatientID  BloodPressure  BloodPressure_mean  BloodPressure_median  \
0          1          120.0               120.0                 120.0   
1          2          130.0               130.0                 130.0   
2          3            NaN               130.0                 130.0   
3          4          140.0               140.0                 140.0   
4          5            NaN               130.0                 130.0   

   BloodPressure_mode  
0               120.0  
1               130.0  
2               120.0  
3               140.0  
4               120.0  


 Forward/Backward Fill Imputation


In [2]:
# Five consecutive daily readings; the 2nd and 4th temperatures are missing.
data = {
    'Date': pd.date_range(start='2023-01-01', periods=5, freq='D'),
    'Temperature': [30, np.nan, 25, np.nan, 28]
}

# Add two filled copies of Temperature next to the original column.
df = pd.DataFrame(data).assign(
    # Forward fill: carry the last observed value down into each gap
    Temperature_ffill=lambda frame: frame['Temperature'].ffill(),
    # Backward fill: pull the next observed value up into each gap
    Temperature_bfill=lambda frame: frame['Temperature'].bfill(),
)

print(df)


        Date  Temperature  Temperature_ffill  Temperature_bfill
0 2023-01-01         30.0               30.0               30.0
1 2023-01-02          NaN               30.0               25.0
2 2023-01-03         25.0               25.0               25.0
3 2023-01-04          NaN               25.0               28.0
4 2023-01-05         28.0               28.0               28.0


K-Nearest Neighbors (KNN) Imputation

In [3]:
from sklearn.impute import KNNImputer

# Sample DataFrame with one missing value in each feature column
data = {
    'Feature1': [1, 2, np.nan, 4, 5],
    'Feature2': [5, 4, 3, np.nan, 1],
    'Feature3': [2, 3, 4, 5, np.nan]
}
df = pd.DataFrame(data)

# KNN Imputer configured to use the 2 nearest neighbours (n_neighbors=2)
imputer = KNNImputer(n_neighbors=2)

# fit_transform hands back a plain array without labels, so restore BOTH the
# column names and the row index. Passing index=df.index keeps the imputed rows
# aligned with the source rows even when df does not use the default 0..n-1 index.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

print(df_imputed)


   Feature1  Feature2  Feature3
0       1.0       5.0       2.0
1       2.0       4.0       3.0
2       3.0       3.0       4.0
3       4.0       2.0       5.0
4       5.0       1.0       4.5


Multivariate Imputation by Chained Equations (MICE)

In [5]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Sample DataFrame with one missing value in each feature column
data = {
    'Feature1': [1, 2, np.nan, 4, 5],
    'Feature2': [5, 4, 3, np.nan, 1],
    'Feature3': [2, 3, 4, 5, np.nan]
}
df = pd.DataFrame(data)

# MICE Imputer: at most 10 imputation rounds, seeded so reruns give the same result
imputer = IterativeImputer(max_iter=10, random_state=0)

# fit_transform hands back a plain array without labels, so restore BOTH the
# column names and the row index. Passing index=df.index keeps the imputed rows
# aligned with the source rows even when df does not use the default 0..n-1 index.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

print(df_imputed)

   Feature1  Feature2  Feature3
0  1.000000  5.000000  2.000000
1  2.000000  4.000000  3.000000
2  3.000052  3.000000  4.000000
3  4.000000  2.000122  5.000000
4  5.000000  1.000000  6.000047


Predictive Modeling Imputation

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Sample DataFrame with one missing value in Feature2
data = {
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [5, 4, np.nan, 2, 1],
    'Target': [10, 20, 30, 40, 50]
}
df = pd.DataFrame(data)

# Columns the regressor uses to predict Feature2.
# NOTE(review): 'Target' is used as a predictor here; if Target is the label of a
# downstream model, imputing from it leaks label information -- confirm intent.
predictors = ['Feature1', 'Target']

# Select the ROWS whose Feature2 is missing. The mask is computed once so the
# rows we predict for and the rows we write back to are guaranteed to match.
missing_mask = df['Feature2'].isnull()
missing_data = df[missing_mask]

# Train a model on the fully observed rows only
train_data = df.dropna()
X_train = train_data[predictors]
y_train = train_data['Feature2']
# Fixed seed: a random forest is stochastic, so without it the imputed value
# changes from run to run and the saved output cannot be reproduced.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

# Predict the missing values and write them back in place.
# Guarded because predict() raises on an input with zero rows.
X_missing = missing_data[predictors]
if not X_missing.empty:
    df.loc[missing_mask, 'Feature2'] = model.predict(X_missing)

print(df)


   Feature1  Feature2  Target
0         1      5.00      10
1         2      4.00      20
2         3      3.52      30
3         4      2.00      40
4         5      1.00      50
