# Identification and Simple Removal (Pandas)

##  Check for Missing Values

In [18]:
import pandas as pd
import numpy as np

data = {'A': [10.56, 24.23, np.nan, 45.36],
        'B': [104.50, np.nan, 304.25, 414.25],
        'C': [100, 200, 300, 400]}
df = pd.DataFrame(data)



In [19]:
# Check total missing values per column
print("Total Missing Values per Column:")
print(df.isnull().sum())

# Check percentage of missing values per column
print("\nPercentage Missing per Column:")
print((df.isnull().sum() / len(df)) * 100)

Total Missing Values per Column:
A    1
B    1
C    0
dtype: int64

Percentage Missing per Column:
A    25.0
B    25.0
C     0.0
dtype: float64


In [20]:
df

Unnamed: 0,A,B,C
0,10.56,104.5,100
1,24.23,,200
2,,304.25,300
3,45.36,414.25,400


## Row Removal (Listwise Deletion)

In [21]:
# Create a copy to show the effect
df_dropped_rows = df.dropna(how='any')

print("\nDataFrame after dropping rows with any NaN:")
print(df_dropped_rows)


DataFrame after dropping rows with any NaN:
       A       B    C
0  10.56  104.50  100
3  45.36  414.25  400


#  Imputation with SimpleImputer (Scikit-learn)

In [30]:
from sklearn.impute import SimpleImputer

# Sample Data with NaNs
X = df


In [31]:
# --- Mean Imputation ---
imputer_mean = SimpleImputer(strategy='mean')

In [33]:
# 1. Fit ONLY on X_train
imputer_mean.fit(X)

# 2. Transform both X_train and X_test
X_imputed_mean = imputer_mean.transform(X)


print("\nMean Imputation - Learned Mean (A, B, C):")
print(imputer_mean.statistics_)
print("\nX")
print(X)
      
print("\nMean Imputation - Scaled Training Data (A, B, C):")
print(X_imputed_mean)



Mean Imputation - Learned Mean (A, B, C):
[ 26.71666667 274.33333333 250.        ]

X
       A       B    C
0  10.56  104.50  100
1  24.23     NaN  200
2    NaN  304.25  300
3  45.36  414.25  400

Mean Imputation - Scaled Training Data (A, B, C):
[[ 10.56       104.5        100.        ]
 [ 24.23       274.33333333 200.        ]
 [ 26.71666667 304.25       300.        ]
 [ 45.36       414.25       400.        ]]


In [38]:
# --- Mode Imputation (for Categorical/Discrete Data) ---
# Use strategy='most_frequent'
# multiple mode then first mode
imputer_mode = SimpleImputer(strategy='most_frequent')
imputer_mode.fit(X)
X_imputed_mode = imputer_mode.transform(X)
print("\nMode Imputation - Learned Mode (A, B, C):")
print(imputer_mode.statistics_)
print("\nX")
print(X)
      
print("\nMode Imputation - Scaled Training Data (A, B, C):")
print(X_imputed_mode)



Mode Imputation - Learned Mode (A, B, C):
[ 10.56 104.5  100.  ]

X
       A       B    C
0  10.56  104.50  100
1  24.23     NaN  200
2    NaN  304.25  300
3  45.36  414.25  400

Mode Imputation - Scaled Training Data (A, B, C):
[[ 10.56 104.5  100.  ]
 [ 24.23 104.5  200.  ]
 [ 10.56 304.25 300.  ]
 [ 45.36 414.25 400.  ]]


In [39]:
# --- Median Imputation ---
# Use strategy='median'
imputer_median = SimpleImputer(strategy='median')
imputer_median.fit(X)
X_imputed_median = imputer_median.transform(X)
print("\nMedian Imputation - Learned Median (A, B, C):")
print(imputer_median.statistics_)
print("\nX")
print(X)
      
print("\nMedian Imputation - Scaled Training Data (A, B, C):")
print(X_imputed_median)


Median Imputation - Learned Median (A, B, C):
[ 24.23 304.25 250.  ]

X
       A       B    C
0  10.56  104.50  100
1  24.23     NaN  200
2    NaN  304.25  300
3  45.36  414.25  400

Median Imputation - Scaled Training Data (A, B, C):
[[ 10.56 104.5  100.  ]
 [ 24.23 304.25 200.  ]
 [ 24.23 304.25 300.  ]
 [ 45.36 414.25 400.  ]]


# KNNImputer

In [40]:
from sklearn.impute import KNNImputer

# Re-initialize the imputer (using default n_neighbors=5)
knn_imputer = KNNImputer()


knn_imputer.fit(X)

# 2. Transform the data
X_imputed_knn = knn_imputer.transform(X)

print("\nKNN Imputation - Scaled Data (A, B, C):")
print(X_imputed_knn)


KNN Imputation - Scaled Data (A, B, C):
[[ 10.56       104.5        100.        ]
 [ 24.23       274.33333333 200.        ]
 [ 26.71666667 304.25       300.        ]
 [ 45.36       414.25       400.        ]]


# Creating a Missing Indicator Variable

In [42]:
from sklearn.impute import SimpleImputer

# Use SimpleImputer with add_indicator=True
imputer_indicator = SimpleImputer(strategy='mean', add_indicator=True)

# Fit and transform the data
X_imputed_ind = imputer_indicator.fit_transform(X)

# The result array will have 3 original columns + 3 indicator columns
print("\nMissing Indicator - Imputed Data (4 columns: A, B, C, Indicator_A, Indicator_B):")
print(X_imputed_ind)

# The last two columns are the binary indicators (1 means it was missing)
# Column C has no missing value so no indicating column


Missing Indicator - Imputed Data (4 columns: A, B, C, Indicator_A, Indicator_B):
[[ 10.56       104.5        100.           0.           0.        ]
 [ 24.23       274.33333333 200.           0.           1.        ]
 [ 26.71666667 304.25       300.           1.           0.        ]
 [ 45.36       414.25       400.           0.           0.        ]]


# IterativeImputer

In [43]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer # if old version
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge # A common estimator for IterativeImputer
from sklearn.model_selection import train_test_split

# 1. Create Sample Data with Missing Values
data = {
    'Hours_Studied': [2, 5, np.nan, 8, 1, 6],
    'Score': [60, 95, 75, np.nan, 50, 90],
    'IQ': [100, 110, 120, 105, np.nan, 115]
}
df = pd.DataFrame(data)
X = df.copy()

In [44]:
# Split the data (essential step!)
X_train, X_test = train_test_split(X, test_size=0.5, random_state=42)

print("Original Training Data:")
print(X_train)
print("-" * 50)

Original Training Data:
   Hours_Studied  Score     IQ
2            NaN   75.0  120.0
4            1.0   50.0    NaN
3            8.0    NaN  105.0
--------------------------------------------------


In [45]:
# 2. Initialize the IterativeImputer
# estimator: The model used for prediction (default is BayesianRidge)
# max_iter: The number of imputation cycling steps
imputer = IterativeImputer(estimator=BayesianRidge(),
                           max_iter=10,
                           random_state=42)

In [46]:
# 3. Fit ONLY on Training Data
# The imputer learns the relationships between the features in the training set.
imputer.fit(X_train)

0,1,2
,estimator,BayesianRidge()
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True


In [47]:
# 4. Transform Data
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

print("Imputed Training Data (Numpy Array):")
print(X_train_imputed)

Imputed Training Data (Numpy Array):
[[245.9697529   75.         120.        ]
 [  1.          50.         104.55876759]
 [  8.          50.714374   105.        ]]


In [48]:
# Convert back to DataFrame for readability
X_train_imputed_df = pd.DataFrame(X_train_imputed, columns=X.columns, index=X_train.index)
print("\nImputed Training Data (DataFrame):")
print(X_train_imputed_df)


Imputed Training Data (DataFrame):
   Hours_Studied      Score          IQ
2     245.969753  75.000000  120.000000
4       1.000000  50.000000  104.558768
3       8.000000  50.714374  105.000000
