In [1]:
import pandas as pd
import numpy as np

## 1. Deletion method

In [2]:
from sklearn.datasets import fetch_california_housing

house = fetch_california_housing()
df = pd.DataFrame(house.data, columns=house.feature_names)

# introduce missing values
df.iloc[::10] = np.nan

# Check for missing values
df.isnull().sum()

MedInc        2064
HouseAge      2064
AveRooms      2064
AveBedrms     2064
Population    2064
AveOccup      2064
Latitude      2064
Longitude     2064
dtype: int64

In [3]:
df = df.dropna()
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

## 2. Mean/Median Imputation method

In [4]:
from sklearn.impute import SimpleImputer

house = fetch_california_housing()
df = pd.DataFrame(house.data, columns=house.feature_names)

df.iloc[::10, :] = np.nan
df.isnull().sum()

MedInc        2064
HouseAge      2064
AveRooms      2064
AveBedrms     2064
Population    2064
AveOccup      2064
Latitude      2064
Longitude     2064
dtype: int64

In [5]:
imputer = SimpleImputer(strategy="mean")
df_imputed = imputer.fit_transform(df)
# df_imputed is now a numpy array

# convert it back to pandas df
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

df_imputed.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

## Or

In [6]:
house = fetch_california_housing()
df = pd.DataFrame(house.data, columns=house.feature_names)

df.iloc[::10] = np.nan
df.isnull().sum()

MedInc        2064
HouseAge      2064
AveRooms      2064
AveBedrms     2064
Population    2064
AveOccup      2064
Latitude      2064
Longitude     2064
dtype: int64

In [7]:
# Replace missing values with mean/median
df.fillna(df.mean(), inplace=True)
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

## 3. Regression imputation method

In [8]:
from sklearn.linear_model import LinearRegression

house = fetch_california_housing()
df = pd.DataFrame(house.data, columns=house.feature_names)

df.iloc[10:20, 0] = np.nan
df.isnull().sum()

MedInc        10
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
dtype: int64

In [9]:
# Split the dataset into with and without missing values

# .isna.any() -> default axis for any is 0 (columns) i.e.
# i.e it shows (null values present or not) corresponding to the columns
# change axis = 1 to do it row wise

x_missing = df.loc[df.isna().any(axis=1), :]
x_no_missing = df.dropna()
x_missing

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
10,,52.0,5.477612,1.079602,910.0,2.263682,37.85,-122.26
11,,52.0,4.77248,1.024523,1504.0,2.049046,37.85,-122.26
12,,52.0,5.32265,1.012821,1098.0,2.346154,37.85,-122.26
13,,52.0,4.0,1.097701,345.0,1.982759,37.84,-122.26
14,,52.0,4.262903,1.009677,1212.0,1.954839,37.85,-122.26
15,,50.0,4.242424,1.07197,697.0,2.640152,37.85,-122.26
16,,52.0,5.939577,1.048338,793.0,2.39577,37.85,-122.27
17,,52.0,4.052805,0.966997,648.0,2.138614,37.85,-122.27
18,,50.0,5.343675,1.085919,990.0,2.362768,37.84,-122.26
19,,52.0,5.465455,1.083636,690.0,2.509091,37.84,-122.27


In [10]:
x_train = x_no_missing.drop(columns=["MedInc"])
y_train = x_no_missing["MedInc"]
x_test = x_missing.drop(columns=["MedInc"])

In [11]:
model = LinearRegression()
model.fit(x_train, y_train)

In [12]:
# Impute missing values using the trained model
predicted_missing = model.predict(x_test)

x_missing.loc[:, "MedInc"] = predicted_missing
x_missing

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_missing.loc[:, "MedInc"] = predicted_missing


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
10,3.985553,52.0,5.477612,1.079602,910.0,2.263682,37.85,-122.26
11,3.490337,52.0,4.77248,1.024523,1504.0,2.049046,37.85,-122.26
12,4.1297,52.0,5.32265,1.012821,1098.0,2.346154,37.85,-122.26
13,2.374866,52.0,4.0,1.097701,345.0,1.982759,37.84,-122.26
14,3.032323,52.0,4.262903,1.009677,1212.0,1.954839,37.85,-122.26
15,2.75751,50.0,4.242424,1.07197,697.0,2.640152,37.85,-122.26
16,4.623283,52.0,5.939577,1.048338,793.0,2.39577,37.85,-122.27
17,3.030212,52.0,4.052805,0.966997,648.0,2.138614,37.85,-122.27
18,3.843677,50.0,5.343675,1.085919,990.0,2.362768,37.84,-122.26
19,3.973694,52.0,5.465455,1.083636,690.0,2.509091,37.84,-122.27


In [13]:
# Concatenate the two datasets back together
x_imputed = pd.concat([x_missing, x_no_missing], axis=0)


print(x_imputed.isnull().sum())
x_imputed

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
10,3.985553,52.0,5.477612,1.079602,910.0,2.263682,37.85,-122.26
11,3.490337,52.0,4.772480,1.024523,1504.0,2.049046,37.85,-122.26
12,4.129700,52.0,5.322650,1.012821,1098.0,2.346154,37.85,-122.26
13,2.374866,52.0,4.000000,1.097701,345.0,1.982759,37.84,-122.26
14,3.032323,52.0,4.262903,1.009677,1212.0,1.954839,37.85,-122.26
...,...,...,...,...,...,...,...,...
20635,1.560300,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.556800,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.700000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.867200,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


## 4 Using Interpolation method


In [14]:
from sklearn.datasets import load_iris

data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

In [15]:
# Add some missing values
X.iloc[10:20, 0] = None
X.iloc[20:30, 2] = None

In [16]:
X.isnull().sum()

sepal length (cm)    10
sepal width (cm)      0
petal length (cm)    10
petal width (cm)      0
dtype: int64

In [17]:
# Interpolate missing values using linear interpolation
X = X.interpolate(method="linear")

In [18]:
X.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

## 5. Using multiple interpolation methods

In [19]:
from sklearn.datasets import load_iris

data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

In [20]:
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [21]:
X.iloc[10:20, 0:2] = np.nan
X.isnull().sum()

sepal length (cm)    10
sepal width (cm)     10
petal length (cm)     0
petal width (cm)      0
dtype: int64

In [22]:
X["sepal length (cm)"] = X["sepal length (cm)"].interpolate(method="linear")
X["sepal width (cm)"] = X["sepal width (cm)"].interpolate(method="quadratic")

In [23]:
X.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

## 6. Multiple Imputation technique


In [24]:
from sklearn.datasets import load_iris
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [25]:
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)

X.iloc[0:100:4, 0] = None
X.isnull().sum()

sepal length (cm)    25
sepal width (cm)      0
petal length (cm)     0
petal width (cm)      0
dtype: int64

In [26]:
# Impute missing values using IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
X_imputed = imp.fit_transform(X)

pd.DataFrame(X_imputed, columns=X.columns).isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64