In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

## 1. Deletion method

In [2]:
# Read the data
house = fetch_california_housing()

# Convert it into pandas dataframe
df = pd.DataFrame(house.data, columns=house.feature_names)

# introduce missing values into the df
# make every 10th row = nan (missing value)
df.iloc[::10] = np.nan

# Check for missing values
df.isnull().sum()

MedInc        2064
HouseAge      2064
AveRooms      2064
AveBedrms     2064
Population    2064
AveOccup      2064
Latitude      2064
Longitude     2064
dtype: int64

In [3]:
df = df.dropna()
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

## 2. Mean/Median Imputation method

In [4]:
from sklearn.impute import SimpleImputer

# Read data
house = fetch_california_housing()
# Convert it into dataframe
df = pd.DataFrame(house.data, columns=house.feature_names)

# introduce missing values into the df
# make every 10th row = nan (missing value)
df.iloc[::10] = np.nan

# Check for missing values
df.isnull().sum()

MedInc        2064
HouseAge      2064
AveRooms      2064
AveBedrms     2064
Population    2064
AveOccup      2064
Latitude      2064
Longitude     2064
dtype: int64

In [5]:
# Fill missing values with the mean of the column
imputer = SimpleImputer(strategy="mean")
df_imputed = imputer.fit_transform(df)

# Convert the imputed data back to a Pandas DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

df_imputed.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

## Or

In [6]:
# Read data
house = fetch_california_housing()
# Convert it into dataframe
df = pd.DataFrame(house.data, columns=house.feature_names)

# introduce missing values into the df
# make every 10th row = nan (missing value)
df.iloc[::10] = np.nan

# Check for missing values
df.isnull().sum()

MedInc        2064
HouseAge      2064
AveRooms      2064
AveBedrms     2064
Population    2064
AveOccup      2064
Latitude      2064
Longitude     2064
dtype: int64

In [7]:
# Replace missing values with mean or median
df.fillna(df.mean(), inplace=True)
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

## 3. Regression imputation method

In [8]:
from sklearn.linear_model import LinearRegression

# Read data
house = fetch_california_housing()

# Convert it into dataframe
df = pd.DataFrame(house.data, columns=house.feature_names)

# adding missing values into the df
df.iloc[10:20, 0] = np.nan

# Check for missing values
df.isnull().sum()

MedInc        10
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
dtype: int64

In [9]:
# Split the dataset into with and without missing values
x_missing = df[df.isna().any(axis=1)]
x_no_missing = df.dropna()

In [10]:
x_train = x_no_missing.drop(columns=["MedInc"])
y_train = x_no_missing["MedInc"]
x_test = x_missing.drop(columns=["MedInc"])

In [11]:
# Define a model
model = LinearRegression()
model.fit(x_train, y_train)  # Fit a model with features and targetr as MedInc

In [12]:
x_missing.loc[:, ("MedInc", "HouseAge")]

Unnamed: 0,MedInc,HouseAge
10,,52.0
11,,52.0
12,,52.0
13,,52.0
14,,52.0
15,,50.0
16,,52.0
17,,52.0
18,,50.0
19,,52.0


In [13]:
# Impute missing values using the trained model
x_missing.loc[:, ("MedInc")] = model.predict(x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_missing.loc[:, ("MedInc")] = model.predict(x_test)


In [14]:
# Concatenate the two datasets back together
x_imputed = pd.concat([x_missing, x_no_missing], axis=0)

# Check if there are any missing values left
x_imputed.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

## 4 Using Interpolation method


In [15]:
from sklearn.datasets import load_iris

# Load Iris dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)  # features
y = data.target  # target

In [16]:
# Add some missing values
X.iloc[10:20, 0] = None
X.iloc[20:30, 2] = None

In [17]:
X.isnull().sum()

sepal length (cm)    10
sepal width (cm)      0
petal length (cm)    10
petal width (cm)      0
dtype: int64

In [18]:
# Interpolate missing values using linear interpolation
X = X.interpolate(method="linear")  # simple

In [19]:
X.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

## 5. Using multiple interpolation methods

In [20]:
from sklearn.datasets import load_iris

# Load Iris dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)  # features
y = data.target  # target

In [21]:
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [22]:
# Put some missing values
X.iloc[10:20, 0] = None
X.isnull().sum()

sepal length (cm)    10
sepal width (cm)      0
petal length (cm)     0
petal width (cm)      0
dtype: int64

In [23]:
# Interpolate missing values using different interpolation methods
X["sepal length (cm)"] = X["sepal length (cm)"].interpolate(method="linear")
X["sepal width (cm)"] = X["sepal width (cm)"].interpolate(method="quadratic")

In [24]:
X.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

## 6. Multiple Imputation technique


In [25]:
from sklearn.datasets import load_iris
from sklearn.experimental import enable_iterative_imputer  # iteration
from sklearn.impute import IterativeImputer  # mutltiple imputer

In [26]:
# Load Iris dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)

# Add some missing values
X.iloc[0:100:4, 0] = None
X.isnull().sum()

sepal length (cm)    25
sepal width (cm)      0
petal length (cm)     0
petal width (cm)      0
dtype: int64

In [27]:
# Impute missing values using IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
X_imputed = imp.fit_transform(X)

# Check if there are any missing values left
pd.DataFrame(X_imputed).isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

In [28]:
# i.e
hey = pd.DataFrame(X_imputed, columns=data.feature_names)
hey.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64