In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# Date: September 2024
# License: MIT

In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
# Fetch the California housing data: features as a DataFrame, target as y
X, y = fetch_california_housing(as_frame=True, return_X_y=True)
feature_names = X.columns

# Fit the scaler on the features, then apply it to standardize them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Re-attach the original column labels to the standardized array
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)

# Compute the variance inflation factor (VIF) of every feature.
# NOTE: no constant column is appended to the design matrix; the features
# were centered by the scaler above, so no intercept term is needed here.
design_matrix = X_scaled_df.values
vif_data = pd.DataFrame({
    'Feature': X_scaled_df.columns,
    'VIF': [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
})
print(vif_data)

      Feature       VIF
0      MedInc  2.501295
1    HouseAge  1.241254
2    AveRooms  8.342786
3   AveBedrms  6.994995
4  Population  1.138125
5    AveOccup  1.008324
6    Latitude  9.297624
7   Longitude  8.962263
