In [1]:
from sklearn.datasets import fetch_california_housing

# Step 1: Load the California housing dataset
data = fetch_california_housing()

In [2]:
# Inspect the type and structure of the loaded data
print(f"Type of data: {type(data)}")
print(f"Keys in data: {list(data.keys())}")

Type of data: <class 'sklearn.utils._bunch.Bunch'>
Keys in data: ['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR']


In [3]:
# Display the value for a single key: 'feature_names'
print("\nFeature Names:")
print(data['DESCR'])


Feature Names:
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, usin

In [4]:
# Inspect the structure of the dataset
print("\nDataset structure:")
print(f"Shape of data (features): {data['data'].shape}")
print(f"Shape of target: {data['target'].shape}")

# Display the first few rows of data and target
print("\nFirst 5 rows of data (features):")
print(data['data'][:5])

print("\nFirst 5 target values (median house values):")
print(data['target'][:5])


Dataset structure:
Shape of data (features): (20640, 8)
Shape of target: (20640,)

First 5 rows of data (features):
[[ 8.32520000e+00  4.10000000e+01  6.98412698e+00  1.02380952e+00
   3.22000000e+02  2.55555556e+00  3.78800000e+01 -1.22230000e+02]
 [ 8.30140000e+00  2.10000000e+01  6.23813708e+00  9.71880492e-01
   2.40100000e+03  2.10984183e+00  3.78600000e+01 -1.22220000e+02]
 [ 7.25740000e+00  5.20000000e+01  8.28813559e+00  1.07344633e+00
   4.96000000e+02  2.80225989e+00  3.78500000e+01 -1.22240000e+02]
 [ 5.64310000e+00  5.20000000e+01  5.81735160e+00  1.07305936e+00
   5.58000000e+02  2.54794521e+00  3.78500000e+01 -1.22250000e+02]
 [ 3.84620000e+00  5.20000000e+01  6.28185328e+00  1.08108108e+00
   5.65000000e+02  2.18146718e+00  3.78500000e+01 -1.22250000e+02]]

First 5 target values (median house values):
[4.526 3.585 3.521 3.413 3.422]


In [5]:
import pandas as pd

# Convert to Pandas DataFrame for better readability
features_df = pd.DataFrame(data['data'], columns=data['feature_names'])
target_df = pd.DataFrame(data['target'], columns=['MedianHouseValue'])

# Display the first few rows of the DataFrame
print("\nFirst 5 rows of the features DataFrame:")
print(features_df.head())

print("\nFirst 5 rows of the target DataFrame:")
print(target_df.head())


First 5 rows of the features DataFrame:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  

First 5 rows of the target DataFrame:
   MedianHouseValue
0             4.526
1             3.585
2             3.521
3             3.413
4             3.422


In [6]:

# Check for missing values in features and target
print("\nMissing values in features:")
print(features_df.isnull().sum())

print("\nMissing values in target:")
print(target_df.isnull().sum())


Missing values in features:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

Missing values in target:
MedianHouseValue    0
dtype: int64


In [7]:
from sklearn.preprocessing import StandardScaler


# Scale the features using StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_df)

# Convert scaled features back to DataFrame for readability
scaled_features_df = pd.DataFrame(scaled_features, columns=features_df.columns)

# Display the first few rows of the scaled features DataFrame
print("\nFirst 5 rows of the scaled features DataFrame:")
print(scaled_features_df.head())


First 5 rows of the scaled features DataFrame:
     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.344766  0.982143  0.628559  -0.153758   -0.974429 -0.049597  1.052548   
1  2.332238 -0.607019  0.327041  -0.263336    0.861439 -0.092512  1.043185   
2  1.782699  1.856182  1.155620  -0.049016   -0.820777 -0.025843  1.038503   
3  0.932968  1.856182  0.156966  -0.049833   -0.766028 -0.050329  1.038503   
4 -0.012881  1.856182  0.344711  -0.032906   -0.759847 -0.085616  1.038503   

   Longitude  
0  -1.327835  
1  -1.322844  
2  -1.332827  
3  -1.337818  
4  -1.337818  


In [8]:
from sklearn.model_selection import train_test_split

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(scaled_features_df, target_df, test_size=0.2, random_state=42)

print("\nTraining set size (features):", X_train.shape)
print("Testing set size (features):", X_test.shape)


Training set size (features): (16512, 8)
Testing set size (features): (4128, 8)


In [9]:

# Randomly sample 2,000 instances from the training set
sample_size = 2000
X_train_sample = X_train.sample(n=sample_size, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

print(f"Sampled training set size: {X_train_sample.shape}")
print(f"Sampled training set size: {y_train_sample.shape}")

Sampled training set size: (2000, 8)
Sampled training set size: (2000, 1)


In [10]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import pandas as pd
from scipy.stats import uniform
import numpy as np

# Define the parameter distributions
param_distributions = {
    'C': uniform(1, 20),          # C values between 1 and 21
    'epsilon': uniform(0.01, 1),  # Epsilon values between 0.01 and 1.01
    'kernel': ['linear', 'rbf', 'poly']
}

# Initialize the SVR model
svr = SVR()

# Define a custom RMSE scoring function
def rmse_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    return -rmse  # Negate to align with scikit-learn's "greater_is_better=False"

# Set up RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=svr,
    param_distributions=param_distributions,
    n_iter=30,                    # Number of random combinations to try
    cv=5,                         # 5-fold cross-validation
    scoring=rmse_scorer,          # Use custom RMSE scorer
    verbose=2,
    random_state=42,
    n_jobs=-1                     # Use all available cores
)

# Perform random search on the sampled dataset
random_search.fit(X_train_sample, y_train_sample.values.ravel())

# Display the best parameters and the best score
print(f"Best parameters: {random_search.best_params_}")
print(f"Best cross-validation RMSE: {-random_search.best_score_:.4f}")

# Create a DataFrame to show the performance of all hyperparameter combinations
results_df = pd.DataFrame(random_search.cv_results_)

# Extract and display relevant information
results_df = results_df[['param_C', 'param_epsilon', 'param_kernel', 'mean_test_score']]

print("\nPerformance of all hyperparameter combinations:")
print(results_df.sort_values(by='mean_test_score', ascending=False))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters: {'C': 14.25044568707964, 'epsilon': 0.32171107608941096, 'kernel': 'rbf'}
Best cross-validation RMSE: 0.6143

Performance of all hyperparameter combinations:
      param_C param_epsilon param_kernel  mean_test_score
17  14.250446      0.321711          rbf        -0.614312
16   4.467293      0.401061          rbf        -0.623756
1    16.59382       0.60685          rbf        -0.629571
3     13.0223      0.718073          rbf        -0.637105
19  20.391693      0.785133          rbf        -0.646457
4    2.128232      0.731999          rbf        -0.660818
26   1.312728      0.433401       linear        -1.154285
5    5.246782      0.191825       linear        -1.163311
12  10.009985      0.023265       linear        -1.165679
2     4.11989      0.068084       linear        -1.165827
23   8.773546      0.281349       linear        -1.170434
11   1.929008      0.617545       linear        -1.181115
18   5.15

In [11]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Initialize the SVR model with the best hyperparameters
final_svr_model = SVR(C=10, epsilon=0.1, kernel='rbf')

# Train the model on the full training set
final_svr_model.fit(X_train, y_train.values.ravel())

# Predict on the test set
y_pred_final = final_svr_model.predict(X_test)

# Compute Root Mean Squared Error (RMSE)
rmse_final = np.sqrt(mean_squared_error(y_test, y_pred_final))

# Compute Mean Absolute Percentage Error (MAPE)
mape_final = np.mean(np.abs((y_test.values.ravel() - y_pred_final) / y_test.values.ravel())) * 100

# Compute Normalized RMSE (NRMSE)
nrmse_final = rmse_final / (y_test.max().values[0] - y_test.min().values[0])

print(f"Final SVR Model RMSE: {rmse_final:.4f}")
print(f"Final SVR Model MAPE: {mape_final:.2f}%")
print(f"Final SVR Model NRMSE: {nrmse_final:.4f}")

Final SVR Model RMSE: 0.5672
Final SVR Model MAPE: 20.40%
Final SVR Model NRMSE: 0.1170
