## Import libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

## Load California Housing Dataset

In [3]:
# Fetch the California Housing bundle and wrap it in labelled pandas objects
california = fetch_california_housing()
X = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
)
# Target: median house value (reported later in this notebook in units of $100,000)
y = pd.Series(data=california.target, name='MedHouseVal')

## Split Data into Training and Test Sets

In [4]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Preprocess Data

In [5]:
# Learn the scaling statistics from the training rows only, then apply that same
# transform to both splits so the test set never influences preprocessing.
# (Standardization matters for scale-sensitive models such as SVR.)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Train and Evaluate Models

In [6]:
# Candidate regressors, keyed by the label shown in the results table
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=1.0),
    "Support Vector Regression": SVR(kernel='rbf'),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
}

# Fit every candidate on the scaled training split and score it on the scaled test split
results = []
for name, model in models.items():
    # fit() hands back the fitted estimator, so the prediction can be chained onto it
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    results += [(name, mse, r2)]

# One row per model: label, test-set MSE, test-set R²
results_df = pd.DataFrame(data=results, columns=["Model", "MSE", "R²"])
print(results_df)

                       Model       MSE        R²
0          Linear Regression  0.555892  0.575788
1           Ridge Regression  0.555855  0.575816
2           Lasso Regression  1.310696 -0.000219
3  Support Vector Regression  0.357004  0.727563
4              Decision Tree  0.493969  0.623042
5              Random Forest  0.255170  0.805275


## Feature Importance (Random Forest)

In [7]:
# Rank the input features by the importance scores of the fitted Random Forest
forest = models["Random Forest"]
importances = pd.DataFrame(
    data={
        "Feature": california.feature_names,
        "Importance": forest.feature_importances_,
    }
)
importances = importances.sort_values("Importance", ascending=False)

print(importances)

      Feature  Importance
0      MedInc    0.524871
5    AveOccup    0.138443
6    Latitude    0.088936
7   Longitude    0.088629
1    HouseAge    0.054593
2    AveRooms    0.044272
4  Population    0.030650
3   AveBedrms    0.029606


## Test with a Sample from the Test Set

In [8]:
# Sanity-check the pipeline on one real observation: take the first test row,
# scale it with the already-fitted scaler, and compare prediction vs. ground truth.

# Select the first row of the test set (unscaled).
# Slicing with 0:1 (rather than indexing with 0) keeps a one-row DataFrame,
# so the column names are preserved for the scaler.
sample = X_test.iloc[0:1]

# Scale the sample with the scaler that was fitted on the training data
sample_scaled = scaler.transform(sample)

# Predict with an explicitly named estimator. Do not rely on `model`, the loop
# variable left over from the training cell: it only refers to the Random Forest
# because that entry happens to be last in `models`, and it would silently change
# if the dict were reordered or the name were reused.
prediction = models["Random Forest"].predict(sample_scaled)

# Actual value from y_test (same positional row as the sample)
actual = y_test.iloc[0]

print("Sample from Test Set:")
print(sample)
print(f"\nPredicted MedHouseVal: {prediction[0]:.2f} ($100,000s)")
print(f"Actual MedHouseVal: {actual:.2f} ($100,000s)")

Sample from Test Set:
       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
20046  1.6812      25.0  4.192201   1.022284      1392.0  3.877437     36.06   

       Longitude  
20046    -119.01  

Predicted MedHouseVal: 0.51 ($100,000s)
Actual MedHouseVal: 0.48 ($100,000s)


## Test with Hypothetical Data

In [10]:

# Build a single made-up district. The columns use the same names and order as
# the training features, which is what the fitted scaler is applied to below.
hypothetical_data = pd.DataFrame({
    "MedInc": [3.0],          # Median income
    "HouseAge": [25.0],       # House age
    "AveRooms": [4.0],        # Avg rooms per household
    "AveBedrms": [1.0],       # Avg bedrooms per household
    "Population": [1000.0],   # Population
    "AveOccup": [2.0],        # Avg household members
    "Latitude": [34.05],      # Latitude
    "Longitude": [-118.24]    # Longitude
})

# Scale the data with the scaler that was fitted on the training data
hypothetical_scaled = scaler.transform(hypothetical_data)

# Predict with an explicitly named estimator. Do not rely on `model`, the loop
# variable left over from the training cell: it only refers to the Random Forest
# because that entry happens to be last in `models`.
hypothetical_pred = models["Random Forest"].predict(hypothetical_scaled)

print("\nHypothetical Input:")
print(hypothetical_data)
print(f"\nPredicted MedHouseVal: {hypothetical_pred[0]:.2f} ($100,000s)")


Hypothetical Input:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0     3.0      25.0       4.0        1.0      1000.0       2.0     34.05   

   Longitude  
0    -118.24  

Predicted MedHouseVal: 2.21 ($100,000s)
