In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the California housing bunch and assemble one DataFrame:
# the feature matrix under its feature names, plus the target appended
# as the last column ('MedHouseVal', the median house value).
housing = fetch_california_housing()
df = pd.DataFrame(
    housing.data,
    columns=housing.feature_names,
).assign(MedHouseVal=housing.target)

# Preview the first rows to sanity-check the columns.
display(df.head())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

# Preprocessing: missing-value report, train/test split, imputation, scaling.
#
# Every fitted transformer (imputer, scaler) is fit on the TRAINING features
# only and then applied to the test features. Fitting on the full frame would
# let test-set statistics leak into preprocessing and bias the evaluation.

# Check for missing values
print("Missing values before imputation:")
print(df.isnull().sum())

# A missing target cannot be imputed without fabricating labels, so rows
# lacking 'MedHouseVal' are dropped rather than filled. This is a no-op for
# this dataset, which reports zero missing values above.
df_labeled = df.dropna(subset=['MedHouseVal'])

# Separate features (X) and target (y)
X = df_labeled.drop(columns='MedHouseVal')
y = df_labeled['MedHouseVal']

# Split the data into training and testing sets BEFORE fitting any preprocessing
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Impute missing feature values (if any) with the per-column median.
# The medians are learned from the training split only; included as a general
# preprocessing step even though this dataset has no gaps.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Reassemble the imputed rows (original row order, target as last column) so
# the post-imputation report covers the same columns as the report above.
df_imputed = pd.concat([X_train, X_test]).sort_index().assign(MedHouseVal=y)

print("\nMissing values after imputation:")
print(df_imputed.isnull().sum())

# Scale the features (statistics learned from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nData preprocessing complete. Data split into training and testing sets and features scaled.")
print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Shape of X_test_scaled:", X_test_scaled.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Missing values before imputation:
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

Missing values after imputation:
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

Data preprocessing complete. Data split into training and testing sets and features scaled.
Shape of X_train_scaled: (16512, 8)
Shape of X_test_scaled: (4128, 8)
Shape of y_train: (16512,)
Shape of y_test: (4128,)


In [3]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares linear regression on the scaled training
# features; `fit` returns the estimator itself, so construction and
# training are chained into one expression.
model = LinearRegression().fit(X_train_scaled, y_train)

print("Linear regression model trained successfully.")

Linear regression model trained successfully.


In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the held-out (scaled) test features
y_pred = model.predict(X_test_scaled)

# Score the predictions against the true test targets with each metric,
# in the order: squared error, absolute error, coefficient of determination.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report each metric rounded to two decimals
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Mean Squared Error (MSE): 0.56
Mean Absolute Error (MAE): 0.53
R-squared (R2): 0.58


In [5]:
# Demonstration of inference on "new" data.
# In a real scenario the rows would be newly collected housing records passed
# through the fitted scaler; here the first 5 rows of the already-scaled test
# set stand in for them.
new_data_scaled = X_test_scaled[:5]

# Run the trained model on the sample
new_predictions = model.predict(new_data_scaled)

print("Predictions for the first 5 samples of the test set:")
for rank, predicted_value in enumerate(new_predictions, start=1):
    print(f"Sample {rank}: Predicted House Value = {predicted_value:.2f}")

# Show the matching ground-truth targets (first 5 test rows, by position)
# so the predictions above can be compared against them.
print("\nActual House Values for the first 5 samples of the test set:")
for rank, actual_value in enumerate(y_test.iloc[:5], start=1):
    print(f"Sample {rank}: Actual House Value = {actual_value:.2f}")

Predictions for the first 5 samples of the test set:
Sample 1: Predicted House Value = 0.72
Sample 2: Predicted House Value = 1.76
Sample 3: Predicted House Value = 2.71
Sample 4: Predicted House Value = 2.84
Sample 5: Predicted House Value = 2.60

Actual House Values for the first 5 samples of the test set:
Sample 1: Actual House Value = 0.48
Sample 2: Actual House Value = 0.46
Sample 3: Actual House Value = 5.00
Sample 4: Actual House Value = 2.19
Sample 5: Actual House Value = 2.78
