In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split

# Step a: Read the housing data from the local CSV file
df = pd.read_csv('USA_Housingg.csv')

# Every column except "Price" is a model input; "Price" is the target.
X = df.drop(columns="Price").to_numpy()
# Keep the target as an (n_samples, 1) column vector so it lines up with the
# matrix products used when solving for the regression coefficients.
y = df["Price"].to_numpy().reshape(-1, 1)

# Step b: Standardize the input features
# NOTE: the scaler is fitted on every row, i.e. before any train/test split
# is made further down in this script.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step c: 5-fold cross-validation setup (shuffled, seeded for reproducibility)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_beta = None
# Start below any attainable R2 so the first fold always becomes the best so far.
best_r2 = -np.inf
r2_scores = []

# Step d: Fit ordinary least squares on each training fold and score it on
# the corresponding held-out fold.
for fold, (train_idx, test_idx) in enumerate(kf.split(X_scaled)):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Prepend a column of ones so the first coefficient acts as the intercept.
    X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    # Least-squares solution of X_train_bias @ beta ~= y_train.
    # Mathematically this is beta = (X^T X)^(-1) X^T y, but lstsq avoids forming
    # and inverting X^T X explicitly, which is numerically fragile and fails
    # outright when the features are collinear.
    beta, *_ = np.linalg.lstsq(X_train_bias, y_train, rcond=None)

    # Predictions on the held-out fold
    y_pred = X_test_bias @ beta

    # R2 score for this fold
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    print(f"Fold {fold+1}: R2 Score = {r2:.4f}")

    # Remember the coefficients of the best-scoring fold.
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

# The fold count is read from kf so the message stays correct if n_splits changes.
print(f"\nAverage R2 Score across {kf.get_n_splits()} folds:", np.mean(r2_scores))
print("Best R2 Score:", best_r2)

# Step e: Evaluate the best cross-validation beta on a 70/30 split
# No refit happens here: best_beta (the coefficients from the highest-scoring
# CV fold) is applied unchanged to both partitions.
# NOTE: best_beta was fitted on a CV training fold drawn from the same rows,
# so the 30% partition can overlap the data it was fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Prepend the intercept column of ones to each partition.
X_train_bias = np.hstack((np.ones((len(X_train), 1)), X_train))
X_test_bias = np.hstack((np.ones((len(X_test), 1)), X_test))

# Predict both partitions with the fixed coefficient vector.
y_train_pred = np.dot(X_train_bias, best_beta)
y_test_pred = np.dot(X_test_bias, best_beta)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Model Performance using Best Beta:")
print(f"Train R2 Score: {train_r2:.4f}")
print(f"Test R2 Score: {test_r2:.4f}")

Fold 1: R2 Score = 0.9180
Fold 2: R2 Score = 0.9146
Fold 3: R2 Score = 0.9116
Fold 4: R2 Score = 0.9193
Fold 5: R2 Score = 0.9244

Average R2 Score across 5 folds: 0.9175745431092714
Best R2 Score: 0.9243869413350316

Final Model Performance using Best Beta:
Train R2 Score: 0.9193
Test R2 Score: 0.9147
