In [3]:
import os
# Sanity check: show which files are present in the data directory
# before any later cell tries to load them.
data_dir_entries = os.listdir("./data")
print(data_dir_entries)

['boston_raw.txt', 'boston_cleaned.csv']


In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Linear-regression pipeline: load the cleaned dataset, split it into
# training and test rows, fit an ordinary least-squares model, report test-set
# metrics, and persist them to a CSV report.
# ---------------------------------------------------------------------------

# Input file and output location for the metrics report
data_path = "./data/boston_cleaned.csv"
reports_dir = "./reports"  # Define the directory for saving results
reports_path = os.path.join(reports_dir, "regression_results.csv")

# Tunable settings (previously magic numbers buried in the code)
TRAIN_FRACTION = 0.8  # Share of rows used for training; the remainder is the test set
RANDOM_SEED = 10      # Fixed seed so the shuffle, and therefore every metric, is reproducible

# Fail early with an actionable message if the cleaned dataset is missing
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Error: The file {data_path} does not exist. Run Notebook 2 first.")

# Load cleaned dataset; the last column is used as the target, all others as features
df = pd.read_csv(data_path)

# Shuffle row positions reproducibly before splitting
np.random.seed(RANDOM_SEED)
numberRows = len(df)
randomlyShuffledRows = np.random.permutation(numberRows)

# Derive the split boundary from the actual row count instead of hard-coding an
# index, so the split stays at TRAIN_FRACTION when the cleaned file changes size.
# (For a 506-row file this evaluates to 405, matching the previous fixed index.)
numberTrainingRows = int(round(TRAIN_FRACTION * numberRows))
if numberTrainingRows <= 0 or numberTrainingRows >= numberRows:
    raise ValueError(
        f"Cannot split {numberRows} rows into non-empty training and test sets "
        f"with TRAIN_FRACTION={TRAIN_FRACTION}."
    )

trainingRows = randomlyShuffledRows[:numberTrainingRows]  # Training portion
testRows = randomlyShuffledRows[numberTrainingRows:]      # Held-out test portion

xTrainingData = df.iloc[trainingRows, 0:-1]  # Features (every column except the last)
yTrainingData = df.iloc[trainingRows, -1]    # Target (last column; presumably MEDV)

xTestData = df.iloc[testRows, 0:-1]
yTestData = df.iloc[testRows, -1]

# Train Linear Regression Model
reg = LinearRegression()
reg.fit(xTrainingData, yTrainingData)

# Model Coefficients (one per feature column, in column order)
print("Model Coefficients:", reg.coef_)
print("Intercept:", reg.intercept_)

# Predictions on the held-out rows
yPredictions = reg.predict(xTestData)

# Model Evaluation (all metrics are computed on the test set)
mse = mean_squared_error(yTestData, yPredictions)
r2 = r2_score(yTestData, yPredictions)

# Adjusted R² Calculation: penalises R² for the number of predictors.
n = len(xTestData)  # Number of observations
p = len(xTestData.columns)  # Number of predictors
residualDegreesOfFreedom = n - p - 1
if residualDegreesOfFreedom > 0:
    adj_r2 = 1 - (1 - r2) * (n - 1) / residualDegreesOfFreedom
else:
    # Too few test rows relative to predictors: the statistic is undefined,
    # so report NaN rather than dividing by zero or by a negative count.
    adj_r2 = float("nan")

print("\nModel Evaluation Metrics:")
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"R² Score: {r2:.4f}")
print(f"Adjusted R² Score: {adj_r2:.4f}")

# Ensure the "reports/" directory exists
os.makedirs(reports_dir, exist_ok=True)

# Save results to a file (a single-row CSV; the coefficient list is stored
# in one cell as its string representation)
results = {
    "MSE": mse,
    "R2": r2,
    "Adjusted_R2": adj_r2,
    "Coefficients": reg.coef_.tolist(),
    "Intercept": reg.intercept_
}

results_df = pd.DataFrame([results])
results_df.to_csv(reports_path, index=False)

print(f"\n✅ Regression results saved to {reports_path}")

Model Coefficients: [-1.24918124e-01  3.32128299e-02  1.33812411e-03  2.97105114e+00
 -1.52709220e+01  3.94205181e+00 -4.19683876e-03 -1.35975824e+00
  3.06334641e-01 -1.38625832e-02 -9.55873269e-01 -5.43480934e-01]
Intercept: 38.75264682038655

Model Evaluation Metrics:
Mean Squared Error (MSE): 25.763
R² Score: 0.6581
Adjusted R² Score: 0.6115

✅ Regression results saved to ./reports/regression_results.csv
