# Linear Regression demo (housing.csv)

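This notebook loads `housing.csv` from the current working directory (start Jupyter from the repository root so the file is found), trains a simple scikit-learn `LinearRegression` model on a single feature, evaluates it, plots actual vs. predicted values, and saves the trained model to `model.joblib` in that same directory.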

Run this notebook inside the project's virtual environment (`.venv`) for reproducible results. See `requirements.txt` for dependencies.

In [None]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib

# Apply the white-grid seaborn theme to every figure drawn below.
# `set_theme` is the preferred entry point; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid")

In [None]:
# Read housing.csv, resolved against the directory the kernel is running in
csv_path = os.path.join(os.getcwd(), "housing.csv")
print('Looking for:', csv_path)

df = pd.read_csv(csv_path)
print('Loaded dataframe shape:', df.shape)

# Bare last expression so the notebook renders the first rows as a table
df.head()

In [None]:
# Pick one feature column (X) and one target column (y) from the loaded frame.
# Preferred pair: median_income -> median_house_value, when both columns exist.
if 'median_income' in df.columns and 'median_house_value' in df.columns:
    X = df[['median_income']].copy()
    y = df['median_house_value'].copy()
    print('Using median_income -> median_house_value')
else:
    # Fallback: first numeric column is the feature, last numeric column is the target
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] < 2:
        raise ValueError('Not enough numeric columns to run a regression')
    X = numeric.iloc[:, [0]].copy()
    y = numeric.iloc[:, -1].copy()
    print(f'Using fallback columns: {X.columns.tolist()} -> {y.name}')

# Simple preprocessing: drop rows with a missing value in either chosen column.
# X and y are concatenated first so both stay row-aligned after the drop.
data = pd.concat([X, y], axis=1).dropna()

# Fail here, with a clear message, rather than later inside the train/test split:
# the split needs at least one row on each side.
if len(data) < 2:
    raise ValueError(
        f'Only {len(data)} complete row(s) left after dropping missing values; '
        'need at least 2 to run a regression'
    )

# The target is the last column of the concatenated frame; everything before it is the feature
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
print('After dropna, samples =', len(y))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print('Train samples:', len(y_train), 'Test samples:', len(y_test))

# Fit the linear model on the training portion (fit() hands back the estimator itself)
model = LinearRegression().fit(X_train, y_train)
print('Coefficient(s):', model.coef_)
print('Intercept:', model.intercept_)

In [None]:
# Predict on the held-out rows
y_pred = model.predict(X_test)

# Score the predictions: each scorer is called as scorer(actual, predicted), in this order
r2, mae, mse = (
    scorer(y_test, y_pred)
    for scorer in (
        metrics.r2_score,
        metrics.mean_absolute_error,
        metrics.mean_squared_error,
    )
)

print(f'R^2: {r2:.4f}')
print(f'MAE: {mae:.4f}')
print(f'MSE: {mse:.4f}')

In [None]:
# Scatter of actual vs predicted values on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6)

# Dashed red y = x reference line spanning the combined range of both series;
# points on this line are perfect predictions
minv = min(y_test.min(), y_pred.min())
maxv = max(y_test.max(), y_pred.max())
ax.plot([minv, maxv], [minv, maxv], color='red', linestyle='--')

ax.set(xlabel='Actual', ylabel='Predicted', title='Actual vs Predicted')
plt.show()

In [None]:
# Persist the fitted estimator as model.joblib in the current working directory
model_path = os.path.join(os.getcwd(), 'model.joblib')
joblib.dump(value=model, filename=model_path)
print('Saved model to', model_path)

## Notes and next steps
- Install dependencies inside the project's virtual environment: `source .venv/bin/activate` then `pip install -r requirements.txt`.
- You can extend the notebook to try multiple features, pipeline preprocessing, or cross-validation.
- To run programmatically, see tools such as `nbclient`, `papermill`, or `nbconvert` for execution and testing.