
# Exercise 1 — Baseline Model Prediction (House Prices)

This notebook downloads the train/test data, performs light EDA, builds a **baseline Linear Regression** model, evaluates it with **MAE**, and generates **`submission.csv`**.

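**Files in this `module3` folder after running the notebook:**
- `exercise1.ipynb` (this notebook)
- `submission.csv` with columns: `id,SalePrice`
- `module3_exercise_train.csv` and `module3_exercise_test.csv` (downloaded on the first run, then reused from disk)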

**Targets:**
- MAE threshold: **36,000** (lower is better).


In [None]:

# %% [markdown]
# ## 1) Setup & Data Download

import os
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt

# modeling
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# I/O
import requests

# Directory where the downloaded CSVs and the submission file are stored.
DATA_DIR = "."

# Both exercise files are served from the same remote folder.
BASE_URL = "https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise"
TRAIN_URL = f"{BASE_URL}/module3_exercise_train.csv"
TEST_URL = f"{BASE_URL}/module3_exercise_test.csv"

def download_file(url, file_name):
    """Download ``url`` to ``file_name`` unless that file already exists.

    The payload is first written to ``<file_name>.part`` and then moved into
    place with ``os.replace``, so an interrupted write never leaves a
    truncated file that a later run would mistake for a valid cached copy.
    The parent directory of ``file_name`` is created when it is missing.

    Parameters
    ----------
    url : str
        Address fetched with ``requests.get`` (60-second timeout).
    file_name : str
        Destination path on disk; also used as the cache key.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status()`` on a non-success response.
    OSError
        If the destination cannot be created or written.
    """
    if os.path.exists(file_name):
        print(f"Found cached file: {file_name}")
        return

    r = requests.get(url, timeout=60)
    r.raise_for_status()

    # os.makedirs("") raises, so only create a directory when one is named.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_name = f"{file_name}.part"
    try:
        with open(partial_name, "wb") as f:
            f.write(r.content)
        # The destination only appears once it is complete.
        os.replace(partial_name, file_name)
    finally:
        # After a successful replace the partial file is gone; after a
        # failure, remove the leftover so it does not accumulate.
        if os.path.exists(partial_name):
            os.remove(partial_name)

    print(f"Downloaded {file_name} from {url}")

# Local locations of the two CSVs inside DATA_DIR.
train_path, test_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("module3_exercise_train.csv", "module3_exercise_test.csv")
)

# Fetch each file only when it is not already on disk.
download_file(TRAIN_URL, train_path)
download_file(TEST_URL, test_path)

# The `id` column becomes the row index of both frames.
df_train = pd.read_csv(train_path, index_col="id")
df_test = pd.read_csv(test_path, index_col="id")

# Preview the first three training rows.
df_train.head(3)


In [None]:

# %% [markdown]
# ## 2) Quick EDA
# Dimensions of both frames and the columns available for modeling.
print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
print("\nTrain columns:", df_train.columns.tolist())

# Per-column count of missing values, largest first.
na_counts = df_train.isnull().sum().sort_values(ascending=False)
print("\nTop missing values in train:")
print(na_counts.loc[na_counts.gt(0)].head(10))

# Histogram of the target, drawn on an explicit axes object.
fig, ax = plt.subplots()
df_train['SalePrice'].plot.hist(bins=40, ax=ax, title='SalePrice distribution')
ax.set_xlabel('SalePrice')
plt.show()


In [None]:

# %% [markdown]
# ## 3) Data Prep
# We'll keep it simple for a baseline:
# - Use only numeric features (excluding the target)
# - Median impute missing values

target = "SalePrice"

# Split the training frame into the target (as float) and the feature columns.
y = df_train[target].astype(float)
X = df_train.drop(columns=[target])

# Baseline uses numeric features only; non-numeric columns are left out.
num_cols = list(X.select_dtypes(include=[np.number]).columns)

# Some copies of the dataset carry a stray 'col1' column: remove it from the
# features, from the test frame, and from the numeric column list.
if 'col1' in X.columns:
    X = X.drop(columns=['col1'])
    df_test = df_test.drop(columns=['col1'], errors='ignore')
    num_cols = [name for name in num_cols if name != 'col1']

# Numeric views of train and test, restricted to the same columns.
X_num = X.loc[:, num_cols]
X_test_num = df_test.loc[:, num_cols]

# Missing values are filled with the per-column median.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Apply the numeric pipeline to `num_cols`; any other column is dropped.
preprocess = ColumnTransformer(
    [("num", numeric_transformer, num_cols)],
    remainder="drop",
)

X_num.head(3)


In [None]:

# %% [markdown]
# ## 4) Baseline Model & Validation
# We'll use a simple Linear Regression on the numeric features.

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_num,
    y,
    test_size=0.2,
    random_state=42,
)

# Median imputation followed by ordinary least squares.
model = Pipeline([
    ("prep", preprocess),
    ("lr", LinearRegression()),
])

# Fit on the training split, then score the held-out rows with MAE.
preds = model.fit(X_train, y_train).predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
print(f"Validation MAE: {mae:,.0f}")


In [None]:

# %% [markdown]
# ## 5) Fit on Full Train & Predict Test, Save the file

# Refit the pipeline on every training row before predicting the test set.
model.fit(X_num, y)
test_preds = model.predict(X_test_num)

# One row per test id; negative predictions are raised to zero.
submission = pd.DataFrame({"id": df_test.index, "SalePrice": test_preds})
submission["SalePrice"] = submission["SalePrice"].clip(lower=0)

# Write the submission next to the data files, without the positional index.
submission_path = os.path.join(DATA_DIR, "submission.csv")
submission.to_csv(submission_path, index=False)
print(f"Saved: {submission_path}")
submission.head()
