In [1]:
# Exercise 2: Cross-validation (k-fold)

In [2]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [3]:
# --- Data -----------------------------------------------------------------
# Fetch the California housing dataset and pull out the feature matrix and
# the regression target.
housing = fetch_california_housing()
X = housing['data']
y = housing['target']

# --- Hold-out split -------------------------------------------------------
# Keep 10% of the rows aside as a test set. The rows are shuffled before the
# split, and the fixed random_state makes that shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=43,
)

# --- Model pipeline -------------------------------------------------------
# Steps run in the order listed: median imputation, then standardisation,
# then an ordinary linear regression as the final estimator.
pipeline = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
]
pipe = Pipeline(steps=pipeline)

In [5]:
# Run 10-fold cross-validation of the pipeline on the training split only.
# Scores are R² on each held-out fold; training-fold scores are not computed.
cv_results = cross_validate(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    return_train_score=False,
)

# Display the raw results dictionary as the cell output.
cv_results

{'fit_time': array([0.0345912 , 0.0195353 , 0.01637435, 0.01627493, 0.01626635,
        0.01577806, 0.01615214, 0.01580238, 0.01876068, 0.01574898]),
 'score_time': array([0.00219274, 0.00132418, 0.0015204 , 0.00138402, 0.00145698,
        0.00126791, 0.00131583, 0.00130939, 0.00179124, 0.00128961]),
 'test_score': array([0.62433594, 0.61648956, 0.62486602, 0.59891024, 0.59284295,
        0.61307055, 0.54630341, 0.60742976, 0.60014575, 0.59574508])}

In [8]:
# Per-fold validation scores produced by cross_validate.
scores = cv_results['test_score']

# Summary statistics across the folds (np.std defaults to the population
# standard deviation, ddof=0).
mean_score = scores.mean()
std_score = np.std(scores)

print("Scores on validation sets:\n", scores)
print("\nMean of scores on validation sets:\n", mean_score)
print("\nStandard deviation of scores on validation sets:\n", std_score)


Scores on validation sets:
 [0.62433594 0.61648956 0.62486602 0.59891024 0.59284295 0.61307055
 0.54630341 0.60742976 0.60014575 0.59574508]

Mean of scores on validation sets:
 0.6020139252674299

Standard deviation of scores on validation sets:
 0.02149838227734666


In [9]:
#  Explanation

# cross_validate() splits the data into k folds (here 10) automatically.
# Each fold acts as a validation set once while the remaining 9 folds form the training set.

# The function returns a dictionary containing:

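## 'fit_time': time taken to fit the pipeline on the training folds, one value per split
## 'score_time': time taken to score the fitted pipeline on the held-out fold, one value per split
## 'test_score': score on the held-out (validation) fold, one value per split (R² here, set explicitly via scoring='r2')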

# We use only the training set (X_train) for cross-validation.
# The test set (X_test) remains untouched for final evaluation after model selection.

#  Key Learnings

## Cross-validation does not reduce overfitting by itself; it gives a more reliable estimate of generalization performance than a single split, which makes overfitting easier to detect.
## Mean score shows the model’s average performance.
## Standard deviation indicates model stability — smaller values mean consistent results.
## The key 'test_score' in results refers to the validation folds, not the final test set.

# This process ensures that the model’s quality is not due to luck or a favorable split.