# ML100 Exercises: Data Splitting & Feature Fundamentals

These exercises cover core data preparation skills: train/test splitting, cross-validation,
data leakage detection, feature engineering, and preprocessing pipelines.

**Difficulty increases with each exercise.**

In [None]:
# ============================================================
# Setup: Run this cell first
# ============================================================
# Imports the libraries used across the exercises and fixes the random seeds.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import (
    train_test_split, cross_val_score, KFold, StratifiedKFold
)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris, make_classification

# Seed NumPy's global random generator so np.random.* draws are repeatable.
np.random.seed(42)
# Shared seed value for scikit-learn `random_state` arguments.
# NOTE(review): the exercise cells in this notebook pass the literal 42 instead
# of this constant -- consider using RANDOM_STATE there so the seed lives in
# one place.
RANDOM_STATE = 42

print("Setup complete.")

---
## Exercise 1: Stratified Train/Test Split

**Goal:** Split a dataset using `train_test_split` with stratification and verify that
the class proportions in the train and test sets match the original dataset.

**Tasks:**
1. Load the Iris dataset.
2. Perform an 80/20 stratified split (`random_state=42`).
3. Print the class proportions (as fractions) in the original, train, and test sets.
4. Assert that all three sets have approximately the same proportions.

In [None]:
# Exercise 1 - Starter Code
# Loads Iris into X (features) and y (class labels); complete the TODOs below.
# This cell uses `np`, which is imported by the Setup cell -- run that first.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target

# TODO 1: Perform an 80/20 stratified split (random_state=42)
# X_train, X_test, y_train, y_test = ...

# TODO 2: Compute class proportions for original, train, and test sets
# Hint: use np.bincount(y) / len(y) for proportions
# original_proportions = ...
# train_proportions = ...
# test_proportions = ...

# TODO 3: Print all three proportion arrays

# TODO 4: Assert that BOTH the train and the test proportions are close to the
# original proportions (Task 4 asks for all three sets to agree).
# decimal=2 requires |difference| < 0.015; decimal=1 would accept differences
# up to 0.15, which is too loose to show that stratification worked.
# np.testing.assert_array_almost_equal(..., ..., decimal=2)  # train vs original
# np.testing.assert_array_almost_equal(..., ..., decimal=2)  # test vs original

### Solution

<details><summary>Click to reveal solution</summary>

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the Iris feature matrix (X) and integer class labels (y).
iris = load_iris()
X, y = iris.data, iris.target

# 1. Stratified 80/20 split
# stratify=y asks train_test_split to keep the class mix of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 2. Compute class proportions
# np.bincount counts occurrences of each integer label; dividing by the number
# of samples turns the counts into fractions.
original_proportions = np.bincount(y) / len(y)
train_proportions = np.bincount(y_train) / len(y_train)
test_proportions = np.bincount(y_test) / len(y_test)

# 3. Print proportions
print(f"Original proportions: {original_proportions}")
print(f"Train proportions:    {train_proportions}")
print(f"Test proportions:     {test_proportions}")

# 4. Assert approximately equal
# decimal=2 passes only when every |difference| is below 1.5e-2.
# decimal=1 would tolerate differences up to 0.15, which an unstratified split
# would usually satisfy as well, so it would not actually verify stratification.
np.testing.assert_array_almost_equal(
    train_proportions, original_proportions, decimal=2
)
np.testing.assert_array_almost_equal(
    test_proportions, original_proportions, decimal=2
)
print("\nAll proportions match!")
```

</details>

---
## Exercise 2: 5-Fold Cross-Validation on Iris

**Goal:** Implement 5-fold cross-validation on the Iris dataset using
`LogisticRegression` and report the mean and standard deviation of accuracy.

**Tasks:**
1. Load the Iris dataset.
2. Create a `LogisticRegression` model (`max_iter=200, random_state=42`).
3. Run 5-fold cross-validation using `cross_val_score`.
4. Print mean accuracy +/- standard deviation.

In [None]:
# Exercise 2 - Starter Code
# Loads Iris into X (features) and y (class labels); complete the TODOs below.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

iris = load_iris()
X, y = iris.data, iris.target

# TODO 1: Create LogisticRegression model (max_iter=200, random_state=42)
# model = ...

# TODO 2: Run 5-fold cross-validation
# The three placeholders are, in order: the model, the features, the labels.
# scores = cross_val_score(..., ..., ..., cv=5, scoring='accuracy')

# TODO 3: Print individual fold scores
# print(f"Fold scores: {scores}")

# TODO 4: Print mean +/- std
# print(f"Accuracy: {scores.mean():.4f} +/- {scores.std():.4f}")

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Load the Iris feature matrix and class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Step 1: the classifier to evaluate.
model = LogisticRegression(max_iter=200, random_state=42)

# Step 2: accuracy from 5-fold cross-validation.
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Step 3: show the individual fold scores.
print(f"Fold scores: {scores}")

# Step 4: summarise the folds as mean +/- standard deviation.
mean_accuracy = scores.mean()
std_accuracy = scores.std()
print(f"Accuracy: {mean_accuracy:.4f} +/- {std_accuracy:.4f}")
```

</details>

---
## Exercise 3: Identify and Fix Data Leakage

**Goal:** The code below contains a common data leakage mistake. Identify the
problem, explain why it causes leakage, and rewrite the code correctly.

**Tasks:**
1. Read the "bad" code below.
2. In a comment, explain what the leakage is.
3. Rewrite the code so that no leakage occurs.
4. Compare the scores from the leaky and correct approaches.

In [None]:
# Exercise 3 - Starter Code
# ---- BAD CODE (contains data leakage) ----
# The statement order below is deliberately wrong: it is the subject of the
# exercise, so leave the leaky section as-is and write the fix under the TODOs.

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# NOTE(review): cross_val_score is imported here but not used in this cell.
from sklearn.model_selection import train_test_split, cross_val_score

# Synthetic classification data: 1000 samples, 20 features (10 informative).
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=10,
    random_state=42
)

# --- LEAKY approach ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # <-- Scaling on ALL data before splitting

# The split happens only AFTER the scaler has already been fit on every row.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)
leaky_model = LogisticRegression(random_state=42, max_iter=200)
leaky_model.fit(X_train, y_train)
leaky_score = leaky_model.score(X_test, y_test)
print(f"Leaky test accuracy: {leaky_score:.4f}")

# TODO 1: Explain in a comment why the code above has data leakage.
#
# Your explanation: ...
#

# TODO 2: Rewrite the code correctly (split FIRST, then scale)
# Split the unscaled X (not X_scaled) so the scaler never sees test rows.
# X_train_raw, X_test_raw, y_train, y_test = ...
# scaler_correct = StandardScaler()
# X_train_correct = scaler_correct.fit_transform(...)  # fit on train only
# X_test_correct = scaler_correct.transform(...)        # transform test only
# correct_model = ...
# correct_score = ...

# TODO 3: Print both scores and compare
# print(f"Correct test accuracy: {correct_score:.4f}")

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Synthetic classification data shared by both approaches.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    random_state=42,
)

# --- LEAKY approach (for comparison) ---
# The scaler is fit on every row, so the statistics it learns include rows
# that end up in the test split below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train_leak, X_test_leak, y_train_leak, y_test_leak = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)
leaky_model = LogisticRegression(random_state=42, max_iter=200).fit(
    X_train_leak, y_train_leak
)
leaky_score = leaky_model.score(X_test_leak, y_test_leak)
print(f"Leaky test accuracy: {leaky_score:.4f}")

# Explanation: StandardScaler was fit on the ENTIRE dataset (train + test)
# before the split. The mean and standard deviation used for scaling therefore
# carry information from the test set, which must stay unseen during
# training. That is data leakage.

# --- CORRECT approach ---
# Split the raw features first ...
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ... then learn the scaling statistics from the training rows only and reuse
# them, unchanged, on the test rows.
scaler_correct = StandardScaler()
X_train_correct = scaler_correct.fit_transform(X_train_raw)
X_test_correct = scaler_correct.transform(X_test_raw)

correct_model = LogisticRegression(random_state=42, max_iter=200).fit(
    X_train_correct, y_train
)
correct_score = correct_model.score(X_test_correct, y_test)
print(f"Correct test accuracy: {correct_score:.4f}")

# Absolute gap between the two test accuracies.
score_gap = abs(leaky_score - correct_score)
print(f"\nDifference: {score_gap:.4f}")
```

</details>

---
## Exercise 4: Feature Engineering from Dates and Numerics

**Goal:** Given a synthetic dataset with a date column and numeric columns,
engineer at least 3 new meaningful features.

**Tasks:**
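1. From the `date` column, extract: year, month, and day-of-week. (Note: all 200 generated dates fall in 2020, so `year` is constant in this dataset; it is extracted for practice only and would carry no signal for a model.)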
2. Create an interaction feature: `feature_A * feature_B`.
3. Create a ratio feature: `feature_A / (feature_B + 1)`.
4. Create a binned version of `feature_A` (e.g., low / medium / high).
5. Print the resulting DataFrame showing all new columns.

In [None]:
# Exercise 4 - Starter Code

import pandas as pd
import numpy as np

np.random.seed(42)

# Build the synthetic dataset column by column.
# The random draws stay in the order feature_A -> feature_B -> category so the
# seeded values are reproducible.
n = 200
dates = pd.date_range('2020-01-01', periods=n, freq='D')
feature_a_values = np.random.normal(50, 15, n)
feature_b_values = np.random.exponential(10, n)
category_values = np.random.choice(['Electronics', 'Clothing', 'Food'], n)

df = pd.DataFrame({
    'date': dates,
    'feature_A': feature_a_values,
    'feature_B': feature_b_values,
    'category': category_values,
})

# Quick look at the raw data before any feature engineering.
print("Original DataFrame:")
print(df.head())
print(f"Shape: {df.shape}")

# TODO 1: Extract year, month, and day-of-week from 'date'
# df['year'] = ...
# df['month'] = ...
# df['day_of_week'] = ...

# TODO 2: Create interaction feature
# df['A_times_B'] = ...

# TODO 3: Create ratio feature (add 1 to denominator to avoid division by zero)
# df['A_over_B'] = ...

# TODO 4: Create binned version of feature_A (low / medium / high)
# Hint: use pd.cut or pd.qcut
# df['A_binned'] = ...

# TODO 5: Print the DataFrame with new features
# print(df.head(10))
# print(f"New shape: {df.shape}")

### Solution

<details><summary>Click to reveal solution</summary>

```python
import pandas as pd
import numpy as np

np.random.seed(42)

# Build the synthetic dataset column by column.
# The random draws stay in the order feature_A -> feature_B -> category so the
# seeded values are reproducible.
n = 200
dates = pd.date_range('2020-01-01', periods=n, freq='D')
feature_a_values = np.random.normal(50, 15, n)
feature_b_values = np.random.exponential(10, n)
category_values = np.random.choice(['Electronics', 'Clothing', 'Food'], n)

df = pd.DataFrame({
    'date': dates,
    'feature_A': feature_a_values,
    'feature_B': feature_b_values,
    'category': category_values,
})

# Remember the starting columns so the new ones can be listed at the end.
base_columns = list(df.columns)

# 1. Calendar features taken from the datetime accessor.
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['day_of_week'] = date_parts.dayofweek  # 0=Monday, 6=Sunday

col_a = df['feature_A']
col_b = df['feature_B']

# 2. Interaction: element-wise product of the two numeric columns.
df['A_times_B'] = col_a * col_b

# 3. Ratio: the +1 keeps the denominator away from zero.
df['A_over_B'] = col_a / (col_b + 1)

# 4. Quantile bins of feature_A, one bin per label.
bin_labels = ['low', 'medium', 'high']
df['A_binned'] = pd.qcut(col_a, q=3, labels=bin_labels)

# 5. Show the result and list the columns that were added.
print("DataFrame with engineered features:")
print(df.head(10))
print(f"\nNew shape: {df.shape}")
new_columns = [name for name in df.columns if name not in base_columns]
print(f"\nNew columns: {new_columns}")
```

</details>

---
## Exercise 5: ColumnTransformer Pipeline

**Goal:** Build a `ColumnTransformer` pipeline that applies `StandardScaler` to
numeric features and `OneHotEncoder` to categorical features, then train a
`LogisticRegression` through the pipeline.

**Tasks:**
1. Create a DataFrame with numeric and categorical columns (the starter code below already does this for you).
2. Define numeric and categorical column lists.
3. Build a `ColumnTransformer` with appropriate transformers.
4. Create a `Pipeline` with the preprocessor and `LogisticRegression`.
5. Evaluate with 5-fold cross-validation and print results.

In [None]:
# Exercise 5 - Starter Code
# Builds a synthetic mixed-type dataset (X, y); complete the TODOs below to
# preprocess it with a ColumnTransformer and evaluate a LogisticRegression.

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

np.random.seed(42)

# Create a synthetic mixed-type dataset
# Three numeric columns (age, income, hours_per_week) and two string-valued
# columns (education, occupation).
n = 500
df = pd.DataFrame({
    'age': np.random.randint(18, 70, n),
    'income': np.random.normal(50000, 15000, n),
    'hours_per_week': np.random.randint(10, 60, n),
    'education': np.random.choice(['High School', 'Bachelors', 'Masters', 'PhD'], n),
    'occupation': np.random.choice(['Tech', 'Healthcare', 'Finance', 'Education', 'Other'], n),
})
# Binary target
# 1 when income + 100 * age + Gaussian noise (std 5000) exceeds 55000, else 0.
df['target'] = (df['income'] + df['age'] * 100 + np.random.normal(0, 5000, n) > 55000).astype(int)

# Features are every column except the label.
X = df.drop('target', axis=1)
y = df['target']

print("Features:")
print(X.head())
print(f"\nTarget distribution: {y.value_counts().to_dict()}")

# TODO 1: Define numeric and categorical columns
# numeric_cols = [...]
# categorical_cols = [...]

# TODO 2: Build ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numeric_cols),
#         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
#     ]
# )

# TODO 3: Create Pipeline with preprocessor + LogisticRegression
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(random_state=42, max_iter=200))
# ])

# TODO 4: Evaluate with 5-fold cross-validation
# Pass the whole pipeline (not a pre-transformed X) to cross_val_score.
# scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

# TODO 5: Print results
# print(f"CV Accuracy: {scores.mean():.4f} +/- {scores.std():.4f}")

### Solution

<details><summary>Click to reveal solution</summary>

```python
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

np.random.seed(42)

# Synthetic mixed-type dataset. The random draws are kept in their original
# order (age, income, hours_per_week, education, occupation, then the target
# noise) so the seeded data is unchanged.
n = 500
education_levels = ['High School', 'Bachelors', 'Masters', 'PhD']
occupation_names = ['Tech', 'Healthcare', 'Finance', 'Education', 'Other']
df = pd.DataFrame({
    'age': np.random.randint(18, 70, n),
    'income': np.random.normal(50000, 15000, n),
    'hours_per_week': np.random.randint(10, 60, n),
    'education': np.random.choice(education_levels, n),
    'occupation': np.random.choice(occupation_names, n),
})

# Binary target: 1 when income + 100 * age + noise exceeds 55000, else 0.
target_noise = np.random.normal(0, 5000, n)
latent_score = df['income'] + df['age'] * 100 + target_noise
df['target'] = (latent_score > 55000).astype(int)

# Features are every column except the label.
X = df.drop(columns='target')
y = df['target']

# 1. Column groups, by type.
numeric_cols = ['age', 'income', 'hours_per_week']
categorical_cols = ['education', 'occupation']

# 2. One transformer per column group.
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_cols,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step]
)

# 3. Preprocessing and classifier chained into a single estimator.
classifier = LogisticRegression(random_state=42, max_iter=200)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# 4. 5-fold cross-validated accuracy of the whole pipeline.
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

# 5. Report the individual folds and their mean +/- standard deviation.
print(f"Fold scores: {scores}")
mean_accuracy = scores.mean()
std_accuracy = scores.std()
print(f"CV Accuracy: {mean_accuracy:.4f} +/- {std_accuracy:.4f}")
```

</details>