# ML300 Exercises: Logistic Regression & Classification

These exercises cover classification fundamentals: logistic regression, evaluation metrics,
ROC/PR curves, threshold optimization, imbalanced data handling, and probability calibration.

**Difficulty increases with each exercise.**

In [None]:
# ============================================================
# Setup: Run this cell first
# ============================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_score, recall_score, f1_score,
    roc_curve, roc_auc_score,
    precision_recall_curve, average_precision_score,
    ConfusionMatrixDisplay
)
from sklearn.datasets import load_breast_cancer, make_classification
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Seed value for this setup cell; the exercise cells below pass the same
# value (42) as `random_state`.
RANDOM_STATE = 42

# Seed NumPy's global RNG from the constant so the value is defined in one place.
np.random.seed(RANDOM_STATE)

print("Setup complete.")

---
## Exercise 1: Logistic Regression on Breast Cancer Dataset

**Goal:** Train a logistic regression classifier on the breast cancer dataset and
compute key classification metrics.

**Tasks:**
1. Load the breast cancer dataset and split 80/20.
2. Train a `LogisticRegression` model (`max_iter=10000, random_state=42`).
3. Generate predictions and compute the confusion matrix.
4. Compute precision, recall, and F1 score.
5. Display the confusion matrix and print the classification report.

In [None]:
# Exercise 1 - Starter Code

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_score, recall_score, f1_score,
    ConfusionMatrixDisplay
)

# Load the dataset bundle. `data` is kept because the TODO 5 scaffold reads
# `data.target_names` for the classification report.
data = load_breast_cancer()
X = data.data
y = data.target

# TODO 1: Split 80/20
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )

# TODO 2: Train LogisticRegression
# model = LogisticRegression(max_iter=10000, random_state=42)
# model.fit(X_train, y_train)

# TODO 3: Predict and compute confusion matrix
# y_pred = model.predict(X_test)
# cm = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix:")
# print(cm)

# TODO 4: Compute precision, recall, F1
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# print(f"\nPrecision: {precision:.4f}")
# print(f"Recall:    {recall:.4f}")
# print(f"F1 Score:  {f1:.4f}")

# TODO 5: Display confusion matrix and classification report
# ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
# plt.title('Confusion Matrix')
# plt.show()
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred, target_names=data.target_names))

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_score, recall_score, f1_score,
    ConfusionMatrixDisplay
)

# Load the dataset; `data` is kept because the report below needs its class names.
data = load_breast_cancer()
X = data.data
y = data.target

# 1. Hold out 20% of the samples for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Fit the classifier; `fit` returns the estimator, so the call can be chained.
model = LogisticRegression(max_iter=10000, random_state=42).fit(X_train, y_train)

# 3. Hard class predictions on the held-out set and the resulting confusion matrix.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

# 4. Precision, recall and F1, all computed with the scorers' default settings.
# NOTE(review): the defaults score label 1 as the positive class; check
# `data.target_names` to confirm which diagnosis that is before interpreting.
precision, recall, f1 = (
    scorer(y_test, y_pred) for scorer in (precision_score, recall_score, f1_score)
)
print(
    f"\nPrecision: {precision:.4f}"
    f"\nRecall:    {recall:.4f}"
    f"\nF1 Score:  {f1:.4f}"
)

# 5. Plot the confusion matrix, then print the per-class report.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.title('Confusion Matrix')
plt.show()

report = classification_report(y_test, y_pred, target_names=data.target_names)
print("\nClassification Report:", report, sep="\n")
```

</details>

---
## Exercise 2: ROC and Precision-Recall Curves

**Goal:** Plot ROC and Precision-Recall curves for the logistic regression model
trained on the breast cancer dataset, and report AUC scores.

**Tasks:**
1. Train a logistic regression model (reuse from Exercise 1 or retrain).
2. Get probability predictions with `predict_proba`.
3. Plot the ROC curve with the AUC score in the legend.
4. Plot the Precision-Recall curve with the average precision in the legend.
5. Display both plots side by side.

In [None]:
# Exercise 2 - Starter Code

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_curve, roc_auc_score,
    precision_recall_curve, average_precision_score
)

# Same data, 80/20 split and seed as Exercise 1, so the curves describe the
# same classifier.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `fit` returns the estimator, so construction and training are chained.
model = LogisticRegression(max_iter=10000, random_state=42).fit(X_train, y_train)

# TODO 1: Get probability predictions for the positive class
# y_prob = model.predict_proba(X_test)[:, 1]

# TODO 2: Compute ROC curve and AUC
# fpr, tpr, roc_thresholds = roc_curve(y_test, y_prob)
# roc_auc = roc_auc_score(y_test, y_prob)

# TODO 3: Compute Precision-Recall curve and average precision
# precision_vals, recall_vals, pr_thresholds = precision_recall_curve(y_test, y_prob)
# avg_precision = average_precision_score(y_test, y_prob)

# TODO 4: Plot both curves side by side
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))
#
# # ROC curve
# axes[0].plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.4f})')
# axes[0].plot([0, 1], [0, 1], 'k--', label='Random')
# axes[0].set_xlabel('False Positive Rate')
# axes[0].set_ylabel('True Positive Rate')
# axes[0].set_title('ROC Curve')
# axes[0].legend()
#
# # PR curve
# axes[1].plot(recall_vals, precision_vals, label=f'PR (AP = {avg_precision:.4f})')
# axes[1].set_xlabel('Recall')
# axes[1].set_ylabel('Precision')
# axes[1].set_title('Precision-Recall Curve')
# axes[1].legend()
#
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_curve, roc_auc_score,
    precision_recall_curve, average_precision_score
)

# Load the data and hold out 20% of it (fixed seed) for evaluation.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=10000, random_state=42).fit(X_train, y_train)

# 1. Scores for the curves: the second `predict_proba` column (class 1).
y_prob = model.predict_proba(X_test)[:, 1]

# 2. ROC curve points and the area under that curve.
roc_auc = roc_auc_score(y_test, y_prob)
fpr, tpr, roc_thresholds = roc_curve(y_test, y_prob)

# 3. Precision-recall curve points and the average precision summary.
avg_precision = average_precision_score(y_test, y_prob)
precision_vals, recall_vals, pr_thresholds = precision_recall_curve(y_test, y_prob)

# 4. One figure, two panels: ROC on the left, precision-recall on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
roc_ax, pr_ax = axes

roc_ax.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.4f})')
# Diagonal reference line, labelled 'Random' in the legend.
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()

pr_ax.plot(recall_vals, precision_vals, label=f'PR (AP = {avg_precision:.4f})')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend()

plt.tight_layout()
plt.show()

# Repeat the two summary numbers as text below the figure.
for metric_name, metric_value in (("ROC AUC", roc_auc), ("Average Precision", avg_precision)):
    print(f"{metric_name}: {metric_value:.4f}")
```

</details>

---
## Exercise 3: Optimal Threshold for F1 Score

**Goal:** Find the decision threshold that maximizes the F1 score rather than
using the default 0.5 threshold.

**Tasks:**
1. Train a logistic regression model on the breast cancer dataset.
2. Compute precision and recall at various thresholds using `precision_recall_curve`.
3. Compute F1 score at each threshold.
4. Find the threshold that maximizes F1.
5. Compare the F1 score at the default threshold (0.5) vs. the optimal threshold.
6. Plot the F1 score against the decision threshold, marking the default and optimal thresholds.

In [None]:
# Exercise 3 - Starter Code

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score

# Load the data and hold out 20% of it (fixed seed) for evaluation.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the classifier, then keep the class-1 probability of each test sample;
# the threshold search in the TODOs below works on these scores.
model = LogisticRegression(max_iter=10000, random_state=42).fit(X_train, y_train)
test_probabilities = model.predict_proba(X_test)
y_prob = test_probabilities[:, 1]

# TODO 1: Compute precision, recall at various thresholds
# precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)

# TODO 2: Compute F1 at each threshold
# Note: precisions and recalls have one more element than thresholds
# f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-10)

# TODO 3: Find optimal threshold
# best_idx = np.argmax(f1_scores)
# best_threshold = thresholds[best_idx]
# best_f1 = f1_scores[best_idx]

# TODO 4: Compare with default threshold
# default_f1 = f1_score(y_test, (y_prob >= 0.5).astype(int))
# optimal_f1 = f1_score(y_test, (y_prob >= best_threshold).astype(int))
# print(f"Default threshold (0.5): F1 = {default_f1:.4f}")
# print(f"Optimal threshold ({best_threshold:.4f}): F1 = {optimal_f1:.4f}")

# TODO 5: Plot F1 vs threshold
# plt.figure(figsize=(8, 5))
# plt.plot(thresholds, f1_scores)
# plt.axvline(x=best_threshold, color='r', linestyle='--', label=f'Optimal: {best_threshold:.4f}')
# plt.axvline(x=0.5, color='g', linestyle='--', label='Default: 0.5')
# plt.xlabel('Threshold')
# plt.ylabel('F1 Score')
# plt.title('F1 Score vs Decision Threshold')
# plt.legend()
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score

# Load the data and hold out 20% of it (fixed seed) for evaluation.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the classifier and score the held-out samples with the class-1 probability.
model = LogisticRegression(max_iter=10000, random_state=42).fit(X_train, y_train)
y_prob = model.predict_proba(X_test)[:, 1]

# 1. Precision and recall at every candidate threshold.
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)

# 2. F1 per threshold. The last precision/recall entry is dropped so the arrays
#    line up with `thresholds`; the small epsilon guards against a 0/0 division.
usable_precisions = precisions[:-1]
usable_recalls = recalls[:-1]
f1_scores = 2 * (usable_precisions * usable_recalls) / (usable_precisions + usable_recalls + 1e-10)

# 3. Threshold with the highest F1.
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]


# 4. Compare the default cut-off with the selected one.
# NOTE: the threshold is both chosen and scored on y_test, so `optimal_f1` is an
# optimistic estimate; selecting the threshold on a separate validation split
# would give a fairer comparison.
def _f1_at(threshold):
    """Return the test-set F1 when class 1 is predicted for y_prob >= threshold."""
    return f1_score(y_test, (y_prob >= threshold).astype(int))


default_f1 = _f1_at(0.5)
optimal_f1 = _f1_at(best_threshold)
print(f"Default threshold (0.5):            F1 = {default_f1:.4f}")
print(f"Optimal threshold ({best_threshold:.4f}): F1 = {optimal_f1:.4f}")

# 5. F1 as a function of the threshold, with both cut-offs marked.
plt.figure(figsize=(8, 5))
plt.plot(thresholds, f1_scores)
for cutoff, colour, caption in (
    (best_threshold, 'r', f'Optimal: {best_threshold:.4f}'),
    (0.5, 'g', 'Default: 0.5'),
):
    plt.axvline(x=cutoff, color=colour, linestyle='--', label=caption)
plt.xlabel('Threshold')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Decision Threshold')
plt.legend()
plt.tight_layout()
plt.show()
```

</details>

---
## Exercise 4: Handling Imbalanced Data with class_weight

**Goal:** Train on an imbalanced dataset with and without `class_weight='balanced'`
and compare the results.

**Tasks:**
1. Create an imbalanced dataset using `make_classification(weights=[0.9, 0.1])`.
2. Train logistic regression WITHOUT `class_weight` (default).
3. Train logistic regression WITH `class_weight='balanced'`.
4. Compare confusion matrices and classification reports.
5. Discuss the effect of class weighting.

In [None]:
# Exercise 4 - Starter Code

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Synthetic binary problem; `weights` requests roughly 90% class 0 and 10% class 1.
dataset_options = dict(
    n_samples=2000,
    n_features=20,
    n_informative=10,
    weights=[0.9, 0.1],
    flip_y=0.01,
    random_state=42,
)
X, y = make_classification(**dataset_options)

# Report the realised class balance as counts and as percentages.
class_counts = np.bincount(y)
print(f"Class distribution: {class_counts}")
share_class_0 = np.mean(y == 0)
share_class_1 = np.mean(y == 1)
print(f"Class 0: {share_class_0:.1%}, Class 1: {share_class_1:.1%}")

# Stratify on the labels so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# TODO 1: Train WITHOUT class_weight
# model_default = LogisticRegression(max_iter=1000, random_state=42)
# model_default.fit(X_train, y_train)
# y_pred_default = model_default.predict(X_test)

# TODO 2: Train WITH class_weight='balanced'
# model_balanced = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
# model_balanced.fit(X_train, y_train)
# y_pred_balanced = model_balanced.predict(X_test)

# TODO 3: Compare confusion matrices side by side
# fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# ConfusionMatrixDisplay.from_predictions(y_test, y_pred_default, ax=axes[0])
# axes[0].set_title('Default (No Weighting)')
# ConfusionMatrixDisplay.from_predictions(y_test, y_pred_balanced, ax=axes[1])
# axes[1].set_title('Balanced Weighting')
# plt.tight_layout()
# plt.show()

# TODO 4: Print classification reports for both
# print("=== Default ===")
# print(classification_report(y_test, y_pred_default))
# print("=== Balanced ===")
# print(classification_report(y_test, y_pred_balanced))

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Synthetic binary problem; `weights` requests roughly 90% class 0 and 10% class 1.
X, y = make_classification(
    n_samples=2000,
    n_features=20,
    n_informative=10,
    weights=[0.9, 0.1],
    flip_y=0.01,
    random_state=42,
)
class_counts = np.bincount(y)
print(f"Class distribution: {class_counts}")
print(f"Class 0: {np.mean(y == 0):.1%}, Class 1: {np.mean(y == 1):.1%}")

# Stratify on the labels so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 1. Baseline: every sample carries the same weight.
model_default = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)

# 2. Same model, but with class_weight='balanced'.
model_balanced = LogisticRegression(
    max_iter=1000, random_state=42, class_weight='balanced'
).fit(X_train, y_train)
y_pred_balanced = model_balanced.predict(X_test)

# The two variants are paired with their headings once and reused for both the
# plots and the text reports.
variants = (
    ('Default (No Weighting)', y_pred_default),
    ('Balanced Weighting', y_pred_balanced),
)

# 3. Confusion matrices, one panel per variant.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for panel, (heading, predictions) in zip(axes, variants):
    ConfusionMatrixDisplay.from_predictions(y_test, predictions, ax=panel)
    panel.set_title(heading)
plt.tight_layout()
plt.show()

# 4. Per-class precision/recall/F1 for each variant.
for heading, predictions in variants:
    print(f"=== {heading} ===")
    print(classification_report(y_test, predictions))

# Discussion:
# - With no class weighting the fit is dominated by the majority class (0), so
#   the model tends to predict 0 more often and recall for the minority
#   class (1) is typically low.
# - class_weight='balanced' makes errors on the minority class cost more, which
#   usually raises recall for class 1 and lowers its precision somewhat.
```

</details>

---
## Exercise 5: Probability Calibration

**Goal:** Apply `CalibratedClassifierCV` to a logistic regression model and
compare calibration curves before and after calibration.

**Tasks:**
1. Train a logistic regression model on the breast cancer dataset.
2. Compute the calibration curve for the uncalibrated model.
3. Apply `CalibratedClassifierCV` with `method='sigmoid'` and `cv=5`.
4. Compute the calibration curve for the calibrated model.
5. Plot both calibration curves on the same axes.

In [None]:
# Exercise 5 - Starter Code

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# Load the dataset and hold out 30% of it (fixed seed) as the test set.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# TODO 1: Train uncalibrated model
# model = LogisticRegression(max_iter=10000, random_state=42)
# model.fit(X_train, y_train)

# TODO 2: Get calibration curve for uncalibrated model
# y_prob_uncalib = model.predict_proba(X_test)[:, 1]
# fraction_pos_uncalib, mean_pred_uncalib = calibration_curve(
#     y_test, y_prob_uncalib, n_bins=10
# )

# TODO 3: Apply CalibratedClassifierCV
# calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv=5)
# calibrated_model.fit(X_train, y_train)

# TODO 4: Get calibration curve for calibrated model
# y_prob_calib = calibrated_model.predict_proba(X_test)[:, 1]
# fraction_pos_calib, mean_pred_calib = calibration_curve(
#     y_test, y_prob_calib, n_bins=10
# )

# TODO 5: Plot both calibration curves
# plt.figure(figsize=(8, 6))
# plt.plot(mean_pred_uncalib, fraction_pos_uncalib, 's-', label='Uncalibrated')
# plt.plot(mean_pred_calib, fraction_pos_calib, 's-', label='Calibrated (sigmoid)')
# plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
# plt.xlabel('Mean Predicted Probability')
# plt.ylabel('Fraction of Positives')
# plt.title('Calibration Curves')
# plt.legend()
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# Load the dataset and hold out 30% of it (fixed seed) as the test set.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 1. Plain logistic regression, used as the uncalibrated reference.
model = LogisticRegression(max_iter=10000, random_state=42).fit(X_train, y_train)

# 2. Reliability data for the uncalibrated model, using 10 probability bins.
y_prob_uncalib = model.predict_proba(X_test)[:, 1]
fraction_pos_uncalib, mean_pred_uncalib = calibration_curve(y_test, y_prob_uncalib, n_bins=10)

# 3. Sigmoid calibration with 5-fold cross-validation on the training data.
# NOTE(review): with cv=5 the wrapper is expected to refit copies of `model` on
# the folds rather than reuse the fit above - confirm against the sklearn docs.
calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv=5).fit(X_train, y_train)

# 4. Reliability data for the calibrated model, with the same binning.
y_prob_calib = calibrated_model.predict_proba(X_test)[:, 1]
fraction_pos_calib, mean_pred_calib = calibration_curve(y_test, y_prob_calib, n_bins=10)

# 5. Both curves on one set of axes, plus the diagonal reference line.
plt.figure(figsize=(8, 6))
for mean_pred, fraction_pos, caption in (
    (mean_pred_uncalib, fraction_pos_uncalib, 'Uncalibrated'),
    (mean_pred_calib, fraction_pos_calib, 'Calibrated (sigmoid)'),
):
    plt.plot(mean_pred, fraction_pos, 's-', label=caption)
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.title('Calibration Curves')
plt.legend()
plt.tight_layout()
plt.show()

# Logistic regression tends to be fairly well calibrated already, so the two
# curves may look similar; calibration usually matters more for models such
# as SVMs or Naive Bayes.
print(f"Uncalibrated accuracy: {model.score(X_test, y_test):.4f}")
print(f"Calibrated accuracy:   {calibrated_model.score(X_test, y_test):.4f}")
```

</details>