# Lab 1

## Datasets

For this lab, we will use the following three datasets:

- Iris plants dataset `sklearn.datasets.load_iris`
- Optical recognition of handwritten digits dataset `sklearn.datasets.load_digits`
- Breast cancer wisconsin (diagnostic) dataset `sklearn.datasets.load_breast_cancer`


In [None]:
import pandas as pd
from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from torchvision import datasets, transforms
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import plot_confusion_matrix, plot_evaluation, plot_decision_boundaries

## Iris dataset


### Load the dataset


In [None]:
# load_iris() returns one Bunch object; unpacking it into two names would
# iterate over its keys and raise "too many values to unpack".
data = load_iris()
X, y, feature_names, class_names = (
    data.data,
    data.target,
    data.feature_names,
    data.target_names,
)

print(f"{feature_names = }")
print(f"{class_names = }")

# Tabular view of the samples, with the integer target mapped to its class name.
df = pd.DataFrame(X, columns=feature_names)
df["species"] = [class_names[label] for label in y]
display(df)

In [None]:
# Plot every pair of feature columns against each other, coloured by species,
# with a KDE curve per species on the diagonal.
sns.pairplot(
    data=df,
    hue="species",
    diag_kind="kde",
    palette="Set2",
    markers=["o", "s", "D"],
    plot_kws={"s": 20},
    height=2.5,
    aspect=1,
)
# y > 1 is used so the title sits above the grid of subplots.
plt.suptitle("Pairwise features", y=1.02)
plt.show()

### Split the dataset into a training set and a testing set


In [None]:
# Hold out half of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)
print(f"{X_train.shape = }")
print(f"{X_test.shape = }")

### Standardize the data


In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Initialize and fit/train the model


In [None]:
# random_state=0 for reproducibility
model = LogisticRegression(random_state=0)
# Kept as a bare expression so the notebook still shows the fitted estimator.
model.fit(X=X_train, y=y_train)

### Predict on the testing set


In [None]:
# Predicted class label for every sample of the testing split.
y_pred = model.predict(X=X_test)

### Evaluate the model


In [None]:
# Fraction of testing samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Plots provided by the local utils module.
plot_confusion_matrix(y_test, y_pred, class_names)
plot_evaluation(y_test, y_pred, class_names)

#### Accuracy

$$
\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}
$$

Accuracy measures the proportion of **correctly classified instances** (both true positives and true negatives) out of the total instances.

Where:

- $TP$ = True Positives
- $TN$ = True Negatives
- $FP$ = False Positives
- $FN$ = False Negatives

#### Precision

$$
\text{Precision} = \frac{TP}{TP + FP}
$$

Precision measures the proportion of **true positive instances** out of the instances that were **predicted as positive**. It indicates how many of the predicted positives are actually positive.

#### Recall

$$
\text{Recall} = \frac{TP}{TP + FN}
$$

Recall measures the proportion of **true positive instances** out of the **actual positive instances**. It indicates how many of the actual positives were correctly identified.

#### F1-Score

$$
\text{F1-Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
$$

The F1-Score is the **harmonic mean** of **precision and recall**. It provides a single metric that balances both precision and recall, especially useful when you need to account for both false positives and false negatives.


### Two features only


In [None]:
def fit_predict_evaluate_iris(
    model, features_used: tuple[int, int], normalize: bool = True
):
    """Fit ``model`` on two Iris feature columns and plot its evaluation.

    The Iris data is reloaded, restricted to the ``features_used`` columns and
    split 50/50 into training and testing sets with ``random_state=0``. The
    model is fitted on the training half, its accuracy on the testing half is
    printed, and a four-panel figure is drawn with ``plot_confusion_matrix``,
    ``plot_evaluation`` and ``plot_decision_boundaries`` (once for the training
    data, once for the testing data).

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        features_used: Column indices of the two features to keep.
        normalize: When true, standardize the features with a
            ``StandardScaler`` fitted on the training split only.
    """
    # load_iris() returns one Bunch object; unpacking it into two names would
    # iterate over its keys and raise "too many values to unpack".
    data = load_iris()
    X, y, feature_names, class_names = (
        data.data[:, list(features_used)],
        data.target,
        [data.feature_names[i] for i in features_used],
        data.target_names,
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0
    )

    if normalize:
        # Scaling statistics come from the training split only.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    fig, axs = plt.subplots(1, 4, figsize=(24, 5))
    # Title the figure created above explicitly instead of relying on the
    # implicit "current figure".
    fig.suptitle(
        f"{model.__class__.__name__} using features={features_used} and normalization={normalize}"
    )

    plot_confusion_matrix(y_test, y_pred, class_names, ax=axs[0])
    plot_evaluation(y_test, y_pred, class_names, ax=axs[1])
    plot_decision_boundaries(model, X_train, y_train, feature_names, ax=axs[2])
    axs[2].set_title(axs[2].get_title() + " (Training Data)")
    plot_decision_boundaries(model, X_test, y_test, feature_names, ax=axs[3])
    axs[3].set_title(axs[3].get_title() + " (Testing Data)")

# Logistic regression on feature columns 0 and 1 (standardized by default).
fit_predict_evaluate_iris(LogisticRegression(random_state=0), (0, 1))

In [None]:
# Logistic regression on feature columns 2 and 3 (standardized by default).
fit_predict_evaluate_iris(
    model=LogisticRegression(random_state=0),
    features_used=(2, 3),
)

In [None]:
# Same logistic regression on feature columns 2 and 3, without standardization.
fit_predict_evaluate_iris(
    model=LogisticRegression(random_state=0),
    features_used=(2, 3),
    normalize=False,
)

In [None]:
# 5-nearest-neighbours classifier on feature columns 2 and 3.
fit_predict_evaluate_iris(
    model=KNeighborsClassifier(n_neighbors=5),
    features_used=(2, 3),
)

In [None]:
# 5-nearest-neighbours classifier on feature columns 0 and 1.
fit_predict_evaluate_iris(
    model=KNeighborsClassifier(n_neighbors=5),
    features_used=(0, 1),
)

In [None]:
# 5-nearest-neighbours classifier on feature columns 1 and 2.
fit_predict_evaluate_iris(
    model=KNeighborsClassifier(n_neighbors=5),
    features_used=(1, 2),
)

In [None]:
# Polynomial-kernel SVC (library-default degree) on feature columns 0 and 1.
fit_predict_evaluate_iris(model=SVC(kernel="poly"), features_used=(0, 1))

In [None]:
# Degree-10 polynomial-kernel SVC on feature columns 0 and 1.
fit_predict_evaluate_iris(model=SVC(kernel="poly", degree=10), features_used=(0, 1))

In [None]:
# RBF-kernel SVC on feature columns 0 and 1.
fit_predict_evaluate_iris(
    model=SVC(kernel="rbf"),
    features_used=(0, 1),
)

In [None]:
# Random forest of 1000 trees (seeded) on feature columns 0 and 1.
fit_predict_evaluate_iris(
    model=RandomForestClassifier(random_state=0, n_estimators=1000),
    features_used=(0, 1),
)

## California Housing dataset


In [None]:
# Fetch the California housing data and pull out the feature matrix and target.
data = fetch_california_housing()
X = data.data
y = data.target
# NOTE(review): `classes` is bound to the dataset's feature names, not to class
# labels; the name is kept so other cells that read it keep working.
classes = data.feature_names
print(X.shape, y.shape, classes)

In [None]:
# Single-step pipeline that converts each image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

# Training and testing splits of FashionMNIST, downloaded into ./data if missing.
train_data = datasets.FashionMNIST(
    root="./data",
    train=True,
    transform=transform,
    download=True,
)
test_data = datasets.FashionMNIST(
    root="./data",
    train=False,
    transform=transform,
    download=True,
)
print(train_data.data.shape, train_data.targets.shape)