# Classification with Breast Cancer Dataset

## Breast Cancer Dataset

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the dataset once; the cells below read its attributes
# (DESCR, feature_names, data, target_names, target).
cancer = load_breast_cancer()

In [None]:
cancer.keys()

In [None]:
print(cancer.DESCR)

In [None]:
cancer.feature_names

In [None]:
cancer.data[:5]

In [None]:
cancer.target_names

In [None]:
cancer.target

## Exploring Dataset with Pandas

In [None]:
import pandas as pd

In [None]:
# Wrap the feature array in a DataFrame whose columns are labeled with the
# dataset's feature names, then preview the first rows.
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df.head()

In [None]:
df.info()

In [None]:
df.describe()

## ML Workflow using Scikit-Learn

### Train-Test Split

* Split data into train and test sets ([model_selection.train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html))

In [None]:
from sklearn.model_selection import train_test_split

# One seed shared by the split and by the CV splitters defined in later cells.
seed = 100

# Features and labels taken straight from the loaded dataset object.
X = cancer.data
y = cancer.target

# Shuffle, then hold out a quarter of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=seed,
)

# Shapes of the four resulting arrays, in the order train X/y, test X/y.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

### Training & Validation

* Logistic regression ([linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html))
* K-folds cross validator ([model_selection.KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html))
* Evaluate a score by CV ([model_selection.cross_val_score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html))
* Pipeline of transforms with a final estimator ([pipeline.Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html))
* Polynomial and interaction features ([preprocessing.PolynomialFeatures](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

K-Fold Cross Validation for a Single Model

 You can change the scoring function by using the `scoring` parameter in `cross_val_score`.
* `accuracy`: Accuracy (**default**)
* `roc_auc`: Area under the receiver operating characteristic (ROC) curve
* `f1`: F1 score
* `precision`: Precision
* `recall`: Recall

In [None]:
# Baseline: logistic regression with no penalty.
# "saga" shuffles the data internally, so random_state is pinned to the shared
# seed; without it the fold scores can change from run to run even though the
# KFold splitter below is seeded.
# NOTE(review): scikit-learn documents that "saga" is only guaranteed to
# converge quickly when features share a similar scale. The features are passed
# unscaled here, so watch for ConvergenceWarning.
model = LogisticRegression(solver="saga", max_iter=5000, penalty=None, random_state=seed)

# Polynomial Regression
# model = Pipeline([
#     ("poly_features", PolynomialFeatures(degree=2)),
#     ("softmax_reg", LogisticRegression(solver="saga", max_iter=5000, penalty=None, random_state=seed))
# ])

# 3-fold CV on the training split only; the test split is not touched here.
kf = KFold(n_splits=3, shuffle=True, random_state=seed)
scores = cross_val_score(model, X_train, y_train, cv=kf, scoring="accuracy")

print("Scores from each iteration:", scores)
print("Average score:", scores.mean())

K-Fold Cross Validation for Multiple Models

In [None]:
max_iter = 10000

# Candidate models, all scored with the same CV splitter below.
# Every "saga" estimator gets random_state=seed: the solver shuffles the data,
# so without it the comparison can differ from run to run.
models = {
  "Baseline": LogisticRegression(solver="saga", max_iter=max_iter, penalty=None, random_state=seed),
  "L2": LogisticRegression(solver="saga", max_iter=max_iter, penalty="l2", C=1.0, random_state=seed),
  "L1": LogisticRegression(solver="saga", max_iter=max_iter, penalty="l1", C=1.0, random_state=seed),
  "Polynomial": Pipeline([("poly_features", PolynomialFeatures(degree=2)),
                          ("softmax_reg", LogisticRegression(solver="saga", max_iter=max_iter, penalty=None, random_state=seed))])
}

kf = KFold(n_splits=3, shuffle=True, random_state=seed)

# Maps each model name to its mean CV accuracy.
scores = {}
# The loop variable is `estimator` rather than `model` so that the notebook-level
# `model` variable is not silently overwritten by the last candidate.
for name, estimator in models.items():
    fold_scores = cross_val_score(estimator, X_train, y_train, cv=kf, scoring="accuracy")
    print("{}: {}".format(name, fold_scores))
    scores[name] = fold_scores.mean()

print(scores)

K-Fold Cross Validation using Grid Search

* Grid search over specified parameter values ([model_selection.GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html))

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. random_state pins the data shuffling done by
# "saga", so repeated searches score each parameter combination identically.
model = LogisticRegression(solver="saga", random_state=seed)

# Define the hyperparameters and their possible values
param_grid = {
    "max_iter": [5000, 10000],
    "penalty": ["l1", "l2"],
    "C": [0.01, 0.1, 1, 10, 100]
}

# Every combination in param_grid is scored by 3-fold CV on the training split.
grid_search = GridSearchCV(model, param_grid, cv=3, scoring="accuracy")
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best parameters: ", grid_search.best_params_)
print("Best CV score: {:.2f}".format(grid_search.best_score_))

### Evaluation & Interpretation

* Accuracy ([metrics.accuracy_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html))
* F1 ([metrics.f1_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html))
* ROC AUC ([metrics.roc_auc_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html))

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Final model, refit on the full training split.
# random_state pins the shuffling done by "saga" so the reported test metrics
# and coefficients are the same on every run.
model = LogisticRegression(solver="saga", max_iter=5000, penalty="l2", C=0.01, random_state=seed)
# model = Pipeline([
#     ("poly_features", PolynomialFeatures(degree=2)),
#     ("softmax_reg", LogisticRegression(solver="saga", max_iter=10000, penalty=None, random_state=seed))
# ])
model.fit(X_train, y_train)

# Per-class probabilities; column 1 is reused below for ROC AUC and the ROC curve.
y_prob = model.predict_proba(X_test)
print("Estimated probs:", y_prob[:10])

# Hard class predictions, used for accuracy and F1.
y_cls = model.predict(X_test)
print("Estimated classes:", y_cls[:10])
print()

print("Accuracy:", accuracy_score(y_test, y_cls))
print("F1:", f1_score(y_test, y_cls))
print("ROC AUC:", roc_auc_score(y_test, y_prob[:, 1]))

print()
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
# print("Coefficients:", model.named_steps['softmax_reg'].coef_) # Degree-2 polynomial features: [1, a, b, a^2, ab, b^2]
# print("Intercept:", model.named_steps['softmax_reg'].intercept_)

* ROC Curve ([metrics.roc_curve](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html))

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# ROC Curve, computed from column 1 of the predicted probabilities
fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color="darkorange", lw=2)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax.set_xlabel("1 - Specificity (FP Rate)")
ax.set_ylabel("Sensitivity (TP Rate)")
ax.set_title("ROC Curve")
plt.show()

# Regression with Diabetes Dataset

## Diabetes Dataset

In [None]:
from sklearn.datasets import load_diabetes

# Load the dataset object (its attributes are inspected in the cells below)
# and print its DESCR text.
diabetes = load_diabetes()
print(diabetes.DESCR)

In [None]:
diabetes.keys()

In [None]:
diabetes.feature_names

In [None]:
diabetes.data[:5]

In [None]:
diabetes.target[:10]

## ML Workflow using Scikit-Learn

### Train-Test Split

In [None]:
# Seed for the shuffled split below.
seed = 100

# From here on X / y (and the train/test arrays) refer to the diabetes data,
# replacing the breast-cancer arrays of the same names.
X = diabetes.data
y = diabetes.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=100,
    shuffle=True,
    random_state=seed,
)

# Shapes of the four resulting arrays, in the order train X/y, test X/y.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

### Training & Validation

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

Let's explore three regression models:
* Linear regression ([linear_model.LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression))
* Ridge regression ([linear_model.Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge))
* LASSO ([linear_model.Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso))

You can change the scoring function by using the `scoring` parameter in `cross_val_score`.
* `r2`: $R^2$ score
* `neg_mean_squared_error`: Negative mean squared error (MSE)
* `neg_mean_absolute_error`: Negative mean absolute error (MAE)

Note that MSE, MAE, and RMSE are negated by convention (hence the `neg_` prefix) because `cross_val_score` and `GridSearchCV` treat a higher score as better.

Identify the best model using the validation technique, and then integrate it into the code provided below.


In [None]:
# Find the best model!

### Evaluation & Interpretation

* MSE ([sklearn.metrics.mean_squared_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html))
* MAE ([sklearn.metrics.mean_absolute_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html))
* $R^2$ ([sklearn.metrics.r2_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html))

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Runnable baseline so the notebook survives Restart & Run All.
# Replace it with the best model identified in the validation step above
# (e.g. Ridge or Lasso with the chosen settings).
# The previous `model = None` placeholder made `model.fit` raise AttributeError.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the held-out test split.
y_pred = model.predict(X_test)
print("Estimated values:", y_pred[:10])
print()

print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))

print()
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))

# One marker per test sample: true value on x, predicted value on y.
ax.plot(y_test, y_pred, "o")

ax.set_title('True vs. Predicted Values')
ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')

# Optional: 45-degree reference line spanning the combined range of both arrays
min_val = min(y_test.min(), y_pred.min())  # Smallest value across true and predicted
max_val = max(y_test.max(), y_pred.max())  # Largest value across true and predicted
line_ends = [min_val, max_val]
ax.plot(line_ends, line_ends, '--', color='red', linewidth=2)

ax.grid()
plt.show()