## Setup

In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
import sklearn

# Show estimators with their plain-text repr when a cell displays them.
sklearn.set_config(display="text")

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# SPAM

In [None]:
from sklearn.linear_model import LogisticRegression

# Download the email data set and keep only the response column (`spam`)
# plus the five predictor columns used by the spam examples.
email = pd.read_csv(
  'https://sta663-sp22.github.io/slides/data/email.csv'
)[
  ['spam', 'exclaim_mess', 'format', 'num_char', 'line_breaks', 'number']
]

In [None]:
# Display the selected columns of the email data.
email

In [None]:
# One-hot encode with pandas: object/category-typed columns are expanded into
# indicator columns, while numeric columns are passed through unchanged.
email_dc = pd.get_dummies(email)
email_dc

In [None]:
# Separate the encoded frame into the feature matrix (everything except
# `spam`) and the response vector (`spam` itself).
X = email_dc.drop(columns='spam')
y = email_dc['spam']

## Baseline - LogisticRegression

In [None]:
# Baseline: logistic regression fitted on the full data set, without an
# intercept term and with the solver's iteration cap raised to 500.
# NOTE(review): fit_intercept=False presumably relies on get_dummies keeping
# every indicator level (which then plays the intercept's role) — confirm.
m = LogisticRegression(fit_intercept = False, max_iter=500).fit(X, y)

In [None]:
# Fitted coefficient array of the baseline model.
m.coef_

In [None]:
# Mean accuracy on the same data the model was fitted on (not a held-out
# estimate).
m.score(X,y)

In [None]:
# Confusion matrix of the baseline model on its own training data
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y, y_pred=m.predict(X))

In [None]:
# Per-class precision / recall / F1 for the baseline model on its training
# data; zero_division=0 reports 0 (without a warning) for undefined metrics.
print(classification_report(y_true=y, y_pred=m.predict(X), zero_division=0))

## Example 1 - DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Grid-search a decision tree over split criterion and maximum depth using
# shuffled 10-fold cross-validation, ranking candidates by accuracy.
# The tree is seeded (like the CV splitter) so that tie-breaking between
# equally good splits — and therefore the selected model and its scores —
# is repeatable from run to run.
tree_gs = GridSearchCV(
  DecisionTreeClassifier(random_state=1234),
  param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [2,3,4,5,6,7]
  },
  cv = KFold(10, shuffle=True, random_state=1234),
  scoring = "accuracy",
  n_jobs = 4
).fit(
  X, y
)

In [None]:
# The tree refit on all of X with the best-scoring parameter combination.
tree_gs.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter combination.
tree_gs.best_score_

In [None]:
# List every parameter combination tried together with its mean
# cross-validated test score.
for p, s in  zip(tree_gs.cv_results_["params"], tree_gs.cv_results_["mean_test_score"]):
  print(f"{p} Score: {s}")

In [None]:
# Confusion matrix of the selected tree on the data it was fitted on.
# Predicting through the search object delegates to the refit best estimator.
confusion_matrix(y_true=y, y_pred=tree_gs.predict(X))

In [None]:
# Per-class precision / recall / F1 of the selected tree on its training data.
# Predicting through the search object delegates to the refit best estimator.
print(classification_report(y_true=y, y_pred=tree_gs.predict(X)))

In [None]:
# Class-membership probabilities from the selected tree (one column per
# class); the search object forwards the call to its refit best estimator.
tree_gs.predict_proba(X)

## Example 2 - SVC

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# Two-step pipeline: standardize the features, then fit a support vector
# classifier. make_pipeline names the steps after the lower-cased class
# names, hence the "svc__" prefix in the grid below.
svc_pipe = make_pipeline(StandardScaler(), SVC())

# Search two sub-grids (RBF and linear kernels, each with its own range of
# C values) with shuffled 5-fold cross-validation, ranked by accuracy.
svc_gs = GridSearchCV(
  estimator = svc_pipe,
  param_grid = [
    dict(svc__kernel=["rbf"], svc__C=[0.1, 1, 10, 100]),
    dict(svc__kernel=["linear"], svc__C=[0.1, 1, 10]),
  ],
  scoring = "accuracy",
  cv = KFold(n_splits=5, shuffle=True, random_state=1234),
  n_jobs = 10,
).fit(X, y)

In [None]:
# The pipeline refit on all of X with the best-scoring kernel / C combination.
svc_gs.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter combination.
svc_gs.best_score_

In [None]:
# List every kernel / C combination tried together with its mean
# cross-validated test score.
for p, s in  zip(svc_gs.cv_results_["params"], svc_gs.cv_results_["mean_test_score"]):
  print(f"{p} Score: {s}")

In [None]:
# Per-class precision / recall / F1 of the selected SVC pipeline on its
# training data; the search object forwards predict to the refit best pipeline.
print(classification_report(y_true=y, y_pred=svc_gs.predict(X)))

In [None]:
# The pipeline's SVC is built with the default probability=False, so it has no
# predict_proba and calling it raises AttributeError. Show the per-sample
# decision-function scores instead; construct the classifier as
# SVC(probability=True) if class probabilities are required.
svc_gs.best_estimator_.decision_function(X)

# Digits (scikit-learn `load_digits`, 8x8 MNIST-like images)

In [None]:
from sklearn.datasets import load_digits
# Load scikit-learn's bundled handwritten-digits data; as_frame=True returns
# the features and target as pandas objects.
digits = load_digits(as_frame=True)

In [None]:
# Rebind X and y to the digits data (they previously held the spam features
# and response), then hold out a shuffled 33% as a test set with a fixed seed.
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=1234
)

## Example 3 - Classification Tree

In [None]:
# Grid-search tree depth (2..15) and split criterion on the training split
# only, using shuffled 5-fold cross-validation. No `scoring` is given, so
# candidates are ranked by the estimator's own score method.
# The tree is seeded (like the CV splitter) so that tie-breaking between
# equally good splits — and therefore the selected model and its scores —
# is repeatable from run to run.
digits_tree = GridSearchCV(
  DecisionTreeClassifier(random_state=1234),
  param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(2,16)
  },
  cv = KFold(5, shuffle=True, random_state=12345),
  n_jobs = 4
).fit(
  X_train, y_train
)

In [None]:
# The tree refit on the full training split with the best-scoring parameters.
digits_tree.best_estimator_

In [None]:
# Mean cross-validated score of the best parameter combination.
digits_tree.best_score_

In [None]:
# Mean accuracy of the selected tree on the held-out test split.
digits_tree.best_estimator_.score(X_test, y_test)

In [None]:
# Confusion matrix of the selected tree on the held-out test split.
# Predicting through the search object delegates to the refit best estimator.
confusion_matrix(y_true=y_test, y_pred=digits_tree.predict(X_test))

In [None]:
# Per-digit precision / recall / F1 of the selected tree on the test split.
# Predicting through the search object delegates to the refit best estimator.
print(classification_report(y_true=y_test, y_pred=digits_tree.predict(X_test)))

## Example 4 - GridSearchCV w/ Multiple models

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Single-step pipeline whose "model" step is only a placeholder: the grid
# below replaces it with each candidate estimator via the "model" parameter.
p = Pipeline([
  ("model", DecisionTreeClassifier())
])


# Compare a single decision tree against a random forest over a shared grid
# of split criterion and maximum depth (2..9), using shuffled 5-fold
# cross-validation on the training split.
# Both candidates are seeded: an unseeded forest draws different bootstrap
# samples and feature subsets on every run, so the winning model and its
# scores would not be repeatable even though the CV splitter is seeded.
digits_mm = GridSearchCV(
  p,
  param_grid = {
    "model": [
      DecisionTreeClassifier(random_state=1234),
      RandomForestClassifier(random_state=1234)
    ],
    "model__criterion": ["gini", "entropy"],
    "model__max_depth": range(2,10)
  },
  cv = KFold(5, shuffle=True, random_state=12345),
  n_jobs = 10
).fit(
  X_train, y_train
)

In [None]:
# The pipeline refit on the full training split with the best-scoring model
# and parameters.
digits_mm.best_estimator_

In [None]:
# Mean cross-validated score of the best model / parameter combination.
digits_mm.best_score_

In [None]:
# Mean accuracy of the selected pipeline on the held-out test split.
digits_mm.best_estimator_.score(X_test, y_test)

In [None]:
# Confusion matrix of the selected pipeline on the held-out test split.
# Predicting through the search object delegates to the refit best estimator.
confusion_matrix(y_true=y_test, y_pred=digits_mm.predict(X_test))

In [None]:
# Per-digit precision / recall / F1 of the selected pipeline on the test split.
# Predicting through the search object delegates to the refit best estimator.
print(classification_report(y_true=y_test, y_pred=digits_mm.predict(X_test)))