## Setup

In [2]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
import sklearn

sklearn.set_config(display="text")

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# SPAM

In [4]:
from sklearn.linear_model import LogisticRegression

email = pd.read_csv(
  'https://sta663-sp22.github.io/slides/data/email.csv'
)[
  ['spam', 'exclaim_mess', 'format', 'num_char', 'line_breaks', 'number']
]

In [5]:
email

Unnamed: 0,spam,exclaim_mess,format,num_char,line_breaks,number
0,0,0,1,11.370,202,big
1,0,1,1,10.504,202,small
2,0,6,1,7.773,192,small
3,0,48,1,13.256,255,small
4,0,1,0,1.231,29,none
...,...,...,...,...,...,...
3916,1,0,0,0.332,12,small
3917,1,0,0,0.323,15,small
3918,0,5,1,8.656,208,small
3919,0,0,0,10.185,132,small


In [6]:
email_dc = pd.get_dummies(email)
email_dc

Unnamed: 0,spam,exclaim_mess,format,num_char,line_breaks,number_big,number_none,number_small
0,0,0,1,11.370,202,1,0,0
1,0,1,1,10.504,202,0,0,1
2,0,6,1,7.773,192,0,0,1
3,0,48,1,13.256,255,0,0,1
4,0,1,0,1.231,29,0,1,0
...,...,...,...,...,...,...,...,...
3916,1,0,0,0.332,12,0,0,1
3917,1,0,0,0.323,15,0,0,1
3918,0,5,1,8.656,208,0,0,1
3919,0,0,0,10.185,132,0,0,1


In [7]:
y = email_dc.spam
X = email_dc.drop('spam', axis=1)

## Baseline - LogisticRegression

In [11]:
m = LogisticRegression(fit_intercept = False, max_iter=500).fit(X, y)

In [12]:
m.coef_

array([[ 0.00982304, -0.61873796,  0.05448642, -0.00555706, -1.21151583,
        -0.69342307, -1.92053976]])

In [13]:
m.score(X,y)

0.90640142820709

In [14]:
confusion_matrix(y, m.predict(X))

array([[3554,    0],
       [ 367,    0]])

In [15]:
print(
  classification_report(y, m.predict(X), zero_division=0)
)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3554
           1       0.00      0.00      0.00       367

    accuracy                           0.91      3921
   macro avg       0.45      0.50      0.48      3921
weighted avg       0.82      0.91      0.86      3921



## Example 1 - DecisionTreeClassifier

In [16]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
tree_gs = GridSearchCV(
  DecisionTreeClassifier(),
  param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [2,3,4,5,6,7]
  },
  cv = KFold(10, shuffle=True, random_state=1234),
  scoring = "accuracy",
  n_jobs = 4
).fit(
  X, y
)

In [18]:
tree_gs.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=5)

In [19]:
tree_gs.best_score_

0.9219537051461808

In [21]:
for p, s in  zip(tree_gs.cv_results_["params"], tree_gs.cv_results_["mean_test_score"]):
  print(f"{p} Score: {s}")

{'criterion': 'gini', 'max_depth': 2} Score: 0.9056297709923664
{'criterion': 'gini', 'max_depth': 3} Score: 0.9117502726281351
{'criterion': 'gini', 'max_depth': 4} Score: 0.918382276574752
{'criterion': 'gini', 'max_depth': 5} Score: 0.9196577867788337
{'criterion': 'gini', 'max_depth': 6} Score: 0.9188931297709925
{'criterion': 'gini', 'max_depth': 7} Score: 0.9160928493534819
{'criterion': 'entropy', 'max_depth': 2} Score: 0.9063950771148154
{'criterion': 'entropy', 'max_depth': 3} Score: 0.9102196603832373
{'criterion': 'entropy', 'max_depth': 4} Score: 0.9209332969829154
{'criterion': 'entropy', 'max_depth': 5} Score: 0.9219537051461808
{'criterion': 'entropy', 'max_depth': 6} Score: 0.9158312561665888
{'criterion': 'entropy', 'max_depth': 7} Score: 0.9132860777898946


In [20]:
confusion_matrix(y, tree_gs.best_estimator_.predict(X))

array([[3544,   10],
       [ 286,   81]])

In [22]:
print(
  classification_report(y, tree_gs.best_estimator_.predict(X))
)

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      3554
           1       0.89      0.22      0.35       367

    accuracy                           0.92      3921
   macro avg       0.91      0.61      0.66      3921
weighted avg       0.92      0.92      0.90      3921



In [23]:
tree_gs.best_estimator_.predict_proba(X)

array([[0.97848306, 0.02151694],
       [0.97848306, 0.02151694],
       [0.97848306, 0.02151694],
       ...,
       [0.97848306, 0.02151694],
       [0.97848306, 0.02151694],
       [0.8908046 , 0.1091954 ]])

## Example 2 - SVC

In [24]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

svc_pipe = make_pipeline(
  StandardScaler(),
  SVC()
)

svc_gs = GridSearchCV(
  svc_pipe,
  param_grid = [
    {"svc__kernel": ["rbf"], "svc__C": [0.1, 1, 10, 100]},
    {"svc__kernel": ["linear"], "svc__C": [0.1, 1, 10]}
  ],
  cv = KFold(5, shuffle=True, random_state=1234),
  scoring = "accuracy",
  n_jobs = 10
).fit(
  X, y
)

In [25]:
svc_gs.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC(C=1))])

In [26]:
svc_gs.best_score_

0.911243338099571

In [27]:
for p, s in  zip(svc_gs.cv_results_["params"], svc_gs.cv_results_["mean_test_score"]):
  print(f"{p} Score: {s}")

{'svc__C': 0.1, 'svc__kernel': 'rbf'} Score: 0.9063973742363187
{'svc__C': 1, 'svc__kernel': 'rbf'} Score: 0.911243338099571
{'svc__C': 10, 'svc__kernel': 'rbf'} Score: 0.9089477447029767
{'svc__C': 100, 'svc__kernel': 'rbf'} Score: 0.9099681528662421
{'svc__C': 0.1, 'svc__kernel': 'linear'} Score: 0.9063973742363187
{'svc__C': 1, 'svc__kernel': 'linear'} Score: 0.9063973742363187
{'svc__C': 10, 'svc__kernel': 'linear'} Score: 0.9063973742363187


In [28]:
print(
  classification_report(y, svc_gs.best_estimator_.predict(X))
)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3554
           1       0.83      0.07      0.13       367

    accuracy                           0.91      3921
   macro avg       0.87      0.53      0.54      3921
weighted avg       0.90      0.91      0.88      3921



In [29]:
confusion_matrix(y, svc_gs.best_estimator_.predict(X))

array([[3549,    5],
       [ 342,   25]])

In [32]:
#svc_gs.best_estimator_.predict_proba(X)

# MNIST Digits

In [33]:
from sklearn.datasets import load_digits
digits = load_digits(as_frame=True)

In [34]:
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=1234
)

## Example 3 - Classification Tree

In [41]:
digits_tree = GridSearchCV(
  DecisionTreeClassifier(),
  param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(2,10)
  },
  cv = KFold(5, shuffle=True, random_state=12345),
  n_jobs = 4
).fit(
  X_train, y_train
)

In [42]:
digits_tree.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=7)

In [43]:
digits_tree.best_score_

0.8453769017980637

In [44]:
digits_tree.best_estimator_.score(X_test, y_test)

0.867003367003367

In [45]:
confusion_matrix(
  y_test, digits_tree.best_estimator_.predict(X_test)
)

array([[51,  0,  1,  0,  1,  0,  0,  0,  0,  0],
       [ 0, 51,  1,  4,  0,  4,  0,  2,  2,  0],
       [ 2,  0, 49,  2,  0,  1,  3,  0,  1,  0],
       [ 0,  1,  2, 54,  0,  0,  0,  0,  1,  2],
       [ 2,  0,  0,  0, 63,  1,  1,  3,  0,  1],
       [ 0,  3,  0,  3,  3, 56,  2,  0,  1,  0],
       [ 0,  1,  0,  0,  0,  0, 55,  0,  1,  0],
       [ 2,  2,  1,  0,  1,  0,  0, 48,  1,  0],
       [ 0,  0,  2,  2,  3,  4,  0,  2, 42,  0],
       [ 0,  1,  0,  2,  0,  2,  0,  2,  0, 46]])

In [40]:
print(
  classification_report(y_test, digits_tree.best_estimator_.predict(X_test))
)

              precision    recall  f1-score   support

           0       0.89      0.92      0.91        53
           1       0.90      0.88      0.89        64
           2       0.87      0.81      0.84        58
           3       0.86      0.93      0.90        60
           4       0.90      0.89      0.89        71
           5       0.88      0.87      0.87        68
           6       0.87      0.96      0.92        57
           7       0.83      0.91      0.87        55
           8       0.85      0.75      0.80        55
           9       0.90      0.85      0.87        53

    accuracy                           0.88       594
   macro avg       0.88      0.88      0.88       594
weighted avg       0.88      0.88      0.88       594



## Example 4 - GridSearchCV w/ Multiple models

In [46]:
from sklearn.ensemble import RandomForestClassifier

p = Pipeline([
  ("model", DecisionTreeClassifier())
])


digits_mm = GridSearchCV(
  p,
  param_grid = {
    "model": [
      DecisionTreeClassifier(),
      RandomForestClassifier()
    ],
    "model__criterion": ["gini", "entropy"],
    "model__max_depth": range(2,10)
  },
  cv = KFold(5, shuffle=True, random_state=12345),
  n_jobs = 10
).fit(
  X_train, y_train
)

In [47]:
digits_mm.best_estimator_

Pipeline(steps=[('model',
                 RandomForestClassifier(criterion='entropy', max_depth=9))])

In [48]:
digits_mm.best_score_

0.9717358229598894

In [49]:
digits_mm.best_estimator_.score(X_test, y_test)

0.9595959595959596

In [50]:
confusion_matrix(
  y_test, digits_mm.best_estimator_.predict(X_test)
)

array([[53,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 64,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 56,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  1,  0, 58,  0,  0,  0,  0,  1,  0],
       [ 0,  0,  0,  0, 68,  0,  0,  2,  1,  0],
       [ 0,  0,  0,  0,  1, 63,  1,  0,  0,  3],
       [ 0,  0,  0,  0,  0,  0, 56,  0,  1,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 55,  0,  0],
       [ 0,  3,  1,  0,  0,  0,  0,  0, 50,  1],
       [ 0,  0,  0,  1,  0,  1,  0,  3,  1, 47]])

In [51]:
print(
  classification_report(y_test, digits_mm.best_estimator_.predict(X_test))
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        53
           1       0.94      1.00      0.97        64
           2       0.98      0.97      0.97        58
           3       0.98      0.97      0.97        60
           4       0.99      0.96      0.97        71
           5       0.98      0.93      0.95        68
           6       0.98      0.98      0.98        57
           7       0.92      1.00      0.96        55
           8       0.93      0.91      0.92        55
           9       0.89      0.89      0.89        53

    accuracy                           0.96       594
   macro avg       0.96      0.96      0.96       594
weighted avg       0.96      0.96      0.96       594

