In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import learning_curve

# Logistic Regression with SGD

Load data

In [None]:
# Placeholders until data loading is implemented.
# Presumably: x_trn = training features, y_trn = training labels,
# x_tst = unlabeled test features -- confirm when filling this in.
x_trn, y_trn, x_tst = None, None, None  #TODO

Set random seed

In [None]:
# Single seeded generator that the CV splitter, the classifier and the
# halving search below all receive as their random_state.
SEED = 0
rng = np.random.RandomState(seed=SEED)

Create cross-validation generator

In [None]:
# Stratified 5-fold splitter used by the halving search.
# shuffle must be True whenever random_state is supplied: StratifiedKFold
# raises a ValueError for shuffle=False combined with a non-None random_state,
# because the seed would otherwise have no effect.
cvg = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng)

Create an instance of the classifier we want to use

In [None]:
# Logistic regression trained with stochastic gradient descent and an L2 penalty.
# The logistic loss is spelled 'log_loss': the old alias 'log' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it is rejected as an invalid value.
# NOTE: 'log_loss' requires scikit-learn >= 1.1.
clf = SGDClassifier(loss='log_loss', penalty='l2', random_state=rng)

Take a look at the parameters we'll need to create a grid for

In [None]:
# Display the classifier's current parameter names and values so we can pick
# which ones to include in the search grid below.
clf.get_params(deep=False)

Create parameter grid

In [None]:
# Search space for the halving grid search: a dict mapping a classifier
# parameter name to the list of candidate values to try,
# e.g. {"alpha": [1e-4, 1e-3]}.
# Keep this a plain dict (or list of dicts): HalvingGridSearchCV builds the
# ParameterGrid itself, and handing it an already-expanded ParameterGrid is
# rejected because the expanded entries hold scalars instead of value lists.
param_grid = {}  #TODO: fill in the parameters/values to search over

Create successive halving grid search cross-validation estimator

In [None]:
# Settings for the successive-halving search, collected in one place.
# The search evaluates `clf` over `param_grid` using the `cvg` splitter.
halving_kwargs = dict(
    estimator=clf,
    param_grid=param_grid,
    factor=3,
    cv=cvg,
    random_state=rng,
)
search = HalvingGridSearchCV(**halving_kwargs)

# Run the search on the training data; the fitted search object is the cell output.
search.fit(x_trn, y_trn)

Create and save the successive-halving heatmap (see the [scikit-learn example](https://scikit-learn.org/stable/auto_examples/model_selection/plot_successive_halving_heatmap.html)).

In [None]:
#TODO

Plot and save the candidate scores over the successive-halving iterations (see the [scikit-learn example](https://scikit-learn.org/stable/auto_examples/model_selection/plot_successive_halving_iterations.html)).

In [None]:
# One row per (candidate, iteration) from the halving search.
results = pd.DataFrame(search.cv_results_)

# String form of each parameter dict, so a candidate can serve as a column label.
results["params_str"] = results["params"].apply(str)
results = results.drop_duplicates(subset=["params_str", "iter"])

# Rows: halving iteration; columns: candidate; cells: mean CV test score.
mean_scores = results.pivot(
    index="iter",
    columns="params_str",
    values="mean_test_score",
)

fig, ax = plt.subplots()
mean_scores.plot(ax=ax, legend=False, alpha=0.6)

# One tick per iteration, annotated with the resources and candidates used in it.
iterations = range(search.n_iterations_)
labels = [
    f"iter={i}\nn_samples={search.n_resources_[i]}\nn_candidates={search.n_candidates_[i]}"
    for i in iterations
]
ax.set_xticks(iterations)
ax.set_xticklabels(labels, rotation=45, multialignment="left")

ax.set_title("Scores of candidates over iterations")
ax.set_xlabel("iterations", fontsize=15)
ax.set_ylabel("mean test score", fontsize=15)

fig.tight_layout()
plt.show()
#TODO: save plot

Plot and save feature importance scores to determine the top features (see the [scikit-learn example](https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html)).

In [None]:
#TODO

Plot and save the ROC curve (see [`RocCurveDisplay.from_predictions`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.RocCurveDisplay.html#sklearn.metrics.RocCurveDisplay.from_predictions)).

In [None]:
#TODO
#use RocCurveDisplay.from_predictions(y_true, y_pred)

Set the classifier's parameters to the optimal values identified by cross-validation, refit it on the entire training set, and then output predictions on the test set.

In [None]:
# Apply the best parameter combination found by the search.
# set_params only accepts keyword arguments, so the best_params_ dict must be
# unpacked with **; passing the dict positionally raises a TypeError.
# Parameters not present in best_params_ keep the values given when clf was created.
clf.set_params(**search.best_params_)

# Refit on the full training set with the selected parameters.
clf.fit(x_trn, y_trn)
#we need to decide what to do with these predictions, since it's an unlabeled test set.

Plot and save feature importance scores to determine top features

In [1]:
# Coefficients of the fitted classifier; coef_ is 2-D and row 0 is used here.
coef = clf.coef_
coef = coef[0, :]

# Feature names come from the training frame's columns
# (assumes x_trn is a pandas DataFrame -- TODO confirm once data loading is written).
features = x_trn.columns

# Order features by descending absolute coefficient so the strongest ones come
# first. Previously this ranking was computed but never applied, so the chart
# was drawn in column order and did not show which features matter most.
arg_rank = np.argsort(-np.abs(coef))
coef_ranked = coef[arg_rank]
features_ranked = features[arg_rank]

used_features = len(features)
positions = np.arange(used_features)

# Plot coefficient chart: one horizontal stem per feature plus a zero reference line.
fig, ax = plt.subplots()
ax.hlines(positions, 0, coef_ranked, color='red')
ax.plot(coef_ranked, positions, 'D')
ax.plot([0, 0], [positions.min(), positions.max()], '--')
ax.set_title("Coefficients of Access Behavior Anomaly Model")
ax.set_ylabel("Access Behavior Anomaly Features")
ax.set_xlabel("Coefficients from Access Behavior Anomaly model")
ax.set_yticks(positions)
ax.set_yticklabels(features_ranked)
# Position 0 holds the largest |coef|; flip the axis so it is drawn at the top.
ax.invert_yaxis()
fig.tight_layout()

NameError: name 'clf' is not defined