In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import learning_curve

# Linear SVC with SGD

Load data

In [8]:
# Read the pre-split training features, training labels and test features
# from disk in one pass. Unpickling executes arbitrary code, so these files
# must come from a trusted source.
x_trn, y_trn, x_tst = map(
    pd.read_pickle,
    ("./data/x_train.pkl", "./data/y_train.pkl", "./data/x_test.pkl"),
)

Set random seed

In [9]:
# Single seeded generator that the later cells pass around as `random_state`.
rng = np.random.RandomState(seed=0)

Create cross-validation generator

In [11]:
# 5-fold stratified splitter with shuffling.
# The seed is a plain integer (not a RandomState instance) on purpose:
# successive halving calls cv.split() once per iteration and requires the
# folds to be identical across calls. HalvingGridSearchCV.fit raises
# "The cv parameter must yield consistent folds across calls to split()"
# when a shuffling splitter is given a RandomState object.
cvg = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

Create an instance of the classifier we want to use

In [12]:
# Linear SVM trained by stochastic gradient descent:
# hinge loss + L2 penalty, using the shared seeded generator.
clf = SGDClassifier(
    loss="hinge",
    penalty="l2",
    random_state=rng,
)

Take a look at the parameters we'll need to create a grid for

In [13]:
# Display the classifier's current hyper-parameters (shallow listing) so we
# can see which ones are candidates for the search grid.
clf.get_params(deep=False)

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'hinge',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': RandomState(MT19937) at 0x7F9751FBA040,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

Create parameter grid

In [16]:
# Search space for the halving grid search: regularisation strength (alpha)
# and stopping tolerance (tol) -> 6 x 4 = 24 candidates.
#
# Keep this as a plain dict mapping parameter name -> list of values.
# HalvingGridSearchCV expands the grid itself; wrapping the dict in
# ParameterGrid first turns it into an iterable of single-point dicts whose
# values are scalars, which the search rejects ("needs to be a list or a
# numpy array").
param_grid = {
    'alpha': [0.00005, 0.0001, 0.0005, 0.005, 0.001, 0.01],
    'tol': [0.0005, 0.005, 0.001, 0.01],
}

Create successive halving grid search cross-validation estimator

In [None]:
# Successive-halving grid search over `param_grid`, scored with the
# cross-validation splitter `cvg`. factor=3 is the halving rate passed to
# the search.
search = HalvingGridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=cvg,
    factor=3,
    random_state=rng,
)
# Run the search on the training data; the fitted search object is the
# cell's displayed value.
search.fit(x_trn, y_trn)

Create and save successive halving heatmap https://scikit-learn.org/stable/auto_examples/model_selection/plot_successive_halving_heatmap.html#

In [None]:
#TODO

Plot and save candidate scores over successive halving iterations https://scikit-learn.org/stable/auto_examples/model_selection/plot_successive_halving_iterations.html#

In [None]:
# One row per (candidate, iteration) from the halving search.
results = pd.DataFrame(search.cv_results_)
# Dicts are unhashable, so use their string form as the candidate key.
results["params_str"] = results["params"].map(str)
results = results.drop_duplicates(subset=["params_str", "iter"])

# Wide table: rows = halving iteration, columns = candidate, cells = mean CV score.
mean_scores = results.pivot(
    index="iter",
    columns="params_str",
    values="mean_test_score",
)

# One line per candidate; the legend is suppressed because there is one
# entry per candidate.
fig, ax = plt.subplots()
mean_scores.plot(ax=ax, legend=False, alpha=0.6)

# Tick labels describing each iteration: its index, the resources used and
# how many candidates were evaluated.
n_iters = search.n_iterations_
labels = []
for i in range(n_iters):
    labels.append(
        f"iter={i}\nn_samples={search.n_resources_[i]}\nn_candidates={search.n_candidates_[i]}"
    )

ax.set_title("Scores of candidates over iterations")
ax.set_xlabel("iterations", fontsize=15)
ax.set_ylabel("mean test score", fontsize=15)
ax.set_xticks(range(n_iters))
ax.set_xticklabels(labels, rotation=45, multialignment="left")
plt.tight_layout()
plt.show()
#TODO: save plot

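Plot and save feature importance scores to determine top features https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html (note: the linked example relies on the tree-based `feature_importances_` attribute, which a linear `SGDClassifier` does not provide; for this model, rank features by the magnitude of `clf.coef_` instead)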

In [None]:
#TODO

Plot and save ROC curve https://scikit-learn.org/stable/modules/generated/sklearn.metrics.RocCurveDisplay.html#sklearn.metrics.RocCurveDisplay.from_predictions

In [None]:
#TODO
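#use RocCurveDisplay.from_predictions(y_true, y_pred), where y_pred must be continuous scores such as clf.decision_function(X) (hinge loss has no predict_proba), not the hard labels from clf.predict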

Set classifier parameters to optimal values identified by cross-validation and fit on entire training set, then output predictions on test set

In [None]:
# Apply the best hyper-parameters found by the search and refit on the full
# training set.
#
# set_params only accepts keyword arguments, so the best-params dict must be
# unpacked with ** (passing it positionally raises TypeError). It only
# overwrites the keys it is given, so the loss='hinge' / penalty='l2'
# settings chosen when `clf` was constructed stay in place and need no
# manual merging.
clf.set_params(**search.best_params_)
clf.fit(x_trn, y_trn)
#we need to decide what to do with these predictions, since it's an unlabeled test set.