# Running experiments on the Mushroom dataset

In [1]:
import os
import time
import warnings

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np

from tqdm import TqdmSynchronisationWarning
warnings.simplefilter("ignore", TqdmSynchronisationWarning)

The paths

In [2]:
# Root folder of the Mushroom dataset: the staged input and the results live here.
PATH_DATA = "../data/mushrooms"

# Folder that receives the results of earlier runs before they are overwritten.
PATH_ARCHIVE = os.path.join(PATH_DATA, "arch")

# `makedirs(..., exist_ok=True)` also creates missing parent folders and avoids
# the race between a separate existence check and the creation call.
os.makedirs(PATH_ARCHIVE, exist_ok=True)

Filenames

In [3]:
# Base names (joined with `PATH_DATA` later) of the staged dataset and of the
# file the experiment results are written to.
filenames = dict(input="staged_dataset.gz", output="results.gz")

The dataset and results

In [4]:
# Full paths of the staged dataset (input) and of the experiment results (output).
filename_input = os.path.join(PATH_DATA, filenames["input"])
filename_output = os.path.join(PATH_DATA, filenames["output"])

# Move the results of a previous run into the archive instead of overwriting them.
if os.path.exists(filename_output):
    # Stamp the archived copy with the modification time (UTC) of the results
    # file itself. Taking the mtime of the input dataset would give every
    # archived run of the same dataset an identical name, so each new archive
    # would replace the previous one.
    mdttm = time.strftime("%Y%m%d%H%M%S", time.gmtime(os.path.getmtime(filename_output)))
    os.rename(filename_output, os.path.join(PATH_ARCHIVE, "%s%s" % (filenames["output"], mdttm)))

The train-test splitting

In [5]:
from sklearn.utils import check_random_state
from sklearn.model_selection._split import _validate_shuffle_split


def mc_split(R, n_splits=1, test_size='default', train_size=None, random_state=None):
    """Yield random train / test splits over the entries of the matrix `R`.

    Every entry of `R` is addressed by its flat index in `range(prod(R.shape))`.
    The sizes of the two parts are resolved by scikit-learn's
    `_validate_shuffle_split` from `test_size` and `train_size`.

    Parameters
    ----------
    R : object with a `.shape` attribute
        The matrix whose entries are split; only its shape is used.
    n_splits : int
        The number of independent splits to generate.
    test_size, train_size
        Forwarded unchanged to `_validate_shuffle_split`.
    random_state
        Forwarded to `check_random_state` to obtain the generator.

    Yields
    ------
    (ind_train, ind_test) : pair of arrays of flat indices into `R`.
    """
    n_entries = np.prod(R.shape)
    n_train, n_test = _validate_shuffle_split(n_entries, test_size, train_size)

    generator = check_random_state(random_state)
    for _ in range(n_splits):
        shuffled = generator.permutation(n_entries)

        # the leading `n_test` shuffled indices form the test part,
        # the following `n_train` ones form the train part
        yield shuffled[n_test:n_test + n_train], shuffled[:n_test]

A helper function to extract a sparse submatrix from a dense one according to the provided flat indices.

In [6]:
from scipy.sparse import coo_matrix


def get_submatrix(mat, indices):
    """Return a CSR matrix holding only the entries of `mat` at `indices`.

    Parameters
    ----------
    mat : array with `.shape` and `.flat`
        The dense source matrix.
    indices : array of int
        Flat indices into `mat` of the entries to keep.

    Returns
    -------
    A sparse CSR matrix of the same shape as `mat` that stores the selected
    values at their original positions.
    """
    # translate the flat indices into per-axis coordinates
    coords = np.unravel_index(indices, mat.shape)
    values = mat.flat[indices]

    return coo_matrix((values, coords), shape=mat.shape).tocsr()

The scores collected in the experiment

In [7]:
from sklearn.metrics import roc_auc_score
# from sklearn.metrics import confusion_matrix


def mc_get_scores(R_true, R_prob):
    """Compute the confusion-matrix counts and the ROC AUC on the stored entries.

    Parameters
    ----------
    R_true : sparse matrix
        Its `.data` holds the true ±1 labels (`-1` is the negative class).
    R_prob : sparse matrix
        Its `.data` holds the predicted scores; values above 0.5 are treated
        as a positive prediction. The `.data` arrays of both matrices are
        compared element by element, so they must be stored in the same order.

    Returns
    -------
    dict with the integer counts "tn", "fn", "fp", "tp" and the float "auc".
    """
    R_pred = np.where(R_prob.data > 0.5, 1, -1)

    # map the ±1 labels onto the {0, 1} row / column indices
    ix_true = ((R_true.data + 1) // 2).astype(int)
    ix_pred = ((R_pred + 1) // 2).astype(int)

    # Rows are the TRUE labels and columns the PREDICTED ones, so that
    # [1, 0] counts false negatives and [0, 1] counts false positives.
    # The shape is fixed to 2x2: without it the matrix shrinks whenever a
    # class is absent and the lookups below raise an IndexError.
    cnfsn = coo_matrix((np.ones_like(ix_true), (ix_true, ix_pred)),
                       shape=(2, 2), dtype=np.int64).toarray()

    return {"tn": cnfsn[0, 0], "fn": cnfsn[1, 0],
            "fp": cnfsn[0, 1], "tp": cnfsn[1, 1],
            "auc": roc_auc_score(R_true.data, R_prob.data)}

Fix the seed

In [8]:
# A single generator seeded once with a fixed value; the same object is passed
# to the dataset-splitting code, so the splits depend on the order of the calls.
random_state = np.random.RandomState(0x0BADCAFE)

Load the dataset

In [9]:
import gzip
import pickle

# NOTE(review): `pickle.load` can execute arbitrary code while unpickling --
# only load dataset files that come from a trusted source.
# The pickle holds a 3-tuple: `X` and `Y` are later passed to the model's
# `fit` / `predict_proba`, and `R_full` is the matrix whose entries get split.
with gzip.open(filename_input, "rb") as fin:
    X, Y, R_full = pickle.load(fin)

Get the development and test datasets

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the entries of `R_full` as the final test set; the remaining
# (development) indices are re-split into train / validation parts later on.
ind_dvlp, ind_test = next(
    mc_split(R_full, test_size=0.25, n_splits=1, random_state=random_state)
)

# Sparse matrix that keeps only the held-out test entries.
R_test = get_submatrix(R_full, ind_test)

Set up the parameter grid

In [11]:
from sklearn.model_selection import ParameterGrid

# Dataset-level settings, expanded by `ParameterGrid` into one dict per
# combination and later unpacked into `ShuffleSplit`: a range of small train
# fractions, a fixed validation fraction and the number of random re-splits.
grid_dataset = ParameterGrid(dict(
    train_size=np.arange(0.00001, 0.001001, 0.00003),  # single-value alternative: [1e-3]
    test_size=[0.1],
    n_splits=[3],
))

# Model-level settings. `lasso_mlt` is not a model argument by itself: the
# experiment multiplies it by `C_group` to obtain the lasso coefficient.
grid_model = ParameterGrid(dict(
    lasso_mlt=[1e1],
    C_group=[2e-5, 2e-4, 2e-3],
    C_ridge=[1e-3],
    rank=[5],
))

Load the Sparse Group IMC class for binary classification.

In [12]:
from sgimc import SparseGroupIMCClassifier

Run the experiment: fit the IMC model on a training set, and compute the scores on the validation and the test sets

In [13]:
import tqdm
from sklearn.model_selection import ShuffleSplit


# Each record is a tuple: (dataset parameters, split number, model parameters,
# validation scores, test scores).
results = []
for par_dtst in tqdm.tqdm(grid_dataset):

    # The shared `random_state` object is reused here, so the generated splits
    # depend on the order in which the dataset settings are visited.
    splt = ShuffleSplit(**par_dtst, random_state=random_state)
    for cv, (ind_train, ind_valid) in enumerate(splt.split(ind_dvlp)):

        # `splt.split` yields positions within `ind_dvlp`; map them back to flat
        # indices into `R_full` and build the sparse train / validation matrices
        ind_train, ind_valid = ind_dvlp[ind_train], ind_dvlp[ind_valid]
        R_train = get_submatrix(R_full, ind_train)
        R_valid = get_submatrix(R_full, ind_valid)

        # Run the experiment: try every model setting on this split
        for par_mdl in grid_model:  # tqdm.tqdm(, desc="cv %02d" % (cv,))

            # fit the model; the lasso coefficient is `lasso_mlt` times `C_group`
            C_lasso = par_mdl["lasso_mlt"] * par_mdl["C_group"]
            C_group, C_ridge = par_mdl["C_group"], par_mdl["C_ridge"]
            imc = SparseGroupIMCClassifier(par_mdl["rank"], n_threads=3, random_state=42,
                                           C_lasso=C_lasso, C_group=C_group, C_ridge=C_ridge)
            imc.fit(X, Y, R_train)

            # compute the class probabilities for all entries, then keep only
            # the validation and the test entries
            prob_full = imc.predict_proba(X, Y)  # uses own copies of W, H
            prob_valid = get_submatrix(prob_full, ind_valid)
            prob_test = get_submatrix(prob_full, ind_test)

            # score the predictions against the true validation / test labels
            scores_valid = mc_get_scores(R_valid, prob_valid)
            scores_test = mc_get_scores(R_test, prob_test)

            # record the results
            results.append((par_dtst, cv, par_mdl, scores_valid, scores_test))
        # end for
    # end for
# end for

100%|██████████| 34/34 [1:46:30<00:00, 187.97s/it]


Save the results in a pickle.

In [14]:
# Persist the collected results as a gzip-compressed pickle.
# "wb" is the documented binary-write mode of `gzip.open` (the former "wb+"
# only worked by accident), and the compression level is passed by keyword
# so that the meaning of the `4` is explicit.
with gzip.open(filename_output, "wb", compresslevel=4) as fout:
    pickle.dump(results, fout)

<br/>
<hr/>