# Running experiments on the Mushroom dataset

In [1]:
import os
import time
import gzip
import pickle
import warnings

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np

from tqdm import TqdmSynchronisationWarning
warnings.simplefilter("ignore", TqdmSynchronisationWarning)

The paths

In [2]:
# Root of the experiment tree on the shared storage, and the dataset folder inside it.
PATH_TO_EXP = '/cobrain/groups/ml_group/experiments/dustpelt/imc_exp/'
PATH_DATA = os.path.join(PATH_TO_EXP, 'data/mushrooms')
# Alternative for a local checkout of the data:
#PATH_DATA = "../data/mushrooms"

# Folder that receives timestamped copies of earlier result files.
# `makedirs(..., exist_ok=True)` creates missing parents and is a no-op when the
# directory is already there, so no separate existence check is needed.
PATH_ARCHIVE = os.path.join(PATH_DATA, "arch_sgimc")
os.makedirs(PATH_ARCHIVE, exist_ok=True)

Filenames

In [3]:
# Base names (relative to the data folder) of the staged input dataset
# and of the file the experiment results are written to.
filenames = dict(
    input="staged_dataset.gz",
    output="results_sgimc.gz",
)

The dataset and results

In [4]:
# Full paths of the staged dataset (read) and of the results file (written).
filename_input = os.path.join(PATH_DATA, filenames["input"])
filename_output = os.path.join(PATH_DATA, filenames["output"])

# Do not overwrite earlier results: if a results file is already present, move it
# into the archive folder under a name prefixed with the current timestamp.
if os.path.exists(filename_output):
    mdttm = time.strftime("%Y%m%d_%H%M%S")
    os.rename(filename_output,
              os.path.join(PATH_ARCHIVE, mdttm + filenames["output"]))

The train-test splitting

In [5]:
from sgimc.utils import mc_split

A helper function to extract a sparse submatrix from a dense one according to the provided indices.

In [6]:
from sgimc.utils import get_submatrix

The scores collected in the experiment

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from scipy.sparse import coo_matrix


def mc_get_scores(R_true, R_prob):
    """Compute confusion-matrix counts and ROC AUC for predicted probabilities.

    Parameters
    ----------
    R_true : object with a `.data` array
        Ground-truth labels, expected to be ±1 (`-1` is the negative class).
        Presumably a scipy sparse matrix -- only `.data` is read here.
    R_prob : object with a `.data` array
        Predicted probabilities of the positive class. `R_prob.data` must be
        aligned element-wise with `R_true.data`.

    Returns
    -------
    dict
        Keys "tn", "fn", "fp", "tp" (counts at the 0.5 threshold) and "auc".

    Raises
    ------
    ValueError
        Propagated from `roc_auc_score`, e.g. when the AUC is undefined
        because `R_true.data` holds a single class.
    """
    # hard ±1 predictions: probabilities above 0.5 are positive
    R_pred = np.where(R_prob.data > 0.5, 1, -1)

    # map the ±1 labels onto {0, 1} (`-1` -> 0, `+1` -> 1)
    y_pred = ((R_pred + 1) // 2).astype(int)
    y_true = ((R_true.data + 1) // 2).astype(int)

    # Pin the label set: without `labels` the matrix shrinks to 1x1 when only one
    # class occurs in both arrays, and the `[1, ...]` lookups below would fail.
    cnfsn = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])

    # rows are true classes, columns are predicted classes
    return {"tn": cnfsn[0, 0], "fn": cnfsn[1, 0],
            "fp": cnfsn[0, 1], "tp": cnfsn[1, 1],
            "auc": roc_auc_score(R_true.data, R_prob.data)}

Fix the seed

In [8]:
# One shared generator with a fixed seed: every random split below draws from it,
# so the results depend on the cells being run in order from a fresh kernel.
random_state = np.random.RandomState(seed=0x0BADCAFE)

Load the dataset

In [14]:
from sgimc.utils import load, save

# Read the staged dataset. The three returned objects are later passed to the model as
# `fit(X, Y, R)` / `predict_proba(X, Y)`; presumably `X` and `Y` are the row and column
# side-features and `R_full` the full label matrix -- TODO confirm against `sgimc.utils.load`.
X, Y, R_full = load(filename_input)

Get the development and test datasets

In [10]:
# Shares passed to the splitter: 0.9 for development (train + validation), 0.1 for the test set.
dvlp_size = 0.9
test_size = 0.1

# A single split of `R_full` into development and test indices.
ind_dvlp, ind_test = next(
    mc_split(R_full, train_size=dvlp_size, test_size=test_size,
             n_splits=1, random_state=random_state)
)

# The test part is fixed once and reused for every model in the experiment.
R_test = get_submatrix(R_full, ind_test)

Set up the parameter grid

In [11]:
from sklearn.model_selection import ParameterGrid

# Dataset settings: the train-size values to sweep over, and the number of CV folds.
grid_dataset = ParameterGrid(dict(
    train_size=np.arange(0.00001, 0.001001, 0.00003),
    n_splits=[5],
))

# Model settings: only the group penalty `C_group` varies; the lasso penalty is
# switched off, and the ridge penalty and the rank are held fixed.
grid_model = ParameterGrid(dict(
    C_lasso=[0],
    C_group=[2e-5, 2e-4, 2e-3],
    C_ridge=[1e-5],
    rank=[5],
))

Load the Sparse Group IMC class for binary classification.

In [12]:
from sgimc import SparseGroupIMCClassifier

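Run the experiment: for each train size and model setting, fit the IMC model on the train set and score it on the held-out test set, then run k-fold cross-validation on the train set and score each fold's model on its validation part.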

In [13]:
from tqdm import tqdm
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.model_selection import train_test_split


def make_model(rank, C_lasso, C_group, C_ridge):
    """Build an unfitted `SparseGroupIMCClassifier` with the experiment-wide settings.

    The thread count (4) and the model seed (42) are fixed here, so the full-train
    fit and every CV fold use identically configured models.
    """
    return SparseGroupIMCClassifier(rank, n_threads=4, random_state=42,
                                    C_lasso=C_lasso, C_group=C_group, C_ridge=C_ridge)


# One record per (train size, model, CV fold); the test score of the model fitted
# on the full train subset is repeated in each of its fold records.
results = []
for par_dtst in tqdm(grid_dataset):
    # prepare the train dataset: take the specified share from the beginning of the
    # index array (`shuffle=False`, so the split is a plain prefix of `ind_dvlp`).
    # `train_size` is a share of the full data, hence the rescaling by `dvlp_size`.
    ind_train_all, _ = train_test_split(ind_dvlp, shuffle=False, random_state=random_state,
                                        test_size=(1 - (par_dtst["train_size"] / dvlp_size)))

    # the full train matrix does not depend on the model: build it once per train size
    R_train_all = get_submatrix(R_full, ind_train_all)

    # Run the experiment: the model
    for par_mdl in grid_model:
        # set up the model
        C_lasso, C_group, C_ridge = par_mdl["C_lasso"], par_mdl["C_group"], par_mdl["C_ridge"]
        imc = make_model(par_mdl["rank"], C_lasso, C_group, C_ridge)

        # fit on the whole train subset (not on the entire development set)
        imc.fit(X, Y, R_train_all)

        # get the score on the held-out test set
        prob_full = imc.predict_proba(X, Y)
        prob_test = get_submatrix(prob_full, ind_test)
        scores_test = mc_get_scores(R_test, prob_test)

        # run the k-fold CV; the shared `random_state` is consumed by every split,
        # so the folds differ from one model setting to the next
        splt = KFold(par_dtst["n_splits"], shuffle=True, random_state=random_state)
        for cv, (ind_train, ind_valid) in enumerate(splt.split(ind_train_all)):

            # KFold yields positions within `ind_train_all`: map them back to dataset indices
            ind_train, ind_valid = ind_train_all[ind_train], ind_train_all[ind_valid]
            R_train = get_submatrix(R_full, ind_train)
            R_valid = get_submatrix(R_full, ind_valid)

            # fit a fresh model on the fold's train part
            imc = make_model(par_mdl["rank"], C_lasso, C_group, C_ridge)
            imc.fit(X, Y, R_train)

            # compute the class probabilities
            prob_full = imc.predict_proba(X, Y)  # uses own copies of W, H
            prob_valid = get_submatrix(prob_full, ind_valid)

            scores_valid = mc_get_scores(R_valid, prob_valid)

            # record the results
            results.append({"train_size": par_dtst["train_size"],
                            "C_group": par_mdl["C_group"],
                            "cv": cv,
                            "val_score": scores_valid["auc"],
                            "test_score": scores_test["auc"]}
                          )
        # end for
    # end for
# end for

# Save the results in a pickle

# "wb" is the documented binary-write mode of `gzip.open` (the file is never read
# back here, so no "+" is needed); 4 is the gzip compression level.
with gzip.open(filename_output, "wb", compresslevel=4) as fout:
    pickle.dump(results, fout)

100%|██████████| 34/34 [1:39:47<00:00, 176.10s/it]


In [None]:
# save()

<br/>
<hr/>