# Chapter 7: stability selection plots
Plot the result of the variable selection procedure.

In [38]:
import os
import cPickle as pkl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Apply matplotlib's 'seaborn-paper' style sheet to the figures drawn below
plt.style.use('seaborn-paper')
# Keep matplotlib's built-in text rendering (no external LaTeX toolchain)
plt.rc('text', usetex=False)

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and pandas indexing warnings -- consider narrowing
# the filter to the specific categories that are known to be noisy.
import warnings
warnings.filterwarnings('ignore')

# 0. Load `.pkl` files

In [92]:
def load_pkl(filename):
    """Return the object deserialized from the pickle file at `filename`.

    The file is opened in binary mode and closed again before returning.
    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(filename, 'rb') as handle:
        return pkl.load(handle)

# Model names
names = ['gradient_boosting', 'random_forests', 'l1l2',
        'l2_logistic_regression', 'l1_logistic_regression',
        'linear_svc_l2', 'linear_svc_l1']

# Filename suffixes: <name>_scores.pkl and <name>_coefs.pkl
tails = ['_scores', '_coefs']

# Results keyed by model name; models whose pickles cannot be read are absent
scores = {}
coefs = {}

# Directory holding the pickles written by the experiment scripts
pkl_dir = os.path.join('..', '..', 'scripts', 'aism')

# Load the pickles
for name in names:
    try:
        scores[name] = load_pkl(os.path.join(pkl_dir, name + tails[0] + '.pkl'))
        coefs[name] = load_pkl(os.path.join(pkl_dir, name + tails[1] + '.pkl'))
    except Exception:
        # Best effort: a missing or unreadable pickle means the experiment has
        # not produced its output yet. `Exception` (rather than a bare
        # `except:`) keeps KeyboardInterrupt / SystemExit working.
        print('{} not finished yet'.format(name))

random_forests not finished yet


# 1. Plot classification cv scores (step 1)

In [93]:
def plot_score(scores, show_figure=False, tag=''):
    """Collect per-iteration CV metrics into a DataFrame and optionally plot them.

    Parameters
    ----------
    scores : iterable of dict
        One dict per cross-validation iteration; each must provide the keys
        'recall', 'AUC', 'MCC', 'precision' and 'accuracy'.
    show_figure : bool, optional (default False)
        When True, draw one 20-bin histogram per metric on a 3x2 grid and
        save the figure to ../../images/aism_scores_<tag>.png.
    tag : str, optional
        Filename tail; also used as the figure super-title (underscores
        replaced by spaces, title-cased).

    Returns
    -------
    pandas.DataFrame
        One row per CV iteration and one column per metric.

    Raises
    ------
    KeyError
        If one of the dicts in `scores` lacks any of the five metrics.
    """
    metrics = ('recall', 'AUC', 'MCC', 'precision', 'accuracy')

    # Explore each cv iter (a single pass over `scores`)
    collected = {metric: [] for metric in metrics}
    for cv in scores:
        for metric in metrics:
            collected[metric].append(cv[metric])
    cv_results_ = pd.DataFrame(collected)

    if show_figure:
        fig = plt.figure(dpi=300)

        # Plot each metric; the title reports mean and (population, ddof=0)
        # standard deviation as computed by NumPy
        for i, metric in enumerate(cv_results_.columns):
            ax = fig.add_subplot(3, 2, i + 1)
            ax.hist(cv_results_[metric], bins=20)
            ax.set_title(r'{} = ({:.3f} $\pm$ {:.3f})'.format(metric,
                                                              np.mean(cv_results_[metric]),
                                                              np.std(cv_results_[metric])))
        fig.suptitle('{}'.format(tag.replace('_', ' ').title()), y=1.05)
        fig.tight_layout()
        # The super-title sits at y=1.05, i.e. above the figure canvas:
        # without a tight bounding box it is cropped out of the saved PNG.
        fig.savefig(os.path.join('..', '..', 'images', 'aism_scores_{}.png'.format(tag)),
                    bbox_inches='tight')

    return cv_results_

In [95]:
# Create the scoreboard: one row per model, one "(mean ± std)" cell per metric
scoreboard = pd.DataFrame(index=names, columns=[u'AUC', u'MCC', u'accuracy',
                                                u'precision', u'recall'])
for name in names:
    try:
        df = plot_score(scores[name], show_figure=False, tag=name)
        # Aggregate once per model rather than once per column
        means = df.mean()
        stds = df.std()
        for col in scoreboard.columns:
            # One `.loc` call writes into the frame itself; the chained form
            # `scoreboard[col].loc[name] = ...` may assign to a temporary copy.
            scoreboard.loc[name, col] = r'({:.3f} $\pm$ {:.3f})'.format(means[col],
                                                                        stds[col])
    except Exception:
        # Best effort: models without loaded scores keep an empty row.
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt through.
        print('{} not finished yet'.format(name))

scoreboard

random_forests not finished yet


Unnamed: 0,AUC,MCC,accuracy,precision,recall
gradient_boosting,(0.802 $\pm$ 0.013),(0.606 $\pm$ 0.026),(0.805 $\pm$ 0.013),(0.821 $\pm$ 0.022),(0.832 $\pm$ 0.039)
random_forests,,,,,
l1l2,(0.807 $\pm$ 0.016),(0.613 $\pm$ 0.032),(0.806 $\pm$ 0.016),(0.845 $\pm$ 0.031),(0.799 $\pm$ 0.042)
l2_logistic_regression,(0.811 $\pm$ 0.015),(0.619 $\pm$ 0.029),(0.807 $\pm$ 0.015),(0.864 $\pm$ 0.020),(0.775 $\pm$ 0.027)
l1_logistic_regression,(0.817 $\pm$ 0.014),(0.630 $\pm$ 0.028),(0.813 $\pm$ 0.014),(0.865 $\pm$ 0.018),(0.787 $\pm$ 0.023)
linear_svc_l2,(0.810 $\pm$ 0.016),(0.616 $\pm$ 0.032),(0.806 $\pm$ 0.016),(0.858 $\pm$ 0.023),(0.782 $\pm$ 0.028)
linear_svc_l1,(0.818 $\pm$ 0.015),(0.632 $\pm$ 0.029),(0.814 $\pm$ 0.015),(0.870 $\pm$ 0.021),(0.782 $\pm$ 0.027)


# 2. Plot variable ranking

In [109]:
def flatten(x):
    """Recursively flatten nested lists / numpy arrays into one flat list.

    Anything whose type is not exactly `list` or `np.ndarray` (scalars,
    strings, tuples, ...) is treated as a leaf and returned wrapped in a
    single-element list.
    """
    if type(x) not in (list, np.ndarray):
        return [x]

    flat = []
    for item in x:
        flat.extend(flatten(item))
    return flat

In [111]:
from collections import Counter

# For every model, count how many times each variable name occurs across
# all the entries stored in `coefs[name]`
for name in names:
    try:
        # `print(...)` with a single argument behaves the same on Python 2
        # and 3, and matches the call style used in the rest of the notebook
        print(Counter(flatten([c.tolist() for c in coefs[name]])))
    except Exception:
        # Best effort: models without loaded coefficients are skipped.
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt through.
        print('{} not finished yet'.format(name))

Counter({'FIM (012': 74, 'FIM (SUB6': 26})
random_forests not finished yet
Counter({'ABILH001': 2915})
Counter({'ABILH001': 16498})
Counter({'ABILH001': 6132})
Counter({'FIM (012': 100, 'FIM (011': 100, 'FIM (010': 100, 'HADS 007': 100, 'SDMT TOT': 100, 'FIM (006': 99, 'FIM (SUB5': 98, 'ABILH018': 97, 'HADS 002': 95, 'FIM (SUB6': 94, 'HADS 013': 92, 'FIM (016': 92, 'FIM (017': 91, 'ABILH012': 90, 'FIM (008': 89, 'EDINB001_2': 88, 'ABILH021': 86, 'ABILH009': 80, 'FIM (003': 79, 'MFIS 002': 74, 'HADS 005': 72, 'FIM (002': 69, 'FIM (009': 65, 'ABILH007': 63, 'MFIS 003': 61, 'MFIS 009': 61, 'MFIS 012': 60, 'EDINB001_1': 60, 'MFIS 014': 59, 'FIM (004': 57, 'EDINB002_0': 55, 'MOCA 005': 55, 'ABILHTOT': 55, 'HEIG': 55, 'ABILHSUB1': 54, 'LIFE 004': 53, 'FIM (014': 52, 'FIM (SUB3': 50, 'HADS 012': 49, 'MOCA 001': 49, 'ABILH020': 49, 'LIFE 010': 49, 'MFIS 019': 49, 'MFIS SUB1': 49, 'HADS 003': 48, 'EDINB002_2': 48, 'HADS SUB1': 47, 'FIM (015': 46, 'OAB-Q002': 45, 'HADS SUB2': 45, 'MOCA 010': 44,