In [1]:
import os

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ipywidgets import Layout
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
def get_model(model,
              sampler = None,
              pca = False,
              ensemble = None,
              box_type = None):
    '''
    Build a classifier, optionally preceded by a kernel sampler and a PCA
    step, and optionally wrapped in a bagging ensemble.

    Parameters
    ----------
    model: string, 'dt', 'linear_svc' or 'logit'
    sampler: string, 'rbf' or 'nystroem', or None
    pca: bool
    ensemble: integer >= 1 (number of bagged estimators) or None
    box_type: string, 'black' or 'grey'. Required when ensemble is an
        integer and must be None when ensemble is None.
        'black': only the final model is bagged; it is the last step of a
                 single sampler -> pca -> model pipeline.
        'grey':  the whole sampler -> pca -> model pipeline is bagged.

    Returns
    -------
    clf: a model with the parameters specified. It is a Pipeline with steps
        'sampler', 'pca' and 'model', except for box_type = 'grey', where it
        is a BaggingClassifier whose estimator is that Pipeline.

    Raises
    ------
    ValueError: if any argument has an unsupported value, or if ensemble and
        box_type are not given together.
    '''
    if model not in ['dt', 'linear_svc', 'logit']:
        raise ValueError('model {0} is not supported'.format(model))
    if sampler not in ['rbf', 'nystroem', None]:
        raise ValueError('sampler {0} is not supported'.format(sampler))
    if type(pca) != bool:
        raise ValueError('pca is a boolean')
    if type(ensemble) not in [int, type(None)]:
        raise ValueError('Wrong value for ensemble')
    if isinstance(ensemble,int) and ensemble < 1:
        raise ValueError('Number of estimators must be greater than 0')
    if box_type not in ['black', 'grey', None]:
        raise ValueError("box_type must be 'black', 'grey' or None")
    if box_type is not None and ensemble is None:
        raise ValueError("box_type doesn't match with ensemble")
    if ensemble is not None and box_type is None:
        # Without this check no branch below would build a model and the
        # function would fail with an UnboundLocalError instead.
        raise ValueError("box_type is required when ensemble is given")

    # Disabled steps are replaced by an identity transformer so the pipeline
    # always has the same three named steps.
    if sampler == 'rbf':
        s = RBFSampler(gamma = 0.2)
    elif sampler == 'nystroem':
        s = Nystroem(gamma = 0.2)
    else:
        s = FunctionTransformer(None, validate = False)

    if pca:
        # A float n_components keeps enough components to explain that
        # fraction of the variance; this requires svd_solver = "full".
        p = PCA(n_components = 0.9, svd_solver = "full")
    else:
        p = FunctionTransformer(None, validate = False)

    if model == 'dt':
        m = DecisionTreeClassifier()
    elif model == 'linear_svc':
        m = LinearSVC(C = 1)
    else:
        # NOTE(review): multi_class is deprecated in recent scikit-learn
        # releases -- confirm against the version this project pins.
        m = LogisticRegression(C = 1, multi_class = 'multinomial', solver = 'lbfgs')

    if ensemble is None:
        return Pipeline([
            ('sampler', s),
            ('pca', p),
            ('model', m),
        ])

    # The wrapped estimator is passed positionally because its keyword was
    # renamed from `base_estimator` to `estimator` in newer scikit-learn.
    if box_type == 'black':
        bag = BaggingClassifier(m, n_estimators = ensemble)
        return Pipeline([
            ('sampler', s),
            ('pca', p),
            ('model', bag),
        ])

    # box_type == 'grey'
    pipe = Pipeline([
        ('sampler', s),
        ('pca', p),
        ('model', m),
    ])
    return BaggingClassifier(pipe, n_estimators = ensemble)

In [3]:
def get_label(model, sampler, pca, box_type, train_test):
    '''Compose the legend label describing one trained configuration.

    The label is the concatenation of the enabled options, each followed by
    an underscore, ending with "<train_test> score".

    Parameters
    ----------
    model: string, model name
    sampler: string, 'rbf' or 'nystroem', or  None
    pca: bool
    box_type: string, 'black', 'grey' or None
    train_test: string, 'train' or 'test'

    Raises
    ------
    ValueError: if sampler or train_test has an unsupported value
    '''
    if sampler not in (None, 'rbf', 'nystroem'):
        raise ValueError("sampler must be 'rbf', 'nystroem' or None")
    if train_test not in ('train', 'test'):
        raise ValueError("train_test must be 'train' or 'test'")

    # Collect the pieces in display order; disabled options add nothing.
    pieces = [model + "_"]
    if sampler is not None:
        pieces.append(sampler + "_")
    if pca:
        pieces.append("pca_")
    if box_type is not None:
        pieces.append(box_type + "_")
    pieces.append(train_test + " score")

    return "".join(pieces)

In [4]:
def get_model_scores(model,
              dataset,
              features = None,
              sampler = None,
              pca = False,
              ensemble = None,
              box_type = None):
    '''
    Train the model described by the arguments and collect its train and
    test scores, once per requested number of sampler features.

    Parameters
    ----------
    model: string, 'dt', 'linear_svc' or 'logit'
    dataset: dictionary with keys 'data_train', 'data_test', 'target_train', 'target_test'
    features: list with the numbers of sampler components to test, or None
    sampler: string, 'rbf' or 'nystroem', or  None
    pca: bool
    ensemble: integer or None
    box_type: string, 'black', 'grey' or None

    Returns
    -------
    A tuple of two dictionarys, (train_dic,test_dic), each one with
    keys 'absi' (x values), 'ord' (scores) and 'label'

    Raises
    ------
    ValueError: if features is neither a list nor None, if a sampler is
        given without features, if features is given but the model has no
        sampler n_components parameter, or if get_model rejects the
        configuration.
    '''
    if not isinstance(features, (list, type(None))):
        raise ValueError('features must be an ordered list of integers or None')

    if features is None and sampler is not None:
        raise ValueError('features is needed with sampler')

    data_train = dataset['data_train']
    data_test = dataset['data_test']
    target_train = dataset['target_train']
    target_test = dataset['target_test']

    clf = get_model(model, sampler, pca, ensemble, box_type)
    train_label = get_label(model, sampler, pca, box_type, 'train')
    test_label = get_label(model, sampler, pca, box_type, 'test')

    if features is None:
        clf.fit(data_train, target_train)
        train_score = clf.score(data_train, target_train)
        test_score = clf.score(data_test, target_test)
        # Without a sampler there is a single score, drawn as a horizontal
        # line between both ends of the range selected in the GUI.
        # NOTE(review): features_selector is a notebook-level widget that
        # must exist when this branch runs.
        train_dic = {
            'absi': list(features_selector.value),
            'ord': [train_score, train_score],
            'label': train_label,
        }
        test_dic = {
            'absi': list(features_selector.value),
            'ord': [test_score, test_score],
            'label': test_label,
        }
    else:
        # The sampler is nested at a different depth depending on the model:
        # 'sampler__n_components' in a Pipeline, but prefixed with the
        # estimator parameter name when the Pipeline is wrapped by a
        # BaggingClassifier ('grey' box). Look the real name up instead of
        # hard-coding it.
        n_components_param = next(
            (name for name in clf.get_params(deep = True)
             if name.endswith('sampler__n_components')),
            None)
        if n_components_param is None:
            raise ValueError('features needs a sampler with n_components')

        train_scores = []
        test_scores = []
        for f in features:
            clf.set_params(**{n_components_param: f})
            clf.fit(data_train, target_train)
            train_scores.append(clf.score(data_train, target_train))
            test_scores.append(clf.score(data_test, target_test))

        train_dic = {
            'absi': features,
            'ord': train_scores,
            'label': train_label,
        }
        test_dic = {
            'absi': features,
            'ord': test_scores,
            'label': test_label,
        }
    return train_dic, test_dic

In [5]:
# real currentwork
def get_data(dataset_name, prop_train = 2/3, n_ins = None, dir_path = '../../datasets/'):
    '''
    Returns a dictionary with keys ['data_train', 'data_test', 'target_train', 'target_test'],
    with the data according to dataset name

    The file read is <dir_path>/<dataset_name>/<dataset_name>.csv and must
    contain a 'target' column. Rows are taken in file order: the first ones
    form the train split and the following ones the test split; no
    shuffling is done.

    Parameters
    ----------
    dataset_name: str, name of the dataset. Possible values:
        [
        "covertype",
        "digits",
        "fall_detection",
        "mnist",
        "pen_digits",
        "satellite",
        "segment",
        "vowel",
        ]
    prop_train: float, the proportion of instances in the train dataset, default: 2/3
    n_ins: get a subset of n_ins instances, if total number is smaller, is ignored
    dir_path: str, directory containing one sub-directory per dataset,
        default: '../../datasets/'

    Return
    ------
    A dictionary with the keys 'data_train' and 'data_test' (DataFrames
    without the 'target' column) and 'target_train' and 'target_test'
    (the matching slices of the 'target' column)

    Raises
    ------
    ValueError: if dataset_name is unknown, prop_train is not strictly
        between 0 and 1, or n_ins is not a positive integer or None
    '''
    options = [
        "covertype",
        "digits",
        "fall_detection",
        "mnist",
        "pen_digits",
        "satellite",
        "segment",
        "vowel",
        ]
    if dataset_name not in options:
        raise ValueError("{} dataset is not available".format(dataset_name))
    if not 0 < prop_train < 1:
        raise ValueError("prop_train must be 0 < prop_train < 1")
    if not isinstance(n_ins, (type(None),int)):
        raise ValueError("Bad type for n_ins")
    if isinstance(n_ins, int) and n_ins <= 0:
        raise ValueError("n_ins must be positive")

    # os.path.join works whether or not dir_path ends with a separator.
    file_path = os.path.join(dir_path, dataset_name, dataset_name + '.csv')
    df = pd.read_csv(file_path)

    n_total = df.shape[0]
    if n_ins is not None:
        n_total = min(n_total, n_ins)
    # Round up so the train split gets the extra row on an uneven split.
    n_train = int(np.ceil(n_total * prop_train))

    data = df.drop(labels = 'target', axis = 1)
    target = df['target']

    # Positional slicing for both, so data and target rows always match.
    return {
        'data_train': data.iloc[:n_train],
        'data_test': data.iloc[n_train:n_total],
        'target_train': target.iloc[:n_train],
        'target_test': target.iloc[n_train:n_total],
    }
    

In [6]:
def get_params_from_models_bar(mod_bar):
    '''
    Returns a dictionary with the needed keys to pass to get_model_scores

    The widgets are read by position, so mod_bar.children must be, in order:
    model selector, sampler selector, box type selector, number of
    estimators selector and PCA checkbox. The dataset, its size and the
    range of features come from the notebook-level widgets
    dataset_selector, size_selector and features_selector.

    Parameters
    ----------
    mod_bar: a HBox with the widgets

    Return
    ------
    A dictionary with keys [model, dataset, features, sampler, pca, ensemble, box_type]
    '''
    # TODO: reading the widgets by position works but is fragile; it should
    # be reworked.
    children = mod_bar.children

    dataset_name = dataset_selector.value
    di = get_data(dataset_name, n_ins = size_selector.value)

    # The dropdowns use the string "None" for "no choice"; get_model expects
    # a real None.
    sampler = None if children[1].value == "None" else children[1].value
    box_choice = children[2].value
    box_type = None if sampler is None or box_choice == "None" else box_choice
    # ensemble and box_type must be given together (or not at all).
    ensemble = None if box_type is None else children[3].value

    if sampler is None:
        features = None
    else:
        low, high = features_selector.value
        if high - low > 50:
            # Wide range: 50 evenly spaced values, both ends included.
            features = np.linspace(low, high, num = 50, dtype = np.int64).tolist()
        else:
            # Narrow range: every integer, both ends included, so the list
            # is never empty even when low == high.
            features = np.arange(low, high + 1).tolist()

    d = {
        'model': children[0].value,
        'dataset': di,
        'features': features,
        'sampler': sampler,
        'pca': children[4].value,
        'ensemble': ensemble,
        'box_type': box_type,
    }
    return d

In [7]:
def get_all_model_scores():
    '''
    Train every model described by the model bars in the GUI and plot the
    scores obtained.

    Reads the notebook-level widgets models_bar, dataset_selector and
    size_selector.

    Returns
    -------
    A matplotlib figure with two subplots sharing the y axis: test scores on
    the left and train scores on the right, one line per model bar. The
    figure is closed in pyplot before being returned (presumably so that
    only the caller decides when it is displayed).
    '''
    # One (train_dic, test_dic) pair per model bar; each child is an HBox.
    score_pairs = [
        get_model_scores(**get_params_from_models_bar(bar))
        for bar in models_bar.children
    ]

    fig = plt.figure(figsize = (12.8,4.8))
    test_ax = fig.add_subplot(121)
    train_ax = fig.add_subplot(122, sharey = test_ax)

    test_ax.set_title("Test scores")
    train_ax.set_title("Train scores")
    # TODO: add the correct title
    fig.suptitle("{}, {} instances".format(dataset_selector.value, size_selector.value))

    for train_curve, test_curve in score_pairs:
        test_ax.plot(test_curve['absi'], test_curve['ord'], label = test_curve['label'])
        train_ax.plot(train_curve['absi'], train_curve['ord'], label = train_curve['label'])

    for ax in (test_ax, train_ax):
        ax.legend()

    plt.close()
    return fig

In [8]:
'''
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.decomposition import PCA
'''

'\nfrom sklearn.preprocessing import FunctionTransformer\nimport numpy as np\nfrom sklearn.decomposition import PCA\n'

In [9]:
# def get_new_model_bar():
#     '''
#     Returns
#     -------
#     Returns a new HBox with the widgets to define a new training model
#     '''
#     model_selector = widgets.Dropdown(
#         options=['dt', 'logit', 'linear_svc'],
#         value='dt',
#         layout = Layout(flex = '0 3 auto'),
#         #description=':',
#     )
# 
#     sampler_selector = widgets.Dropdown(
#         options=['None', 'rbf', 'nystroem'],
#         value='rbf',
#         layout = Layout(flex = '1 3 auto'),
#         #description=':',
#     )
#     box_type_selector = widgets.Dropdown(
#         options=['None', 'black', 'grey'],
#         value='black',
#         layout = Layout(flex = '1 3 auto'),
#         #description=':',
#     )
#     '''
#     features_selector = widgets.IntRangeSlider(
#         value=[30, 100],
#         min=30,
#         max=400,
#         step=10,
#         layout = Layout(flex = '0 1 auto'),
#         #description=':',
#     )
#     '''
#     n_estimators_selector = widgets.IntSlider(
#         value=30,
#         min=2,
#         max=200,
#         step=1,
#         layout = Layout(flex = '1 1 auto'),
#         #description=':',
#     )
#     pca_checkbox = widgets.Checkbox(
#         value=False,
#         layout = Layout(flex = '0 3 auto'),
#         #description='',
#     )
#     hb = widgets.HBox([
#         model_selector,
#         sampler_selector,
#         box_type_selector,
#         #features_selector,
#         n_estimators_selector,
#         pca_checkbox,
#     ])
#     return hb

In [10]:
# def add_model_bar(m):
#     '''
#     Append a new model bar to m, which has the same values as the last
#     model bar in m
#     Parameters
#     ----------
#     m: Is a VBox containing 1 or more HBox describing the new model
#     '''
#     if len(m.children) < 1:
#         raise ValueError('At least one model bar is needed')
#     copy_bar = m.children[-1]
# 
#     new_model_bar = get_new_model_bar()
# 
#     for i,c in enumerate(copy_bar.children):
#         new_model_bar.children[i].value = c.value
# 
#     m.children = tuple(list(m.children) + [new_model_bar])

In [11]:
# def remove_model_bar(m):
#     '''
#     Remove the last model bar of m, if there are at least 2
#     Parameters
#     ----------
#     m: Is a VBox containing 2 or more HBox describing models
#     '''
#     if len(m.children) < 2:
#         raise ValueError('minimum number of model bars reached')
#     m.children = tuple(list(m.children)[:-1])

In [12]:
'''headers = widgets.HBox([
    widgets.Label("Model"),
    widgets.Label("Sampling"),
    widgets.Label("Box Type"),
    widgets.Label("Number estimators"),
    widgets.Label("PCA"),
], layout = widgets.Layout(justify_content = 'space-between'))'''

'headers = widgets.HBox([\n    widgets.Label("Model"),\n    widgets.Label("Sampling"),\n    widgets.Label("Box Type"),\n    widgets.Label("Number estimators"),\n    widgets.Label("PCA"),\n], layout = widgets.Layout(justify_content = \'space-between\'))'

In [13]:
'''features_selector = widgets.IntRangeSlider(
    value=[30, 100],
    min=30,
    max=400,
    step=10,
    layout = Layout(flex = '0 1 auto'),
    #description=':',
)'''

"features_selector = widgets.IntRangeSlider(\n    value=[30, 100],\n    min=30,\n    max=400,\n    step=10,\n    layout = Layout(flex = '0 1 auto'),\n    #description=':',\n)"

In [14]:
'''models_bar = widgets.VBox([get_new_model_bar()])'''

'models_bar = widgets.VBox([get_new_model_bar()])'

In [15]:
'''cool_models_bar = widgets.VBox([headers, models_bar])'''

'cool_models_bar = widgets.VBox([headers, models_bar])'

In [16]:
'''def add_model_bar_wraper(e):
    add_model_bar(models_bar)

add_model_bar_bt = widgets.Button(
    description='Add model',
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Add a new moder bar to train',
    #icon='check'
)
add_model_bar_bt.on_click(add_model_bar_wraper)'''



In [17]:
'''def remove_model_bar_wraper(e):
    if len(models_bar.children) > 1:
        remove_model_bar(models_bar)

remove_model_bar_bt = widgets.Button(
    description='Remove model',
    button_style='warning', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Remove the las model bar, if possible',
    #icon='check'
)
remove_model_bar_bt.on_click(remove_model_bar_wraper)'''



In [18]:
def calculate_bt_wrapper(e):
    '''Button callback: train the models and show the resulting figure.

    Parameters
    ----------
    e: the argument passed by the button on click (unused)
    '''
    # The figure is computed first, so the previous output is only cleared
    # once the new one is ready.
    figure = get_all_model_scores()

    replace_previous = clear_output_button.value
    if replace_previous:
        graphs_output.clear_output(wait = True)

    with graphs_output:
        display(figure)

In [19]:
'''calculate_bt = widgets.Button(
    description='Calculate',
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Calculate the models',
    #icon='check'
)
calculate_bt.on_click(calculate_bt_wrapper)'''



In [20]:
# Toggle read by calculate_bt_wrapper: while it is pressed (the initial
# state), graphs_output is cleared before a new figure is displayed.
clear_output_button = widgets.ToggleButton(
    description='Clear Previous',
    tooltip='Clear Previous Output',
    icon='check',
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    value=True,
)

In [21]:
'''
size_selector = widgets.RadioButtons(
    options=[1000,2000,5000,10000],
    value=2000,
    #description='Pizza topping:',
    disabled=False,
    #orientation = 'vertical'
)

cool_size_selector = widgets.VBox([widgets.Label("Size of the dataset"), size_selector])
'''

'\nsize_selector = widgets.RadioButtons(\n    options=[1000,2000,5000,10000],\n    value=2000,\n    #description=\'Pizza topping:\',\n    disabled=False,\n    #orientation = \'vertical\'\n)\n\ncool_size_selector = widgets.VBox([widgets.Label("Size of the dataset"), size_selector])\n'

In [22]:
'''
dataset_selector = widgets.Dropdown(
    options = [
        "covertype",
        "digits",
        "fall_detection",
        "mnist",
        "pen_digits",
        "satellite",
        "segment",
        "vowel",
        ],
    value = 'digits',
    descripttion = 'Dataset:'
)
'''

'\ndataset_selector = widgets.Dropdown(\n    options = [\n        "covertype",\n        "digits",\n        "fall_detection",\n        "mnist",\n        "pen_digits",\n        "satellite",\n        "segment",\n        "vowel",\n        ],\n    value = \'digits\',\n    descripttion = \'Dataset:\'\n)\n'

In [23]:
'''gui = widgets.VBox([
    dataset_selector,
    cool_size_selector,
    widgets.HBox([add_model_bar_bt, remove_model_bar_bt]),
    features_selector,
    cool_models_bar,
    calculate_bt,
    clear_output_button,
])'''

'gui = widgets.VBox([\n    dataset_selector,\n    cool_size_selector,\n    widgets.HBox([add_model_bar_bt, remove_model_bar_bt]),\n    features_selector,\n    cool_models_bar,\n    calculate_bt,\n    clear_output_button,\n])'

In [24]:
# Output area where calculate_bt_wrapper displays the score figures.
graphs_output = widgets.Output(layout=dict(border='1px solid black'))

In [25]:
#display(gui)
#display(graphs_output)

In [26]:
from demo_utils.user_interface import get_gui

In [27]:
# Build the GUI with the helper imported from demo_utils.user_interface.
a = get_gui()

In [28]:
# Bare last expression: the notebook displays the object returned by get_gui().
a

VBox(children=(Dropdown(index=1, options=('covertype', 'digits', 'fall_detection', 'mnist', 'pen_digits', 'sat…