# Imitación del uso de RBFSampler en Scikit-learn


Seguir el ejemplo que está en http://scikit-learn.org/stable/auto_examples/plot_kernel_approximation.html

### Imports

In [157]:
%matplotlib inline

In [158]:
import matplotlib.pyplot as plt
import numpy as np
from time import time
from sklearn import datasets, svm, pipeline
from sklearn.kernel_approximation import (RBFSampler,
                                          Nystroem)
from sklearn.decomposition import PCA
from sklearn.kernel_approximation import RBFSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import math

### Sacar el dataset

In [159]:
digits = datasets.load_digits(n_class=9)
data = digits.data
target = digits.target
N = len(data)

### Separar en train y test

In [160]:
prop_train = 2 / 3
N_train = math.ceil(N * prop_train)
N_test = N - N_train
data_train = data[:N_train]
data_test = data[N_train:]
target_train = target[:N_train]
target_test = target[N_train:]

### Definir el sampler

In [161]:
sampler = RBFSampler(n_components = 100)

In [162]:
sampler.fit(data_train)

RBFSampler(gamma=1.0, n_components=100, random_state=None)

In [163]:
mapped_data_train = sampler.transform(data_train)
mapped_data_test = sampler.transform(data_test)

## Hemos definido los siguentes datos:
- data_train
- data_test
- target_train
- target_test
- mapped_data_train
- mapped_data_test

## Testearemos varios clasificadores, primero usando los datos originales, y luego los datos mapeados

### Testeo con los datos originales

###  Árbol de decisión

In [164]:
arbol = DecisionTreeClassifier()

In [165]:
arbol

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [166]:
arbol.fit(data_train, target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [167]:
arbol_test_score = arbol.score(data_test, target_test)

In [168]:
arbol_train_score = arbol.score(data_train, target_train)

In [169]:
arbol_train_score

1.0

In [170]:
arbol_test_score

0.7977736549165121

### SVM

In [171]:
parameters = {
    'C' : np.arange(1, 5, 1),
    'gamma' : np.arange(0,1,0.1)
}

In [172]:
parameters['C']

array([1, 2, 3, 4])

In [173]:
svm = SVC()

In [174]:
clf = GridSearchCV(svm, parameters, cv = 10)

In [175]:
clf.fit(data_train, target_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1, 2, 3, 4]), 'gamma': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [176]:
clf.best_score_

0.11410018552875696

In [177]:
clf.best_params_

{'C': 1, 'gamma': 0.0}

In [137]:
clf.get_params()

{'cv': None,
 'error_score': 'raise',
 'estimator__C': 1.0,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'auto',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
 'fit_params': None,
 'iid': True,
 'n_jobs': 1,
 'param_grid': {'C': array([2, 3, 4]),
  'gamma': array([0. , 0.2, 0.4, 0.6, 0.8])},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': 'warn',
 'scoring': None,
 'verbose': 0}

In [138]:
clf.score(data_test, target_test)

0.10946196660482375

In [None]:
svm.fit(data_train, target_train)

In [93]:
svm_train_score = svm.score(data_train, target_train)

In [94]:
svm_test_score = svm.score(data_test, target_test)

In [95]:
svm_test_score

0.9628942486085343

In [96]:
svm_train_score

1.0

In [97]:
scores = cross_val_score(svm, data_train, target_train, cv = 10)

In [83]:
scores

array([0.11711712, 0.11926606, 0.11926606, 0.11111111, 0.11111111,
       0.11111111, 0.11111111, 0.11214953, 0.11320755, 0.11538462])

### Aprendizaje con los datos mapeados

In [66]:
arbol.fit(mapped_data_train, target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [67]:
test_score_datos_mapeados = arbol.score(mapped_data_test, target_test)
train_score_datos_mapeados = arbol.score(mapped_data_train, target_train)