En este notebook vamos a hacer exactamente lo mismo que en DecisionTreeAgosto, pero vamos a hacer las pruebas con varios datasets de juguete de clasificación. Hay que destacar que los otros que vamos a usar son bastante más pequeños que el que venimos usando: 150 instancias frente a las 1500 que veníamos usando hasta ahora. A ver qué tal sale

## Iris 

150 instancias <br>
3 clases <br>
4 dimensionalidad <br>
valores positivos reales, entre 0 y 8

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from time import time
import math

# Import datasets, classifiers and performance metrics
from sklearn import datasets, pipeline
from sklearn.kernel_approximation import (RBFSampler,
                                          Nystroem)
from sklearn.tree import DecisionTreeClassifier

In [2]:
iris = datasets.load_iris()

In [3]:
data = iris.data
target = iris.target
N = data.shape[0]
prop_train = 3 / 4
N_train = math.ceil(N * prop_train)
N_test = N - N_train

#### No cambio los datos para centrarlos en 0 con varianza 1

In [4]:
data_train = data[:N_train]
data_test = data[N_train:]

target_train = target[:N_train]
target_test = target[N_train:]

### Decision Tree normal

In [5]:
dtc = DecisionTreeClassifier()

In [6]:
dtc.fit(data_train, target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [7]:
train_score_normal_dt = dtc.score(data_train, target_train)
test_score_nomal_dt = dtc.score(data_test, target_test)

In [8]:
train_score_normal_dt, test_score_nomal_dt

(1.0, 0.7027027027027027)

### Decision Tree con RBFSampler

In [9]:
feature_map_fourier = RBFSampler(gamma=.2, random_state=1)

In [10]:
dtc_rbf = pipeline.Pipeline([("feature_map", feature_map_fourier),
                             ("ctf", DecisionTreeClassifier())])

In [11]:
D = 500

In [12]:
dtc_rbf.set_params(feature_map__n_components=D)

Pipeline(memory=None,
     steps=[('feature_map', RBFSampler(gamma=0.2, n_components=500, random_state=1)), ('ctf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [13]:
dtc_rbf.fit(data_train, target_train)

Pipeline(memory=None,
     steps=[('feature_map', RBFSampler(gamma=0.2, n_components=500, random_state=1)), ('ctf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [14]:
train_score = dtc_rbf.score(data_train, target_train)
test_score = dtc_rbf.score(data_test, target_test)

In [15]:
train_score, test_score

(1.0, 0.8108108108108109)

Con D = 500 parece dar mejores resultados que con D = 5000. Con 500 parece que mejora un poquitín, mientras que 500 se queda igual o empeora.

## Wine

178 instancias <br>
3 clases <br>
13 dimensionalidad <br>
valores positivos reales, entre 0 y 1680

In [16]:
wine = datasets.load_wine()

In [17]:
data = wine.data
target = wine.target
N = data.shape[0]
prop_train = 2 / 4
N_train = math.ceil(N * prop_train)
N_test = N - N_train

In [18]:
data_train = data[:N_train]
data_test = data[N_train:]

target_train = target[:N_train]
target_test = target[N_train:]

### Decision Tree Normal

In [19]:
dtc = DecisionTreeClassifier()

In [20]:
dtc.fit(data_train, target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
train_score_normal_dt = dtc.score(data_train, target_train)
test_score_nomal_dt = dtc.score(data_test, target_test)

In [22]:
train_score_normal_dt, test_score_nomal_dt

(1.0, 0.38202247191011235)

### Decision Tree con RBFSampler

In [23]:
feature_map_fourier = RBFSampler(gamma=.2, random_state=1)

In [24]:
dtc_rbf = pipeline.Pipeline([("feature_map", feature_map_fourier),
                             ("ctf", DecisionTreeClassifier())])

In [25]:
D = 500

In [26]:
dtc_rbf.set_params(feature_map__n_components=D)

Pipeline(memory=None,
     steps=[('feature_map', RBFSampler(gamma=0.2, n_components=500, random_state=1)), ('ctf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [27]:
dtc_rbf.fit(data_train, target_train)

Pipeline(memory=None,
     steps=[('feature_map', RBFSampler(gamma=0.2, n_components=500, random_state=1)), ('ctf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [28]:
train_score = dtc_rbf.score(data_train, target_train)
test_score = dtc_rbf.score(data_test, target_test)

In [29]:
train_score, test_score

(1.0, 0.2247191011235955)

## Cancer

569 instancias <br>
2 clases <br>
30 dimensionalidad <br>
valores positivos reales, entre 0 y 4254

In [30]:
cancer = datasets.load_breast_cancer()

In [31]:
data = cancer.data
target = cancer.target
N = data.shape[0]
prop_train = 2 / 4
N_train = math.ceil(N * prop_train)
N_test = N - N_train

In [32]:
data_train = data[:N_train]
data_test = data[N_train:]

target_train = target[:N_train]
target_test = target[N_train:]

### Decision Tree Normal

In [33]:
dtc = DecisionTreeClassifier()

In [34]:
dtc.fit(data_train, target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [35]:
train_score_normal_dt = dtc.score(data_train, target_train)
test_score_nomal_dt = dtc.score(data_test, target_test)

In [36]:
train_score_normal_dt, test_score_nomal_dt

(1.0, 0.8908450704225352)

### Decision Tree con RBFSampler

In [37]:
feature_map_fourier = RBFSampler(gamma=.2, random_state=1)

In [38]:
dtc_rbf = pipeline.Pipeline([("feature_map", feature_map_fourier),
                             ("ctf", DecisionTreeClassifier())])

In [39]:
D = 500

In [40]:
dtc_rbf.set_params(feature_map__n_components=D)

Pipeline(memory=None,
     steps=[('feature_map', RBFSampler(gamma=0.2, n_components=500, random_state=1)), ('ctf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [41]:
dtc_rbf.fit(data_train, target_train)

Pipeline(memory=None,
     steps=[('feature_map', RBFSampler(gamma=0.2, n_components=500, random_state=1)), ('ctf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [42]:
train_score = dtc_rbf.score(data_train, target_train)
test_score = dtc_rbf.score(data_test, target_test)

In [43]:
train_score, test_score

(1.0, 0.5070422535211268)