# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set on handwritten digits to compare non-linear classification algorithms ...   

In [1]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
# Load MNIST from https://www.openml.org/d/554
# as_frame=False makes fetch_openml return plain NumPy arrays. Recent
# scikit-learn versions otherwise return a pandas DataFrame for this dataset,
# which breaks NumPy-style indexing such as X[:n, :] and changes what
# X.min() / X.max() return (a per-column Series instead of a scalar).
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)


In [3]:
# Full MNIST: 70k samples of the digits 0-9, each a 28*28 gray-scale image
# flattened into a 784-dimensional vector.
X.shape

(70000, 784)

In [4]:
#look at the smallest value in the data
X.min()

0.0

In [5]:
#look at the largest value in the data
X.max()

255.0

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize a C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use an RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)

In [31]:
#split in train and test (train_test_split's default is a 75% / 25% split)
from sklearn.model_selection import train_test_split
#fix the seed so the split - and every score computed on it - is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [8]:
#imports
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform


In [44]:
# Baseline: an SVC with every hyper-parameter left at its default,
# trained on the complete training split.
svm = SVC()
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [45]:
# Accuracy of the baseline SVM on the held-out test split.
svm.score(X_test, y_test)

0.9790857142857143

In [46]:
#see what we get with only a part of the data (the first 1000 training samples)
#rows are selected with [:1000] so the slice works for NumPy arrays and pandas DataFrames alike
svm.fit(X_train[:1000],y_train[:1000])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [47]:
# Test accuracy after re-fitting the SVM on the 1000-sample subset.
svm.score(X_test, y_test)

0.9156571428571428

In [48]:
# RBF is SVC's default kernel, so no kernel argument is required.
svm = SVC()

# Search space: candidate values for C and gamma.
parameters = {
    "C": [0.1, 1, 2, 5, 10, 100],
    "gamma": [10, 5, 1, 0.1, "scale"],
}

search = RandomizedSearchCV(
    svm,
    parameters,
    n_iter=20,       # do 20 iterations (sampled parameter settings)
    cv=5,            # number of cross-validation folds: 5
    n_jobs=4,        # use 4 jobs in parallel
    random_state=0,  # fixed state to get reproducible experiments
)

In [49]:
#start search
#use a subset of the data to speed things up (use full data for real experiment!)
#rows are selected with [:1000] so the slice works for NumPy arrays and pandas DataFrames alike
res=search.fit(X_train[:1000],y_train[:1000])

In [50]:
#get the best parameter setting found by the random search
res.best_params_

{'gamma': 'scale', 'C': 2}

In [51]:
#get the best cross-validation score reached during the search
res.best_score_

0.924

In [52]:
#get the full experiment: per-candidate results (fit times, score times, ...) as a dict of arrays
res.cv_results_

{'mean_fit_time': array([1.35058694, 1.35399504, 1.3662179 , 1.41237798, 1.35056891,
        0.77285037, 1.39339161, 1.39500165, 1.38919196, 1.39730558,
        1.36389647, 1.43803172, 1.46460943, 0.77585039, 1.40573502,
        1.40230646, 1.40124378, 0.77807813, 1.41461   , 1.1725184 ]),
 'std_fit_time': array([0.04089051, 0.02660707, 0.01657317, 0.03835837, 0.03562915,
        0.0226934 , 0.02931192, 0.024014  , 0.03614655, 0.04282403,
        0.024333  , 0.03991168, 0.05959843, 0.03181399, 0.01593967,
        0.01558785, 0.01027626, 0.02817054, 0.01669424, 0.11730277]),
 'mean_score_time': array([0.20376859, 0.19633608, 0.20086279, 0.19717579, 0.20150042,
        0.16203618, 0.19815879, 0.19999352, 0.20443554, 0.19607372,
        0.20010166, 0.20365028, 0.21576571, 0.16151285, 0.19886379,
        0.20086184, 0.19866281, 0.16096673, 0.20074658, 0.18205552]),
 'std_score_time': array([0.01409166, 0.00513725, 0.00971663, 0.00395101, 0.00916717,
        0.00466845, 0.00506305, 0.007656

### E1.2: Pipelines and simple Neural Networks
Split the MNIST data into train- and test-sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling of the input (recall the *sigmoid* function), we need to rescale all input values, e.g. to [0,1] or - as the StandardScaler linked below does - to zero mean and unit variance.

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


In [29]:
#imports
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [36]:
#build the pipeline: scale the inputs first, then train the MLP on the scaled data
clf = make_pipeline(
    StandardScaler(),
    MLPClassifier(
        #one tuple entry per hidden layer; each entry is that layer's number of neurons
        hidden_layer_sizes=(64, 64),
        activation='tanh',
        solver='sgd',
        alpha=0.0001,
        max_iter=300,
        random_state=1,
    ),
)

In [37]:
# Fit scaler and MLP together on the training split.
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(64, 64),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=300, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=1, shuffle=True, solver='sgd',
                               tol=0.0001, validation_fraction=0.1,
                               verbose=False, warm_start=False))],
         verbose=False)

In [38]:
# Accuracy of the fitted pipeline (scaler + MLP) on the held-out test split.
clf.score(X_test, y_test)

0.9561142857142857