# 1. Normalization

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset and hold out a test split; the fixed seed
# keeps the split identical on every run.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)
# Report (n_samples, n_features) for the training and the test split.
for split in (X_train, X_test):
    print(split.shape)

(426, 30)
(143, 30)


In [4]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler mapping each feature onto the [0, 1] range (the default,
# spelled out here for readability).
scaler = MinMaxScaler(feature_range=(0, 1))

In [5]:
# Fit the scaler on the training split only; the test split is not passed in,
# so the scaling parameters are learned without it.
scaler.fit(X_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [6]:
# Apply the fitted scaler to the training data.
X_train_scaled = scaler.transform(X_train)
# Scaling must not change the shape of the data.
print(X_train_scaled.shape)
# Per-feature minimum and maximum, first for the raw data, then for the
# scaled data.
for features in (X_train, X_train_scaled):
    print(features.min(axis=0))
    print(features.max(axis=0))

(426, 30)
[6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.628e-01 7.570e-01 7.228e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]
[2.811e+01 3.381e+01 1.885e+02 2.501e+03 1.447e-01 3.114e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 2.333e-02 1.064e-01 3.960e-01 5.279e-02 6.146e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.903e-01
 6.638e-01 2.075e-01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


In [8]:
# Scale the test data with the scaler that was fitted on the training data.
X_test_scaled = scaler.transform(X_test)
# Per-feature minimum and maximum after scaling. Because the scaler was fitted
# on the training split, test values may fall outside the [0, 1] range.
print(X_test_scaled.min(axis=0), X_test_scaled.max(axis=0), sep="\n")

[ 0.03540158  0.04190871  0.02895446  0.01497349  0.14260888  0.04999658
  0.          0.          0.07222222  0.00589722  0.00105015 -0.00057494
  0.00067851 -0.0007963   0.05148726  0.01434497  0.          0.
  0.04195752  0.01113138  0.03678406  0.01252665  0.03366702  0.01400904
  0.08531995  0.01833687  0.          0.          0.00749064  0.02367834]
[0.76809125 1.22697095 0.75813696 0.64750795 1.20310633 1.11643038
 0.99906279 0.90606362 0.93232323 0.94903117 0.45573058 0.72623944
 0.48593507 0.31641282 1.36082713 1.2784499  0.36313131 0.77476795
 1.32643996 0.72672498 0.82106012 0.87553305 0.77887345 0.67803775
 0.78603975 0.87843331 0.93450479 1.0024113  0.76384782 0.58743277]


In [9]:
from sklearn.svm import SVC

# Baseline: train and evaluate an SVM on the raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)
# fit() returns the estimator itself, so construction and training can be chained.
svm = SVC(C=100).fit(X_train, y_train)
unscaled_accuracy = svm.score(X_test, y_test)
print(unscaled_accuracy)

0.6293706293706294




In [10]:
# Retrain the same SVM object, this time on the min-max scaled training data.
svm.fit(X_train_scaled, y_train)
# The test data has to go through the same fitted scaler before scoring.
X_test_scaled = scaler.transform(X_test)
scaled_accuracy = svm.score(X_test_scaled, y_test)
print(scaled_accuracy)

0.965034965034965




# 2. Parameter selection using a validation set and cross-validation

In [12]:
from sklearn.datasets import load_iris

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)

# Naive grid search: every (gamma, C) pair is trained on the training split and
# scored directly on the test split, so the test set itself drives the choice
# of parameters.
candidates = [0.001, 0.01, 0.1, 1, 10, 100]
best_score = 0
for gamma in candidates:
    for C in candidates:
        svm = SVC(gamma=gamma, C=C).fit(X_train, y_train)
        score = svm.score(X_test, y_test)
        # Strict comparison: on ties the earliest combination is kept.
        if score <= best_score:
            continue
        best_score, best_C, best_gamma = score, C, gamma
print("Best score:", best_score)
print("Best parameters C and gamma:", best_C, best_gamma)
    

Best score: 0.9736842105263158
Best parameters C and gamma: 100 0.001


In [16]:
# Grid search with a dedicated validation set: parameters are selected on the
# validation split, and the test split is used once, for the final evaluation.

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
# split train set into train set proper and validation set
X_train_pr, X_valid, y_train_pr, y_valid = train_test_split(X_train, y_train, random_state=1)
print("Sizes of train_pr, valid, and test sets:", X_train_pr.shape[0], X_valid.shape[0], X_test.shape[0])
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVM
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train_pr, y_train_pr)
        # evaluate the SVM on the validation set
        score = svm.score(X_valid, y_valid)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_C = C
            best_gamma = gamma
# Once the search is finished, rebuild a single model on the full training set
# with the selected parameters and evaluate it on the test set. Doing this
# after the loops (not inside them) avoids refitting for every combination and
# keeps the test set out of the search.
svm = SVC(C=best_C, gamma=best_gamma)
svm.fit(X_train, y_train)
test_score = svm.score(X_test, y_test)
print("Best score on validation set:", best_score)
print("Best parameters C and gamma:", best_C, best_gamma)
print("Test set score with best parameters:", test_score)

Sizes of train_pr, valid, and test sets: 84 28 38
Best score on validation set: 0.9642857142857143
Best parameters C and gamma: 0.001
Test set score with best parameters: 0.9210526315789473


In [20]:
import numpy as np
from sklearn.model_selection import cross_val_score

# Grid search with 5-fold cross-validation on the training split.
# Reset the running best here: without this the comparison below would start
# from whatever best_score an earlier cell left behind, and combinations whose
# CV mean does not beat that stale value would be ignored.
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        # mean accuracy over the 5 folds is the selection criterion
        scores = cross_val_score(svm, X_train, y_train, cv=5)
        score = np.mean(scores)
        if score > best_score:
            best_score = score
            best_C = C
            best_gamma = gamma

# Refit on the full training split with the selected parameters and evaluate
# once on the held-out test split.
svm = SVC(C=best_C, gamma=best_gamma)
svm.fit(X_train, y_train)
test_score = svm.score(X_test, y_test)
print("Best CV score:", best_score)
print("Best parameters C and gamma:", best_C, best_gamma)
print("Test set score with best parameters:", test_score)

Best CV score: 0.9726896292113683
Best parameters C and gamma: 100 0.01
Test set score with best parameters: 0.9736842105263158


In [21]:
# Candidate values for the search, keyed by the SVC constructor argument name.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

In [22]:
from sklearn.model_selection import GridSearchCV

# Cross-validated (5 folds) search over param_grid using a default-configured SVC.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)

In [23]:
# Recreate the iris train/test split (fixed seed) so this section does not
# depend on arrays left over from earlier cells.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=0,
)

In [24]:
# Run the cross-validated search over param_grid using the training split only;
# the test split is not passed in here.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
# Score the fitted search object on the held-out test split, which was not
# passed to fit().
grid_search.score(X_test, y_test)

0.9736842105263158

In [26]:
# Show the selected parameter combination and its cross-validation score,
# one per line.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

{'C': 100, 'gamma': 0.01}
0.9732142857142857


In [27]:
# Display the estimator that the search holds for its best parameter combination.
grid_search.best_estimator_

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)