In [1]:
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC

In [2]:
# Load the svmguide4 training and test splits from their svmlight-format files.
X_train, y_train = load_svmlight_file("datasets/svmguide4")
# Pin the test matrix to the training feature count. Without n_features the
# column count is inferred per file, so a split that never mentions the
# highest-index feature would come back narrower than the training matrix.
X_test, y_test = load_svmlight_file(
    "datasets/svmguide4.t", n_features=X_train.shape[1]
)
# Convert both matrices to dense arrays for the scaling cells below.
X_train = X_train.toarray()
X_test = X_test.toarray()

### Wrong way: fit a separate scaler on the training set and on the test set (MinMaxScaler)

In [3]:
# Anti-pattern kept on purpose for the comparison: the training and test sets
# are each rescaled by their own, independently fitted MinMaxScaler.
Xt_train = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_train)
sc = MinMaxScaler(feature_range=(0, 1))
Xt_test = sc.fit_transform(X_test)  # refitted on the test data, not reused from training

# Base-2 logarithmic search grid for the SVC hyper-parameters C and gamma.
params = dict(
    C=np.logspace(-5, 15, num=11, base=2),
    gamma=np.logspace(3, -15, num=10, base=2),
)

# Cross-validated grid search on the scaled training data, using all cores.
clf = GridSearchCV(estimator=SVC(), param_grid=params, n_jobs=-1).fit(Xt_train, y_train)

# Last expression: score of the selected model on the (separately scaled) test set.
clf.score(Xt_test, y_test)

0.6923076923076923

In [4]:
# Per-feature minimum and maximum of the scaled training data.
print(Xt_train.min(axis=0))
print(Xt_train.max(axis=0))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [5]:
# Per-feature minimum and maximum of the scaled test data.
print(Xt_test.min(axis=0))
print(Xt_test.max(axis=0))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


### Right way: fit the scaler on the training set only and reuse it for the test set (MinMaxScaler)

In [6]:
# Learn the per-feature range from the training data only, then apply that
# same fitted transform to both splits.
sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(X_train)
Xt_train = sc.transform(X_train)
Xt_test = sc.transform(X_test)

# Base-2 logarithmic search grid for the SVC hyper-parameters C and gamma.
params = dict(
    C=np.logspace(-5, 15, num=11, base=2),
    gamma=np.logspace(3, -15, num=10, base=2),
)

# NOTE(review): the scaler is fitted on the whole training set before the
# cross-validation splits are made, so every CV fold shares its statistics.
clf = GridSearchCV(estimator=SVC(), param_grid=params, n_jobs=-1).fit(Xt_train, y_train)

# Last expression: score of the selected model on the test set.
clf.score(Xt_test, y_test)

0.875

In [7]:
# Per-feature minimum and maximum of the scaled training data.
print(Xt_train.min(axis=0))
print(Xt_train.max(axis=0))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [8]:
# Per-feature minimum and maximum of the test data after applying the
# scaler that was fitted on the training set.
print(Xt_test.min(axis=0))
print(Xt_test.max(axis=0))

[ 1.98617077e-03 -1.36386783e-04  9.42343962e-04  7.92917368e-05
  1.92776424e-01 -3.21503503e-02  4.35573035e-02  4.76190476e-02
  2.07920792e-01  3.58490566e-01]
[0.74017051 0.44209724 0.62910789 0.85825154 0.53847893 0.74069492
 0.39820837 1.         0.82178218 0.98742138]


### Wrong way: fit a separate scaler on the training set and on the test set (StandardScaler)

In [9]:
# Anti-pattern kept on purpose for the comparison: the training and test sets
# are each standardized by their own, independently fitted StandardScaler.
Xt_train = StandardScaler().fit_transform(X_train)
sc = StandardScaler()
Xt_test = sc.fit_transform(X_test)  # refitted on the test data, not reused from training

# Base-2 logarithmic search grid for the SVC hyper-parameters C and gamma.
params = dict(
    C=np.logspace(-5, 15, num=11, base=2),
    gamma=np.logspace(3, -15, num=10, base=2),
)

# Cross-validated grid search on the scaled training data, using all cores.
clf = GridSearchCV(estimator=SVC(), param_grid=params, n_jobs=-1).fit(Xt_train, y_train)

# Last expression: score of the selected model on the (separately scaled) test set.
clf.score(Xt_test, y_test)

0.782051282051282

### Right way: fit the scaler on the training set only and reuse it for the test set (StandardScaler)

In [10]:
# Fit the StandardScaler on the training data only, then apply that same
# fitted transform to both splits.
sc = StandardScaler()
sc.fit(X_train)
Xt_train = sc.transform(X_train)
Xt_test = sc.transform(X_test)

# Base-2 logarithmic search grid for the SVC hyper-parameters C and gamma.
params = dict(
    C=np.logspace(-5, 15, num=11, base=2),
    gamma=np.logspace(3, -15, num=10, base=2),
)

# NOTE(review): the scaler is fitted on the whole training set before the
# cross-validation splits are made, so every CV fold shares its statistics.
clf = GridSearchCV(estimator=SVC(), param_grid=params, n_jobs=-1).fit(Xt_train, y_train)

# Last expression: score of the selected model on the test set.
clf.score(Xt_test, y_test)

0.8942307692307693