In [1]:
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC

In [2]:
X_train, y_train = load_svmlight_file("datasets/svmguide2")
X_train = X_train.toarray()

### Original sets with default parameters (default in libsvm and old default in scikit-learn)

In [3]:
clf = SVC(C=1, gamma=1 / X_train.shape[1])
scores = cross_val_score(clf, X_train, y_train)
print(np.mean(scores), "+/-", np.std(scores))

0.5652843464552324 +/- 0.0055482109710208154


### Original sets with new default in scikit-learn

In [4]:
clf = SVC(C=1, gamma="scale")
scores = cross_val_score(clf, X_train, y_train)
print(np.mean(scores), "+/-", np.std(scores))

0.8186611384396194 +/- 0.03694885491830515


### Scaled sets with default parameters (MinMaxScaler)

In [5]:
sc = MinMaxScaler(feature_range=(-1, 1))
Xt_train = sc.fit_transform(X_train)
clf = SVC(C=1, gamma=1 / Xt_train.shape[1])
scores = cross_val_score(clf, Xt_train, y_train)
print(np.mean(scores), "+/-", np.std(scores))

0.7827220353486176 +/- 0.052338148653898776


### Scaled sets with parameter selection (MinMaxScaler)

In [6]:
sc = MinMaxScaler(feature_range=(-1, 1))
Xt_train = sc.fit_transform(X_train)
params = {"C": np.logspace(-5, 15, num=11, base=2),
          "gamma": np.logspace(3, -15, num=10, base=2)}
clf = GridSearchCV(SVC(), params, n_jobs=-1)
scores = cross_val_score(clf, Xt_train, y_train)
print(np.mean(scores), "+/-", np.std(scores))

0.8470677529221833 +/- 0.04151813236539166


### Scaled sets with parameter selection (StandardScaler)

In [7]:
sc = StandardScaler()
Xt_train = sc.fit_transform(X_train)
clf = SVC(C=1, gamma=1 / Xt_train.shape[1])
scores = cross_val_score(clf, X_train, y_train)
print(np.mean(scores), "+/-", np.std(scores))

0.5652843464552324 +/- 0.0055482109710208154


### Scaled sets with parameter selection (StandardScaler)

In [8]:
sc = StandardScaler()
Xt_train = sc.fit_transform(X_train)
params = {"C": np.logspace(-5, 15, num=11, base=2),
          "gamma": np.logspace(3, -15, num=10, base=2)}
clf = GridSearchCV(SVC(), params, n_jobs=-1)
scores = cross_val_score(clf, Xt_train, y_train)
print(np.mean(scores), "+/-", np.std(scores))

0.841548282940688 +/- 0.025466762220560928
