In [5]:
from scipy.io import loadmat
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
import numpy as np

In [10]:
# Read the MATLAB file, then pull out the feature matrix and the flattened
# label vector.
# NOTE(review): the variable is named `shuttle_data`, but the file it holds is
# 'cardio 1.mat' -- the name is kept only so other cells keep working.
shuttle_data = loadmat('cardio 1.mat')
X, y = shuttle_data['X'], shuttle_data['y'].ravel()

# Hold out 40% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.4,
    random_state=42,
)

# Sum of the training labels divided by their count.
# Presumably labels are 0/1 with 1 marking an outlier, which would make this
# the outlier fraction of the training split -- TODO confirm against the data.
contamination = y_train.sum() / y_train.shape[0]
print(f"Contamination rate: {contamination:.4f}")

Contamination rate: 0.0929


In [11]:
def _to_ocsvm_labels(labels):
    """Recode labels to the +1 / -1 convention used by OneClassSVM.predict.

    Entries equal to 0 become +1 and every other entry becomes -1.

    An array that already contains -1 is treated as recoded and returned
    unchanged. This makes the cell safe to re-run: applying the bare mapping a
    second time would turn every label into -1, because no entry equals 0
    after the first pass.

    Assumes the array holds at least one non-zero entry, so that -1 is present
    once it has been recoded.
    """
    labels = np.asarray(labels)
    if np.any(labels == -1):
        # -1 does not occur in the raw encoding, so this was already recoded.
        return labels
    return np.where(labels == 0, 1, -1)


# Standardise the features, then fit a one-class SVM on the scaled data.
pipeline = Pipeline([("scaler", StandardScaler()), ("ocsvm", OneClassSVM())])

# Hyper-parameter grid; keys use the "<step name>__<parameter>" form so they
# are routed to the "ocsvm" step of the pipeline.
# NOTE(review): per the scikit-learn docs, gamma only applies to the 'rbf',
# 'poly' and 'sigmoid' kernels, so the 'linear' candidates are repeated once
# per gamma value. Left as is to keep `params` a plain dict.
params = {
    "ocsvm__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "ocsvm__gamma": ["scale", "auto", 0.001, 0.01, 0.1, 1.0],
    # The value of `contamination` computed earlier is tried alongside the
    # fixed candidates.
    "ocsvm__nu": [0.01, 0.05, 0.1, 0.2, 0.5, contamination]
}

# Recode the ground truth so it can be compared directly with the model's
# +1 / -1 predictions. The helper leaves already-recoded labels untouched.
y_train = _to_ocsvm_labels(y_train)
y_test = _to_ocsvm_labels(y_test)

In [12]:
from sklearn.metrics import make_scorer, balanced_accuracy_score

# Candidates are ranked by balanced accuracy of the pipeline's predictions
# against the labels passed to fit().
balanced_accuracy_scorer = make_scorer(balanced_accuracy_score)

# Search settings: 3 cross-validation folds, n_jobs=-1 for parallel fits,
# and progress messages enabled.
search_options = dict(
    scoring=balanced_accuracy_scorer,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

# Exhaustive search over every combination in `params`.
grid_search = GridSearchCV(pipeline, params, **search_options)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

# Score the selected model on the held-out split with the same metric.
y_pred = grid_search.predict(X_test)
test_score = balanced_accuracy_score(y_test, y_pred)
print(f"Test Balanced Accuracy: {test_score:.4f}")

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best parameters: {'ocsvm__gamma': 0.001, 'ocsvm__kernel': 'rbf', 'ocsvm__nu': 0.2}
Best score: 0.8601
Test Balanced Accuracy: 0.8761
