In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

### Load Data

In [22]:
# Read the four header-less feature CSVs, each into its own NumPy array.
# The files are read in the order listed, matching the names on the left.
img_1, img_2, overt_1, overt_2 = (
    pd.read_csv(csv_path, header=None).to_numpy()
    for csv_path in (
        './../data/feaSubEImg_1.csv',
        './../data/feaSubEImg_2.csv',
        './../data/feaSubEOvert_1.csv',
        './../data/feaSubEOvert_2.csv',
    )
)

In [23]:
# Build one design matrix and one label vector per data set.
# The two files of a data set are joined column-wise and then transposed, so
# every column of the CSVs becomes one row of X. Columns that came from the
# "_1" file are labelled 0 and columns from the "_2" file are labelled 1.
img_X = np.concatenate((img_1, img_2), axis=1).T
img_y = np.array(
    [label for label, block in enumerate((img_1, img_2)) for _ in range(block.shape[1])]
)

overt_X = np.concatenate((overt_1, overt_2), axis=1).T
overt_y = np.array(
    [label for label, block in enumerate((overt_1, overt_2)) for _ in range(block.shape[1])]
)

### Two-level Cross-validation function

In [None]:
# Inner cross-validation
def inner_cv(X_train, y_train, C_values=[0.01, 1, 100, 10000], n_splits=5, random_state=None):
    """Pick the linear-SVM regularisation strength ``C`` by stratified k-fold CV.

    Every candidate ``C`` is fitted on the training part and scored (accuracy)
    on the validation part of each fold; the candidate with the highest mean
    validation accuracy wins.

    Parameters
    ----------
    X_train : array of shape (n_samples, n_features)
        Feature matrix. Must support row selection with an integer index
        array (e.g. a NumPy array).
    y_train : array-like of shape (n_samples,)
        Class labels. Converted to a NumPy array so that lists work too.
    C_values : iterable of float
        Candidate values for ``C``. Duplicates are evaluated only once.
    n_splits : int, default 5
        Number of inner folds.
    random_state : int, RandomState instance or None, default None
        Seed for the fold shuffle. Pass an int to get the same folds (and
        therefore the same result) on every call; ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    best_C : float
        Candidate with the highest mean validation accuracy. On a tie the
        candidate that appears first in ``C_values`` is returned.
    val_accuracies : dict
        Maps each candidate ``C`` to its list of per-fold validation accuracies.

    Raises
    ------
    ValueError
        If ``C_values`` is empty.
    """
    y_train = np.asarray(y_train)

    # Materialise the candidates once: a one-shot iterator would otherwise be
    # exhausted after building the result dict, and duplicates would be fitted
    # twice into the same list. The (shared) default list is only read here.
    candidates = list(dict.fromkeys(C_values))
    if not candidates:
        raise ValueError("C_values must contain at least one candidate value")

    # Stratified K-Fold cross-validation; the seed makes the shuffle repeatable
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    val_accuracies = {C: [] for C in candidates}

    # Iterate through each fold
    for train_idx, val_idx in skf.split(X_train, y_train):
        # Get the data for the fold
        X_inner_train, X_val = X_train[train_idx], X_train[val_idx]
        y_inner_train, y_val = y_train[train_idx], y_train[val_idx]

        # For each fold, train and evaluate the model for each C value
        for C in candidates:
            clf = SVC(kernel='linear', C=C)
            clf.fit(X_inner_train, y_inner_train)
            val_accuracies[C].append(accuracy_score(y_val, clf.predict(X_val)))

    avg_acc = {C: np.mean(accs) for C, accs in val_accuracies.items()}
    # max() returns the first maximal key, i.e. the earliest candidate on ties
    best_C = max(avg_acc, key=avg_acc.get)
    return best_C, val_accuracies

In [25]:
def two_level_cross_validation(X, y, outer_splits=6, inner_splits=5, C_values=[0.01, 1, 100, 10000], random_state=None):
    """Estimate linear-SVM accuracy with two-level (nested) stratified cross-validation.

    For every outer fold, ``inner_cv`` selects ``C`` using only that fold's
    training portion. A linear SVC with the selected ``C`` is then refitted on
    the whole training portion and evaluated on the held-out outer test fold.
    One summary line is printed per outer fold.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Feature matrix. Must support row selection with an integer index
        array (e.g. a NumPy array).
    y : array-like of shape (n_samples,)
        Class labels. Converted to a NumPy array so that lists work too.
    outer_splits : int, default 6
        Number of outer folds.
    inner_splits : int, default 5
        Number of folds ``inner_cv`` uses for selecting ``C``.
    C_values : iterable of float
        Candidate values for ``C``; forwarded to ``inner_cv`` as a list.
    random_state : int, RandomState instance or None, default None
        Seed for the shuffle of the *outer* split. Pass an int to get the same
        outer folds on every call; ``None`` keeps the previous, unseeded
        behaviour. The inner splits are drawn inside ``inner_cv`` and are not
        controlled by this argument.

    Returns
    -------
    dict
        ``'accuracies'``: list with the test accuracy of each outer fold.
        ``'best_Cs'``: list with the ``C`` selected in each outer fold.
        ``'y_true_all'``, ``'y_pred_all'``, ``'decision_scores_all'``: arrays
        with the true labels, predicted labels and ``decision_function``
        outputs of all outer test folds, concatenated in fold order (not in
        the original sample order).
    """
    y = np.asarray(y)
    # Materialise once so an iterator is not used up by the first outer fold
    C_values = list(C_values)

    # Split the data into `outer_splits` stratified, shuffled folds
    outer_skf = StratifiedKFold(n_splits=outer_splits, shuffle=True, random_state=random_state)

    accuracies = []
    best_Cs = []
    y_true_folds = []
    y_pred_folds = []
    decision_score_folds = []

    # Iterate through each outer fold
    for fold_idx, (train_idx, test_idx) in enumerate(outer_skf.split(X, y)):
        # Get the training and test data for the outer fold
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Inner CV to find best C (sees the outer training data only)
        best_C, _ = inner_cv(X_train, y_train, C_values=C_values, n_splits=inner_splits)
        best_Cs.append(best_C)

        # Train final model on all outer training data using best C
        model = SVC(kernel='linear', C=best_C)
        model.fit(X_train, y_train)

        # Evaluate on outer test fold
        y_pred = model.predict(X_test)
        decision_scores = model.decision_function(X_test)
        acc = accuracy_score(y_test, y_pred)

        # Store results
        accuracies.append(acc)
        y_true_folds.append(y_test)
        y_pred_folds.append(y_pred)
        decision_score_folds.append(decision_scores)

        print(f"[Fold {fold_idx+1}] Accuracy: {acc:.2f}, Best C: {best_C}")

    return {
        'accuracies': accuracies,
        'best_Cs': best_Cs,
        'y_true_all': np.concatenate(y_true_folds),
        'y_pred_all': np.concatenate(y_pred_folds),
        'decision_scores_all': np.concatenate(decision_score_folds)
    }

In [26]:
# Run the two-level cross-validation on the overt features and labels
results = two_level_cross_validation(overt_X, overt_y)

# Report the mean accuracy over the outer folds and the C picked in each of them
print("\nAverage outer CV accuracy:", np.asarray(results['accuracies']).mean())
print("Best C values per fold:", results['best_Cs'])

[Fold 1] Accuracy: 0.95, Best C: 0.01
[Fold 2] Accuracy: 0.97, Best C: 0.01
[Fold 3] Accuracy: 0.95, Best C: 0.01
[Fold 4] Accuracy: 0.93, Best C: 0.01
[Fold 5] Accuracy: 0.97, Best C: 0.01
[Fold 6] Accuracy: 0.97, Best C: 0.01

Average outer CV accuracy: 0.9583333333333331
Best C values per fold: [0.01, 0.01, 0.01, 0.01, 0.01, 0.01]
