In [None]:
import warnings

import numpy as np
import pandas as pd
import wandb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Location of the ROI dataset, relative to this notebook.
csv_path = "../../data/csv/expanded_dataset_roi.csv"
df = pd.read_csv(csv_path)

# Split by participant rather than by row, so that no participant
# contributes rows to both the training and the test partition.
splitter = GroupShuffleSplit(n_splits=2, test_size=0.20, random_state=0)
split = splitter.split(df, groups=df['Participant'])

# Only the first generated split is consumed.
train_inds, test_inds = next(split)
train, test = df.iloc[train_inds], df.iloc[test_inds]

# Show which participants ended up on each side of the split.
print("Train participants: ", set(train["Participant"]))
print("Test participants: ", set(test["Participant"]))


Train participants:  {2.0, 4.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 18.0, 20.0, 21.0, 23.0, 24.0, 29.0, 30.0, 32.0, 9003.0, 9004.0, 9005.0, 9008.0, 1010.0, 1012.0, 1013.0, 1014.0, 1017.0, 1018.0, 1019.0, 1020.0}
Test participants:  {33.0, 7.0, 9001.0, 9002.0, 19.0, 1011.0, 25.0, 28.0}


In [2]:
# Map a comma-separated token string to a list of 0/1 flags.
def encode_sequence(seq):
    """Return one integer per comma-separated token in ``seq``.

    A token becomes 1 when, after stripping whitespace and lower-casing,
    it equals "yes"; every other token (including an empty one) becomes 0.

    NOTE(review): the roi_sequence values shown in this notebook's rendered
    tables look like digit strings rather than yes/no tokens; such input
    encodes to all zeros here. Confirm the column format.
    """
    flags = []
    for token in seq.split(','):
        is_yes = token.strip().lower() == "yes"
        flags.append(1 if is_yes else 0)
    return flags

# Encode the raw sequence column of each partition into lists of 0/1 flags.
train_seq, test_seq = (
    part['roi_sequence'].apply(encode_sequence) for part in (train, test)
)

# The longest *training* sequence determines the feature width; the test
# partition is deliberately not consulted here.
max_len = max(map(len, train_seq))

# Force a sequence to exactly target_len entries.
def pad_or_trim(seq, target_len):
    """Return ``seq`` right-padded with zeros, or cut down, to ``target_len``.

    Sequences shorter than ``target_len`` get trailing 0s appended; all
    others are sliced to their first ``target_len`` items.
    """
    missing = target_len - len(seq)
    if missing <= 0:
        # Already long enough: keep only the leading target_len items.
        return seq[:target_len]
    return seq + [0] * missing

# Build fixed-width feature matrices: every encoded sequence is padded or
# trimmed to max_len so that all rows have the same number of columns.
Xtrain = np.array([pad_or_trim(seq, max_len) for seq in train_seq])
y_train = train['experience'].values

Xtest = np.array([pad_or_trim(seq, max_len) for seq in test_seq])
y_test = test['experience'].values

# Sanity check: when every training row is identical the features carry no
# information, and the classifier can only ever predict a single class.
if len(Xtrain) > 0 and (Xtrain == Xtrain[0]).all():
    warnings.warn(
        "All training feature rows are identical after encoding; the model "
        "cannot learn anything from them. Check that 'roi_sequence' really "
        "holds comma-separated 'yes'/'no' tokens as encode_sequence expects."
    )

# Standardize features: statistics are fitted on the training rows only and
# then re-used unchanged for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(Xtrain)
X_test = scaler.transform(Xtest)


# Train a baseline RBF-kernel SVM
clf = SVC(kernel='rbf', C=1.0, gamma='scale', decision_function_shape='ovo')
clf.fit(X_train, y_train)

# Evaluate on the held-out participants
y_pred = clf.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# zero_division=0 reports 0 for classes that are never predicted (the same
# value as the default) without emitting an undefined-metric warning per class.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


Confusion Matrix:
 [[130   0   0   0]
 [219   0   0   0]
 [ 89   0   0   0]
 [256   0   0   0]]

Classification Report:
               precision    recall  f1-score   support

         CSI       0.19      1.00      0.32       130
     Control       0.00      0.00      0.00       219
   FirstYear       0.00      0.00      0.00        89
   ThirdYear       0.00      0.00      0.00       256

    accuracy                           0.19       694
   macro avg       0.05      0.25      0.08       694
weighted avg       0.04      0.19      0.06       694



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
# Number of rows in the scaled training and test matrices.
print(X_train.shape[0], X_test.shape[0])

2310 694


In [4]:
# Display the training partition (the rendered table elides the middle rows).
train

Unnamed: 0,Participant,roi_sequence,experience
0,2.0,02298000008,Control
1,2.0,87171580902317,Control
2,2.0,88000000012,Control
3,2.0,013291229210131215,Control
4,2.0,5722988880,Control
...,...,...,...
2999,9008.0,2241619191318915,Control
3000,9008.0,5137167131919813,Control
3001,9008.0,1111111111112245,Control
3002,9008.0,513121355591313,Control


In [5]:
# Display the held-out test partition (the rendered table elides the middle rows).
test

Unnamed: 0,Participant,roi_sequence,experience
213,7.0,018800000127,Control
214,7.0,15227900922,Control
215,7.0,1717170200990,Control
216,7.0,0999999988,Control
217,7.0,9889889779,Control
...,...,...,...
2679,9002.0,151377613121255,Control
2680,9002.0,86513141365915,Control
2681,9002.0,1591515151515131313,Control
2682,9002.0,13271671111111111,Control


In [6]:
import wandb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

# Start a wandb run and record the complete search setup in its config.
wandb.init(
    project="svm-random-search",
    name="svc_hyperparam_tuning",
    config={
        "model": "SVC",
        "search_type": "RandomizedSearchCV",
        "param_dist": {
            'C': [0.01, 0.1, 1, 10, 100, 1000],
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001],
            'degree': [2, 3, 4, 5],
            'coef0': [0.0, 0.1, 0.5, 1.0],
            'shrinking': [True, False],
            'decision_function_shape': ['ovo', 'ovr'],
            'class_weight': [None, 'balanced']
        },
        "n_iter": 100,
        "cv": 5,
        "scoring": "accuracy"
    }
)

# Define parameter space
param_dist = wandb.config["param_dist"]

# Cross-validate by participant. The held-out test set was split on
# 'Participant', so the validation folds must be too; otherwise rows from one
# participant land in both the fit and the validation part of a fold and the
# CV score used for model selection is optimistically biased.
group_cv = GroupKFold(n_splits=wandb.config["cv"])

# Initialize and fit RandomizedSearchCV
svc = SVC()
random_search = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_dist,
    n_iter=wandb.config["n_iter"],
    scoring=wandb.config["scoring"],
    cv=group_cv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Rows of X_train are in the same order as the rows of `train`, so its
# 'Participant' column supplies the group label for each sample.
random_search.fit(X_train, y_train, groups=train["Participant"].values)

# Log best parameters
wandb.log({"best_params": random_search.best_params_})

# Predict on the held-out participants with the refitted best estimator
best_svc = random_search.best_estimator_
y_pred = best_svc.predict(X_test)

# Log test accuracy (confusion matrix and classification report are printed
# in the following cell rather than logged)
accuracy = accuracy_score(y_test, y_pred)
wandb.log({
    "test_accuracy": accuracy
})

# Finish wandb run
wandb.finish()


[34m[1mwandb[0m: Currently logged in as: [33msam-michiels[0m ([33msam-michiels-open-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Fitting 5 folds for each of 100 candidates, totalling 500 fits


0,1
test_accuracy,▁

0,1
test_accuracy,0.18732


In [7]:
# Report the winning hyperparameters and the held-out test performance.
print("Best from Random Search:", random_search.best_params_)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# zero_division=0 reports 0 for classes that are never predicted (the same
# value as the default) without emitting an undefined-metric warning per class.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Best from Random Search: {'shrinking': False, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 4, 'decision_function_shape': 'ovo', 'coef0': 1.0, 'class_weight': None, 'C': 100}
Confusion Matrix:
 [[130   0   0   0]
 [219   0   0   0]
 [ 89   0   0   0]
 [256   0   0   0]]

Classification Report:
               precision    recall  f1-score   support

         CSI       0.19      1.00      0.32       130
     Control       0.00      0.00      0.00       219
   FirstYear       0.00      0.00      0.00        89
   ThirdYear       0.00      0.00      0.00       256

    accuracy                           0.19       694
   macro avg       0.05      0.25      0.08       694
weighted avg       0.04      0.19      0.06       694



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
import joblib

# The SVC was trained on padded, standardized inputs, so the fitted scaler and
# the padding width are saved as well; without them the pickled model cannot
# be applied to new sequences. The model file itself is unchanged.
joblib.dump({"scaler": scaler, "max_len": max_len}, 'svm_random_search_preprocessing_roi.pkl')
joblib.dump(best_svc, 'svm_random_search_model_roi.pkl')

['svm_random_search_model_roi.pkl']