### Imports

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
# import matplotlib.pyplot as plt
from dna_kernel_binding.training.training import create_k_folds, compute_accuracy

from dna_kernel_binding.kernels.kernels import SpectrumKernel
from dna_kernel_binding.models.svm import KernelSVM
from dna_kernel_binding.data.load_data import DNADataLoader

### Retrieve data

In [3]:
data_dir = "dna_kernel_binding/data/data_challenge/"
dataloader = DNADataLoader(data_dir=data_dir)
X_train, Y_train, X_test = dataloader.get_train_and_test_data()

### Test the Spectrum kernel on a short example

In [4]:
# number of samples in the training set and test set
# smaller values are used for testing purposes
length_tr, length_te = 400, 200

# train set
X_tr = X_train.iloc[:length_tr, :]
Y_tr = Y_train.iloc[:length_tr, 1].to_numpy()

# test set
# X_te is taken from X_train to have labels for the test set and to be able to calculate the accuracy
X_te = X_train.iloc[length_tr:length_tr+length_te, :] 
X_te = X_te.reset_index(drop=True)
X_te["Id"] = X_te.index
Y_te = Y_train.iloc[length_tr:length_tr+length_te, 1].to_numpy()

# create the kernel
kernel = SpectrumKernel(k=5, center=True)

# compute the gram matrix of the training set
K_tr = kernel.compute_gram_matrix(X_tr, center=False) # center=False because we want to center the test set with the training set
# center the kernel matrix of the training set
K_tr_centered = kernel._center_gram_matrix(K_tr)

# create the SVM model
svm = KernelSVM(C=0.1) # C is the regularization parameter
# fit the model with the centered kernel matrix of the training set
svm.fit(K_tr_centered, Y_tr)

# test set
# the kernel is computed between the test set and the support vectors of training set
# however the kernel is centered with the training set kernel matrix K_tr, which is why K_tr is passed as an argument
Y_pred = svm.predict(kernel, X_tr, X_te, K_train=K_tr)

# calculate the accuracy
accuracy = compute_accuracy(Y_te, Y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.62


### Hyperparameter tunning of the Spectrum Kernel

In [None]:
def hyperparameter_tuning(X_train, Y_train, nb_points=100, C=np.logspace(-3, 0, 15), k=[6], n_splits=10):
    
    res = {'acc': [], 'C': [], 'k': [], 'fold': []}
    folds = create_k_folds(X_train.loc[:nb_points, :], Y_train.loc[:nb_points, :], n_splits=n_splits, shuffle=False)
    
    for k_val in k:
        print(f"\nK = {k_val}")
        kernel = SpectrumKernel(k=k_val, center=True)
        kernel.preprocess_data(X_train.loc[:nb_points, :])
        K_train = kernel.compute_gram_matrix(X_train.loc[:nb_points, :], center=False)
        np.save(f"results/K_train_k_{k_val}_nbPoints_{nb_points}.npy", K_train)
        
        for fold_idx, (X_train_fold, X_val_fold, y_train_fold, y_val_fold) in enumerate(folds):
            K_tr_fold = K_train[np.ix_(X_train_fold.index, X_train_fold.index)]
            K_tr_fold_centered = kernel._center_gram_matrix(K_tr_fold, is_train=True)
            
            for c in C:
                svm = KernelSVM(C=c)
                svm.fit(K_tr_fold_centered, y_train_fold)
                y_pred = svm.predict(kernel, X_train_fold, X_val_fold, "validation", K_tr_fold)
                acc = compute_accuracy(y_pred, y_val_fold)
                
                res['acc'].append(acc)
                res['C'].append(c)
                res['k'].append(k_val)
                res['fold'].append(fold_idx)
                
                print(f"Fold {fold_idx}: Accuracy = {acc}, C = {c}, k = {k_val}")
    
    return res

# Example usage:
results = hyperparameter_tuning(X_train, Y_train)
df_res = pd.DataFrame(results)
df_res.to_csv(f"results/df_res_k_{k_val}_nbPoints_{nb_points}.csv", index=False)
print(df_res.groupby(['C', 'k']).mean().loc[:, 'acc'])

In [5]:
C = np.logspace(-3, 0, 15)
k = [6]
res = {'acc':[], 'C':[], 'k':[], 'fold':[]}
nb_points = 100

folds = create_k_folds(X_train.loc[:nb_points,:], Y_train.loc[:nb_points,:], n_splits=10, shuffle=False)
for k_val in k:
    print(f"\nK = {k_val}")
    kernel = SpectrumKernel(k=k_val, center=True)
    kernel.preprocess_data(X_train.loc[:nb_points,:])
    K_train = kernel.compute_gram_matrix(X_train.loc[:nb_points,:], center=False) # np.load(f"results/K_train_k_{k_val}_nbPoints_{nb_points}.npy")# 
    np.save(f"results/K_train_k_{k_val}_nbPoints_{nb_points}.npy", K_train)
    for fold_idx, (X_train_fold, X_val_fold, y_train_fold, y_val_fold) in enumerate(folds):
        K_tr_fold = K_train[np.ix_(X_train_fold.index, X_train_fold.index)]
        K_tr_fold_centered = kernel._center_gram_matrix(K_tr_fold, is_train=True)
        for c in C:            
            svm = KernelSVM(C=c)
            svm.fit(K_tr_fold_centered, y_train_fold)
            y_pred = svm.predict(kernel, X_train_fold, X_val_fold, "validation", K_tr_fold)
            acc = compute_accuracy(y_pred, y_val_fold)
            res['acc'].append(acc)
            res['C'].append(c)
            res['k'].append(k_val)
            res['fold'].append(fold_idx)
            print(f"Fold {fold_idx}: Accuracy = {acc}, C = {c}, k = {k_val}")
    


K = 6


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Fold 0: Accuracy = 0.2727272727272727, C = 0.001, k = 6
Fold 0: Accuracy = 0.7272727272727273, C = 0.0016378937069540646, k = 6
Fold 0: Accuracy = 0.7272727272727273, C = 0.0026826957952797246, k = 6
Fold 0: Accuracy = 0.7272727272727273, C = 0.004393970560760791, k = 6
Fold 0: Accuracy = 0.2727272727272727, C = 0.0071968567300115215, k = 6
Fold 0: Accuracy = 0.18181818181818182, C = 0.011787686347935873, k = 6
Fold 0: Accuracy = 0.2727272727272727, C = 0.019306977288832496, k = 6
Fold 0: Accuracy = 0.2727272727272727, C = 0.03162277660168379, k = 6
Fold 0: Accuracy = 0.36363636363636365, C = 0.0517947467923121, k = 6
Fold 0: Accuracy = 0.36363636363636365, C = 0.08483428982440717, k = 6
Fold 0: Accuracy = 0.36363636363636365, C = 0.13894954943731375, k = 6
Fold 0: Accuracy = 0.36363636363636365, C = 0.22758459260747887, k = 6
Fold 0: Accuracy = 0.36363636363636365, C = 0.3727593720314938, k = 6
Fold 0: Accuracy = 0.36363636363636365, C = 0.6105402296585326, k = 6
Fold 0: Accuracy = 0.

In [None]:
df_res = pd.DataFrame(res)
df_res.to_csv(f"results/df_res_k_{k_val}_nbPoints_{nb_points}.csv", index=False)
df_res.groupby(['C', 'k']).mean().loc[:,'acc']