In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import cm
from numpy import linalg as LA
from tqdm import tqdm

from sklearn import svm

from kernel_functions import gram_phi, count_kuplet_k, count_kuplet_3
from preprocessing import preprocessing

In [2]:
# Read the three training files; everything below works on NumPy arrays.
# Precomputed numerical features, one row per sequence.
X = pd.read_csv('data/Xtr1_mat50.csv', sep=' ', header=None).values
# Raw sequences: keep only the first field of every row so that X_raw is a
# flat array of strings rather than a one-column table.
X_raw = np.array([
    row[0]
    for row in pd.read_csv('data/Xtr1.csv', sep=' ', header=None).values
])
# Labels are taken from the 'Bound' column of the label file.
Y = pd.read_csv('data/Ytr1.csv', sep=',', header=0)['Bound'].values

# Sanity check: shapes of the three arrays and a peek at the first sample.
print('numerical features shape', X.shape)
print('numerical features first row', X[0])
print('sequences shape: ', X_raw.shape)
print('sequence first row: ', X_raw[0])
print('labels shape', Y.shape)

numerical features shape (2000, 50)
numerical features first row [ 0.          0.02173913  0.01086957  0.          0.02173913  0.0326087   0.
  0.0326087   0.06521739  0.01086957  0.          0.          0.
  0.05434783  0.01086957  0.          0.          0.04347826  0.
  0.08695652  0.0326087   0.01086957  0.0326087   0.01086957  0.01086957
  0.          0.          0.          0.02173913  0.          0.02173913
  0.02173913  0.01086957  0.02173913  0.02173913  0.0326087   0.01086957
  0.01086957  0.08695652  0.04347826  0.01086957  0.01086957  0.0326087
  0.01086957  0.04347826  0.01086957  0.0326087   0.0326087   0.02173913
  0.        ]
sequences shape:  (2000,)
sequence first row:  CGGGCCTCCTCCAGGCTCAGAATCGACCCCCCCCCATCCTGATAGACCACAACGAAAGCCGTAGCGACGGCCGCAGGAGCTGGCGCGACAGCCCAGAGCTC
labels shape (2000,)


In [3]:
# Split the raw sequences and their labels into train and test parts using the
# project's `preprocessing` helper, then print the resulting shapes as a check.
# NOTE(review): whether the split is shuffled and/or seeded is decided inside
# `preprocessing` — confirm there if reproducibility matters.
X_train, Y_train, X_test, Y_test = preprocessing(X_raw, Y)
print('train shape', X_train.shape)
print('test shape', X_test.shape)


train shape (1600,)
test shape (400,)


In [6]:
from kernel import *

def _kuplet_features(sequences, k):
    """Turn sequences into k-uplet feature rows with a trailing constant 1 (bias) column."""
    if k == 3:
        counts = np.array([count_kuplet_3(seq) for seq in sequences])
    else:
        counts = np.array([count_kuplet_k(seq, k=k) for seq in sequences])
    # Appending a column of ones lets the kernel machine learn a bias term.
    bias = np.ones((counts.shape[0], 1))
    return np.concatenate((counts, bias), axis=1)


def _exp_distance_kernel(rows, cols, gamma):
    """Return exp(-||r - c|| / gamma) for every pair (r in rows, c in cols)."""
    # One row of Euclidean distances per element of `rows`; this keeps memory
    # at O(len(rows) * len(cols)) instead of materialising a 3-D difference.
    distances = np.array([LA.norm(cols - r, axis=1) for r in rows])
    return np.exp(-distances / gamma)


def solve_svm_kernel(X_train, X_test, Y_train, Y_test, kernel='k_gram_gaussian', k=3, lamb=0.1, gamma=0.1, kktreg=1e-9):
    """
    Fit a kernel SVM on k-uplet features and report its accuracy.

    Sequences are turned into feature vectors with `count_kuplet_3` (when
    k == 3) or `count_kuplet_k`, a constant 1 is appended for the bias, a Gram
    matrix is built and handed to `solve_svm`, and predictions are obtained
    from the sign of `K @ w[:n]`.

    Parameters
    ----------
    X_train, X_test : iterables of sequences accepted by the k-uplet counters.
    Y_train, Y_test : label arrays. They are mapped with (y + 1) / 2 before
        being compared to the boolean predictions, so labels are expected to
        be in {-1, +1}.
    kernel : 'k_gram' or 'k_gram_gaussian'
        'k_gram' is the plain dot product of the feature vectors.
        'k_gram_gaussian' is exp(-||x - y|| / gamma); note the norm is NOT
        squared, exactly as in the original implementation.
    k : size of the k-uplets to count.
    lamb : forwarded to `solve_svm` as `lamb`.
    gamma : bandwidth of the 'k_gram_gaussian' kernel (unused for 'k_gram').
    kktreg : forwarded to `solve_svm` as `kktreg` for both kernels.

    Returns
    -------
    (test_accuracy, train_accuracy) : tuple of floats, in that order.

    Raises
    ------
    ValueError
        If `kernel` is not one of the supported names.
    """
    if kernel not in ('k_gram', 'k_gram_gaussian'):
        raise ValueError(
            "unknown kernel {!r}; expected 'k_gram' or 'k_gram_gaussian'".format(kernel)
        )

    X_train_process = _kuplet_features(X_train, k)
    X_test_process = _kuplet_features(X_test, k)

    if kernel == 'k_gram':
        K = X_train_process.dot(X_train_process.T)
        K_test = X_test_process.dot(X_train_process.T)
    else:
        K = _exp_distance_kernel(X_train_process, X_train_process, gamma)
        K_test = _exp_distance_kernel(X_test_process, X_train_process, gamma)

    # kktreg is now forwarded for both kernels (it used to be dropped silently
    # in the gaussian branch).
    w = solve_svm(K, Y_train, lamb=lamb, kktreg=kktreg)

    # Only the first n entries of the solver output are used as per-training-
    # sample coefficients. Flattening makes the comparison below independent
    # of whether the solver returned a flat vector or a column vector.
    n = K.shape[0]
    Y_predicted = np.ravel(np.dot(K_test, w[:n]) > 0.)
    Y_predicted_train = np.ravel(np.dot(K, w[:n]) > 0.)

    result = (np.ravel(Y_test) + 1) / 2 == Y_predicted
    result_train = (np.ravel(Y_train) + 1) / 2 == Y_predicted_train

    # Warn when the classifier is degenerate (a single class predicted).
    # np.all / np.any replace np.alltrue, which was removed in NumPy 2.0.
    if np.all(Y_predicted):
        print("Toute les valeurs sont TRUE")
    if not np.any(Y_predicted):
        print("Toute les valeurs sont FALSE")
    return np.mean(result), np.mean(result_train)

    

In [15]:
# Evaluate the plain dot-product kernel ('k_gram') on 3-uplet counts.
# solve_svm_kernel returns (test accuracy, train accuracy), in that order.
# gamma is only read by the 'k_gram_gaussian' branch, so it has no effect here.
acc_test, acc_train = solve_svm_kernel(
    X_train, X_test, Y_train, Y_test,
    kernel='k_gram', k=3, lamb=0.05, gamma=3.0,
)

print('accuracy for train : {}'.format(acc_train))
print('accuracy for test : {}'.format(acc_test))



     pcost       dcost       gap    pres   dres
 0:  3.8091e-01  5.2402e+02  1e+04  3e+00  1e+06
 1:  2.4769e+00 -1.8882e+02  2e+02  4e-02  1e+04
 2:  2.4345e+00 -8.2675e+00  1e+01  2e-03  7e+02
 3:  1.8661e+00 -3.1729e-01  2e+00  2e-04  7e+01
 4:  6.8717e-01  4.6789e-01  2e-01  6e-06  2e+00
 5:  6.0234e-01  5.4354e-01  6e-02  1e-06  5e-01
 6:  5.8274e-01  5.6256e-01  2e-02  4e-07  1e-01
 7:  5.7657e-01  5.6861e-01  8e-03  9e-08  3e-02
 8:  5.7346e-01  5.7142e-01  2e-03  2e-08  7e-03
 9:  5.7269e-01  5.7212e-01  6e-04  2e-09  1e-03
10:  5.7243e-01  5.7235e-01  8e-05  9e-10  1e-04
11:  5.7240e-01  5.7239e-01  6e-06  1e-09  9e-06
12:  5.7239e-01  5.7239e-01  3e-07  1e-09  4e-07
13:  5.7239e-01  5.7239e-01  4e-09  1e-09  6e-09
Optimal solution found.
accuracy for train : 0.768125
accuracy for test : 0.7675
