In [28]:
import pandas as pd
import numpy as np
# from numba import njit

import matplotlib.pyplot as plt
from matplotlib import cm
from numpy import linalg as LA
from tqdm import tqdm

from kernel import *

from kernel_functions import * # gram_phi, count_kuplet_k, count_kuplet_3
from preprocessing import preprocessing

In [107]:
def linear_kernel(x, z):
    """Linear kernel between the rows of ``x`` and the rows of ``z``.

    Returns the matrix of pairwise dot products ``x . z^T``; for 2-D inputs
    entry ``[i, j]`` is the dot product of ``x[i]`` and ``z[j]``.
    """
    return np.dot(x, z.T)

def gaussian_kernel(x, z, gamma=1.0):
    """Gaussian (RBF) kernel between the rows of ``x`` and the rows of ``z``.

    Computes ``K[i, j] = exp(-gamma * ||x[i] - z[j]||**2)``.

    The result has shape ``(len(x), len(z))``, i.e. the same orientation as
    ``linear_kernel(x, z)``, so it can be used interchangeably as a train
    Gram matrix or as a test-vs-train kernel matrix.

    Parameters
    ----------
    x : array-like of shape (n_x, d)
    z : array-like of shape (n_z, d)
    gamma : float, default 1.0
        Inverse width of the Gaussian; larger values give a narrower kernel.

    Returns
    -------
    numpy.ndarray of shape (n_x, n_z)
    """
    x = np.asarray(x, dtype=float)
    z = np.asarray(z, dtype=float)
    # ||x_i - z_j||^2 = ||x_i||^2 + ||z_j||^2 - 2 <x_i, z_j>, evaluated for all
    # pairs at once instead of looping over the rows of z.
    sq_dists = (
        np.sum(x ** 2, axis=1)[:, None]
        + np.sum(z ** 2, axis=1)[None, :]
        - 2.0 * np.dot(x, z.T)
    )
    # Rounding can leave tiny negative values where the true distance is zero.
    np.maximum(sq_dists, 0.0, out=sq_dists)
    return np.exp(-gamma * sq_dists)

def fit(K, y, lamb = 0.1):
    """Fit a kernel SVM by solving its dual quadratic program.

    The problem handed to ``solvers.qp`` is::

        minimize    a^T (K * y y^T) a - 1^T a
        subject to  0 <= a_i <= lamb   for every i
                    y^T a = 0

    Parameters
    ----------
    K : numpy.ndarray of shape (n, n)
        Gram matrix of the training points.
    y : array-like of shape (n,)
        Training labels; the callers in this notebook pass values in {-1, +1}.
    lamb : float, default 0.1
        Upper bound of the box constraint on every dual variable.

    Returns
    -------
    alphas : numpy.ndarray of shape (n, 1)
        Dual solution multiplied by the labels, so that the decision function
        is ``K_test.dot(alphas) + bias``.
    bias : float
        Mean residual ``y - K.dot(alphas)`` over all training points.
    """
    # The solver only accepts double-precision matrices, so integer labels
    # must be converted before building the problem.
    y = np.asarray(y, dtype=float).reshape(-1)
    y_col = y.reshape((-1, 1))

    # We solve the Dual
    NUM = K.shape[0]
    P = matrix(2 * K * y_col.dot(y_col.T))
    q = matrix(-np.ones((NUM, 1)))
    # Stacked inequality constraints: -a <= 0 and a <= lamb.
    G = matrix(np.concatenate((-np.eye(NUM), np.eye(NUM)), axis=0))
    h = matrix(np.concatenate((np.zeros(NUM), lamb* np.ones(NUM)),axis=0))
    A = matrix(y.reshape(1, -1))
    b = matrix(np.zeros(1))
    solvers.options['show_progress'] = False
    sol = solvers.qp(P, q, G, h, A, b)

    alphas = np.array(sol['x']) * y_col
    # Subtract column from column: a flat y against the (n, 1) decision values
    # would broadcast into an n x n matrix before averaging.
    bias = np.mean(y_col - np.dot(K, alphas))
    return alphas, bias


def predict(alphas, bias, K_test):
    """Classify test points from their kernel values against the training set.

    The decision value of each test point is ``K_test.dot(alphas) + bias``.
    The returned flat array holds ``+1.0`` where that value is strictly
    positive and ``-1.0`` everywhere else.
    """
    scores = np.dot(K_test, alphas) + bias
    labels = np.where(scores > 0., 1.0, -1.0)
    return labels.reshape(-1)

def testing_lambda(X_train, Y_train, X_test, Y_test, lamb=0.1, kernel='linear', gamma=1000, Kernel=None):
    if kernel=='linear':
        K = linear_kernel(X_train, X_train)
        K_test = linear_kernel(X_test, X_train)
    if kernel=='gaussian':
        K = gaussian_kernel(X_train, X_train)
        K_test = gaussian_kernel(X_test, X_train)
    if kernel=='custom':
        K = Kernel[0]
        K = Kernel[1]
        
    alphas, bias = fit(K, Y_train, lamb=lamb)
    
    Y_pred = predict(alphas, bias, K_test)
    acc_test = np.sum(Y_pred == Y_test)/Y_test.shape[0]
    
    Y_pred_train = predict(alphas, bias, K)
    acc_train = np.sum(Y_pred_train == Y_train)/X_train.shape[0]
    
    return acc_train, acc_test



In [4]:
# Load the raw training sequences of the three datasets as flat numpy arrays.
X_raw0 = pd.read_csv('data/Xtr0.csv', sep= ' ', header = None).values.reshape((-1))
X_raw1 = pd.read_csv('data/Xtr1.csv', sep=' ', header=None).values.reshape((-1))
X_raw2 = pd.read_csv('data/Xtr2.csv', sep=' ', header=None).values.reshape((-1))

# Transform the training sequences with to_k_fold (defined outside this notebook).
X_0 = to_k_fold(X_raw0, fold=32)
X_1 = to_k_fold(X_raw1, fold=32)
X_2 = to_k_fold(X_raw2, fold=32)

# Raw sequences to predict on, one file per dataset.
X_valid0 = pd.read_csv('data/Xte0.csv', sep=' ', header=None).values.reshape((-1))
X_valid1 = pd.read_csv('data/Xte1.csv', sep=' ', header=None).values.reshape((-1))
# Dataset 2 must read its own file; it previously re-read Xte1.csv.
X_valid2 = pd.read_csv('data/Xte2.csv', sep=' ', header=None).values.reshape((-1))


# Training labels: the 'Bound' column of each label file.
Y0 = pd.read_csv('data/Ytr0.csv', sep=',', header=0)['Bound'].values
Y1 = pd.read_csv('data/Ytr1.csv', sep=',', header=0)['Bound'].values
Y2 = pd.read_csv('data/Ytr2.csv', sep=',', header=0)['Bound'].values

100%|██████████| 2000/2000 [00:27<00:00, 73.89it/s]
100%|██████████| 2000/2000 [00:28<00:00, 70.29it/s]
100%|██████████| 2000/2000 [00:27<00:00, 73.23it/s]


sequences shape:  (2000,)
sequence first row:  TCCTCAACTTTTATTGGGCCGCTGTGGCACCAGAATCTACGAATGGCGCCCTCTAGAGTTGTGTAAAGAAGTGGCGTCACCTCATTATAAATAAAAGGTTG
labels shape (2000,)


In [116]:
# Sweep the box-constraint bound on dataset 0 with the default (linear) kernel.
# preprocessing is defined outside this notebook; presumably percent=0.8 keeps
# 80% of the samples for training -- TODO confirm.
X_train, Y_train, X_test, Y_test = preprocessing(X_0, Y0, percent=0.8)
report_line = "lamb = {}, acc train = {}, acc_test = {}"
for lamb in np.arange(1e-6, 1e-5, 1e-6):
    acc_train, acc_test = testing_lambda(X_train, Y_train, X_test, Y_test, lamb=lamb)
    print(report_line.format(lamb, acc_train, acc_test))

lamb = 1e-06, acc train = 0.78375, acc_test = 0.71
lamb = 2e-06, acc train = 0.79375, acc_test = 0.7225
lamb = 3e-06, acc train = 0.811875, acc_test = 0.7325
lamb = 4e-06, acc train = 0.82, acc_test = 0.75
lamb = 4.9999999999999996e-06, acc train = 0.823125, acc_test = 0.7475
lamb = 5.999999999999999e-06, acc train = 0.8275, acc_test = 0.7525
lamb = 7e-06, acc train = 0.830625, acc_test = 0.7475
lamb = 8e-06, acc train = 0.83625, acc_test = 0.745
lamb = 9e-06, acc train = 0.841875, acc_test = 0.7475


In [117]:
# Sweep the box-constraint bound on dataset 1 with the default (linear) kernel.
X_train, Y_train, X_test, Y_test = preprocessing(X_1, Y1, percent=0.8)
report_line = "lamb = {}, acc train = {}, acc_test = {}"
for lamb in np.arange(1e-6, 1e-5, 1e-6):
    acc_train, acc_test = testing_lambda(X_train, Y_train, X_test, Y_test, lamb=lamb)
    print(report_line.format(lamb, acc_train, acc_test))

lamb = 1e-06, acc train = 0.73625, acc_test = 0.71
lamb = 2e-06, acc train = 0.773125, acc_test = 0.7375
lamb = 3e-06, acc train = 0.80875, acc_test = 0.78
lamb = 4e-06, acc train = 0.843125, acc_test = 0.8
lamb = 4.9999999999999996e-06, acc train = 0.86375, acc_test = 0.8225
lamb = 5.999999999999999e-06, acc train = 0.8775, acc_test = 0.84
lamb = 7e-06, acc train = 0.888125, acc_test = 0.8475
lamb = 8e-06, acc train = 0.895, acc_test = 0.85
lamb = 9e-06, acc train = 0.905, acc_test = 0.8525


In [119]:
# Sweep the box-constraint bound on dataset 2 with the default (linear) kernel.
X_train, Y_train, X_test, Y_test = preprocessing(X_2, Y2, percent=0.8)
report_line = "lamb = {}, acc train = {}, acc_test = {}"
for lamb in np.arange(1e-6, 1e-5, 1e-6):
    acc_train, acc_test = testing_lambda(X_train, Y_train, X_test, Y_test, lamb=lamb)
    print(report_line.format(lamb, acc_train, acc_test))

lamb = 1e-06, acc train = 0.695625, acc_test = 0.625
lamb = 2e-06, acc train = 0.7025, acc_test = 0.63
lamb = 3e-06, acc train = 0.71625, acc_test = 0.655
lamb = 4e-06, acc train = 0.736875, acc_test = 0.6475
lamb = 4.9999999999999996e-06, acc train = 0.756875, acc_test = 0.665
lamb = 5.999999999999999e-06, acc train = 0.7675, acc_test = 0.6575
lamb = 7e-06, acc train = 0.77, acc_test = 0.6575
lamb = 8e-06, acc train = 0.778125, acc_test = 0.66
lamb = 9e-06, acc train = 0.786875, acc_test = 0.65


## TESTING


In [27]:
def test(X_train, Y_train, X_test, lamb=1.):
    """Train a linear-kernel SVM with ``solve_svm`` and predict ``X_test``.

    A constant column of ones is appended to both feature matrices before the
    linear Gram matrices are built. ``solve_svm`` is defined outside this
    notebook; the first ``n`` entries of its result (``n`` = number of
    training points) are used as the coefficients of the decision function.

    Prints ``lamb`` and the training accuracy, plus a warning line when every
    prediction falls in the same class.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        2-D feature matrices with the same number of columns.
    Y_train : numpy.ndarray
        Training labels; they are mapped with ``(Y_train + 1) / 2`` before
        being compared with the boolean predictions.
    lamb : float
        Regularisation parameter forwarded to ``solve_svm``.

    Returns
    -------
    numpy.ndarray
        Float predictions for ``X_test``: 1.0 where the decision value is
        strictly positive, 0.0 otherwise.
    """
    # X_train, Y_train, X_test, Y_test = preprocessing(X, Y, percent=0.8)
    X_train_preprocess = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), axis=1)
    X_test_preprocess = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), axis=1)
    K = X_train_preprocess.dot(X_train_preprocess.T)
    K_test = X_test_preprocess.dot(X_train_preprocess.T)
    w = solve_svm(K, Y_train, lamb=lamb, kktreg = 1e-9)
    n = K.shape[0]
    Y_predicted = np.dot(K_test, w[:n]) > 0.
    # Adding 0.0 turns the boolean predictions into 0.0 / 1.0 floats.
    Y_predicted = Y_predicted + 0.0
    # result = ((Y_test+1.)/ 2. == np.transpose(Y_predicted))
    Y_predicted_train = np.dot(K, w[:n]) > 0.
    result_train = ((Y_train+1)/ 2 == np.transpose(Y_predicted_train))
    # np.alltrue was removed in NumPy 2.0; np.all is the supported equivalent.
    if np.all(Y_predicted):
        print("Toute les valeurs sont TRUE")
    if np.all(Y_predicted==False):
        print("Toute les valeurs sont FALSE")
    print("lambda = {}".format(lamb))
    print("train : {}".format(np.mean(result_train)))
    return Y_predicted

In [22]:
# Transform the sequences to predict on with to_k_fold, one dataset at a time.
# NOTE(review): the training sequences are transformed with fold=32 while this
# cell uses fold=128 -- confirm that both settings are meant to differ.
X_test0, X_test1, X_test2 = (
    to_k_fold(sequences, fold=128)
    for sequences in (X_valid0, X_valid1, X_valid2)
)

100%|██████████| 1000/1000 [00:33<00:00, 29.52it/s]
100%|██████████| 1000/1000 [00:31<00:00, 32.20it/s]
100%|██████████| 1000/1000 [00:31<00:00, 31.45it/s]


In [28]:
# Rescale the labels of dataset 0: 0 -> -1 and 1 -> +1.
Y0_t = (Y0 - 0.5) *2
# fit() expects the Gram matrix of the training points and predict() expects
# the test-vs-train kernel matrix, so both are built explicitly here rather
# than passing the feature matrices themselves.
# NOTE(review): this assumes X_0 and X_test0 share the same feature dimension
# (they are produced with fold=32 and fold=128 respectively) -- confirm.
K0 = linear_kernel(X_0, X_0)
K0_test = linear_kernel(X_test0, X_0)
w, bias = fit(K0, Y0_t, lamb=1.)
Y_pred0 = predict(w, bias, K0_test)

     pcost       dcost       gap    pres   dres
 0:  9.2621e+00  1.2864e+01  5e+03  2e+00  3e+06
 1:  7.2937e+00 -2.8847e+02  3e+02  1e-01  2e+05
 2:  2.5566e+00 -5.0199e+01  5e+01  2e-02  3e+04
 3:  1.6728e+00 -5.7320e+00  7e+00  2e-03  3e+03
 4:  1.3318e+00  8.2074e-02  1e+00  2e-04  3e+02
 5:  6.3867e-01  5.2630e-01  1e-01  4e-06  8e+00
 6:  5.9189e-01  5.6471e-01  3e-02  9e-07  2e+00
 7:  5.8054e-01  5.7490e-01  6e-03  1e-07  2e-01
 8:  5.7782e-01  5.7717e-01  7e-04  9e-09  2e-02
 9:  5.7752e-01  5.7743e-01  9e-05  1e-09  2e-03
10:  5.7747e-01  5.7747e-01  4e-06  7e-10  7e-05
11:  5.7747e-01  5.7747e-01  8e-08  7e-10  1e-06
12:  5.7747e-01  5.7747e-01  2e-09  7e-10  2e-08
Optimal solution found.
lambda = 1.5
train : 0.833


In [29]:
# Rescale the labels of dataset 1: 0 -> -1 and 1 -> +1.
Y1_t = (Y1 - 0.5) *2
# fit() expects the Gram matrix of the training points and predict() expects
# the test-vs-train kernel matrix, so both are built explicitly here rather
# than passing the feature matrices themselves.
# NOTE(review): this assumes X_1 and X_test1 share the same feature dimension
# (they are produced with fold=32 and fold=128 respectively) -- confirm.
K1 = linear_kernel(X_1, X_1)
K1_test = linear_kernel(X_test1, X_1)
w, bias = fit(K1, Y1_t, lamb=1.)
Y_pred1 = predict(w, bias, K1_test)

     pcost       dcost       gap    pres   dres
 0:  6.9083e+00  1.0034e+01  5e+03  2e+00  2e+07
 1:  5.5102e+00 -2.2544e+02  2e+02  8e-02  1e+06
 2:  1.8893e+00 -2.8892e+01  3e+01  9e-03  1e+05
 3:  1.4867e+00 -3.5336e+00  5e+00  1e-03  2e+04
 4:  1.1001e+00 -8.0780e-02  1e+00  2e-04  2e+03
 5:  4.9247e-01  3.3941e-01  2e-01  1e-05  2e+02
 6:  4.2388e-01  3.8429e-01  4e-02  3e-06  4e+01
 7:  4.0657e-01  3.9766e-01  9e-03  5e-07  7e+00
 8:  4.0255e-01  4.0082e-01  2e-03  8e-08  1e+00
 9:  4.0166e-01  4.0152e-01  1e-04  5e-09  8e-02
10:  4.0159e-01  4.0158e-01  7e-06  7e-10  3e-03
11:  4.0159e-01  4.0159e-01  3e-07  8e-10  1e-04
12:  4.0159e-01  4.0159e-01  8e-09  8e-10  2e-06
13:  4.0159e-01  4.0159e-01  3e-10  8e-10  2e-08
Optimal solution found.
lambda = 2.0
train : 0.9205


In [30]:
# Rescale the labels of dataset 2: 0 -> -1 and 1 -> +1.
Y2_t = (Y2 - 0.5) *2
# fit() expects the Gram matrix of the training points and predict() expects
# the test-vs-train kernel matrix, so both are built explicitly here rather
# than passing the feature matrices themselves.
# NOTE(review): this assumes X_2 and X_test2 share the same feature dimension
# (they are produced with fold=32 and fold=128 respectively) -- confirm.
K2 = linear_kernel(X_2, X_2)
K2_test = linear_kernel(X_test2, X_2)
w, bias = fit(K2, Y2_t, lamb=1.)
Y_pred2 = predict(w, bias, K2_test)

     pcost       dcost       gap    pres   dres
 0:  1.5420e+01  2.1448e+01  5e+03  2e+00  6e+06
 1:  1.0812e+01 -3.7609e+02  4e+02  1e-01  5e+05
 2:  2.9521e+00 -5.2604e+01  6e+01  2e-02  6e+04
 3:  1.8370e+00 -6.9842e+00  9e+00  2e-03  8e+03
 4:  1.5149e+00  1.9806e-01  1e+00  1e-04  5e+02
 5:  8.2708e-01  6.2501e-01  2e-01  2e-05  7e+01
 6:  7.2171e-01  6.8425e-01  4e-02  3e-06  1e+01
 7:  7.0418e-01  6.9742e-01  7e-03  3e-07  1e+00
 8:  7.0081e-01  7.0005e-01  8e-04  3e-08  1e-01
 9:  7.0044e-01  7.0035e-01  9e-05  3e-09  1e-02
10:  7.0039e-01  7.0039e-01  6e-06  7e-10  7e-04
11:  7.0039e-01  7.0039e-01  2e-07  8e-10  2e-05
12:  7.0039e-01  7.0039e-01  5e-09  8e-10  3e-07
13:  7.0039e-01  7.0039e-01  2e-10  8e-10  3e-09
Optimal solution found.
lambda = 1.8
train : 0.769


In [33]:
# Flatten each prediction vector. predict() returns a 1-D array, for which the
# previous `np.transpose(...)[:][0]` picked a single scalar instead of the
# whole vector; reshape(-1) works for both 1-D and (n, 1) inputs.
test0 = np.asarray(Y_pred0).reshape(-1)
test1 = np.asarray(Y_pred1).reshape(-1)
test2 = np.asarray(Y_pred2).reshape(-1)

# Write 0/1 labels: thresholding at zero maps -1 -> 0 and +1 -> 1, and leaves
# predictions that are already 0/1 unchanged.
# NOTE(review): assumes the expected 'Bound' values are 0/1 like the training
# labels appear to be -- confirm against the submission format.
predictions = np.concatenate((test0, test1, test2), axis=0).reshape((-1))
bound = (predictions > 0).astype(int)

# One row per prediction; Id is the running index over the three datasets.
final = pd.DataFrame(np.arange(bound.shape[0]), columns=['Id'])
final['Bound'] = bound
final.to_csv('resultk_fold.csv', index=False)