In [1]:
import pandas as pd
import numpy as np
# from numba import njit

import matplotlib.pyplot as plt
from matplotlib import cm
from numpy import linalg as LA
from tqdm import tqdm

from kernel import *

from kernel_functions import * # gram_phi, count_kuplet_k, count_kuplet_3
from preprocessing import preprocessing

In [4]:
from itertools import product

def binary_space(k):
    """
    return all logical possibilities
    binary_space(2) = [[0,0], [0,1], [1, 0], [1, 1]]
    """
    space = np.zeros((2**k, k))
    for i in range(2**k):
        tmp = bin(i)[2:]
        for j in range(len(tmp)):
            space[i,k+j-len(tmp)] = int(tmp[j])
    return space

def count_k_fold(sent, fold = 16):
    k_grams_count = dict()
    for fold in np.arange(3, fold, 2):
        k_grams_count = count_kuplet_gap(sent, k = 3, fold = fold, k_grams_count=k_grams_count)
    return np.array([k_grams_count[key] for key in sorted(k_grams_count)])

def count_kuplet_k(seq, k=3):
    base_azote = ['A', 'C', 'T', 'G']
    dic_k = [''.join(i) for i in product(base_azote, repeat=k)]
    nb_feat = len(dic_k)

    k_grams_count = np.zeros(nb_feat)
    for i, e in enumerate(dic_k):
        k_grams_count[i] = seq.count(e)
    return k_grams_count

def count_kuplet_gap(seq, k=3, fold=5, k_grams_count=None):
    assert fold%2 == 1 or fold ==2, "fold must be odd"
    fold_size = len(bin(fold)[2:])
    fold = bin(fold)[2:]
    base_azote = ['A', 'C', 'T', 'G']
    
    if k_grams_count is None:
        k_grams_count = dict()
    tab = [''.join(i) for i in product(base_azote, repeat=np.sum([int(i) for i in fold]))]
    for code in tab:
        # print(''.join([code, '_', fold]))
        k_grams_count[''.join([code, '_', fold])] = 0.
        
    for i in range(len(seq) - fold_size+1):
        l = seq[i:(i+fold_size)]
        # print(l)
        l = ''.join(l[i]*int(fold[i]) for i in range(len(fold)))
        k_grams_count[''.join([l, '_', fold])] += 1
    return k_grams_count

def is_power(n):
    n = n/2
    if n == 2:
        return True
    elif n > 2:
        return is_power(n)
    else:
        return False
    
def to_k_fold(X, fold=64):
    X_process = []
    for sent in tqdm(X):
        X_process.append(count_k_fold(sent, fold = 64))
    return np.array(X_process)
    
# count_kuplet_gap(X_raw0[0], k =3, fold = 7)

In [5]:
# load all data as the numpy array type
#X = pd.read_csv('data/Xtr1_mat50.csv', sep=' ', header=None).values
X_raw0 = pd.read_csv('data/Xtr0.csv', sep= ' ', header = None).values.reshape((-1))
X_raw1 = pd.read_csv('data/Xtr1.csv', sep=' ', header=None).values.reshape((-1))
X_raw2 = pd.read_csv('data/Xtr2.csv', sep=' ', header=None).values.reshape((-1))

X_0 = to_k_fold(X_raw0, fold=128)
X_1 = to_k_fold(X_raw1, fold=128)
X_2 = to_k_fold(X_raw2, fold=128)

# transform to an array of string
X_valid0 = pd.read_csv('data/Xte0.csv', sep=' ', header=None).values.reshape((-1))
X_valid1 = pd.read_csv('data/Xte1.csv', sep=' ', header=None).values.reshape((-1))
X_valid2 = pd.read_csv('data/Xte1.csv', sep=' ', header=None).values.reshape((-1))


Y0 = pd.read_csv('data/Ytr0.csv', sep=',', header=0)['Bound'].values
Y1 = pd.read_csv('data/Ytr1.csv', sep=',', header=0)['Bound'].values
Y2 = pd.read_csv('data/Ytr2.csv', sep=',', header=0)['Bound'].values

#print('numerical features shape', X.shape)
#print('numerical features first row', X[0])
print('sequences shape: ', X_raw0.shape)
print('sequence first row: ', X_raw0[0])
print('labels shape', Y0.shape)

100%|██████████| 2000/2000 [01:01<00:00, 32.62it/s]
100%|██████████| 2000/2000 [00:59<00:00, 33.59it/s]
100%|██████████| 2000/2000 [00:58<00:00, 33.99it/s]


sequences shape:  (2000,)
sequence first row:  TCCTCAACTTTTATTGGGCCGCTGTGGCACCAGAATCTACGAATGGCGCCCTCTAGAGTTGTGTAAAGAAGTGGCGTCACCTCATTATAAATAAAAGGTTG
labels shape (2000,)


In [13]:
def test_lambda(X_train, Y_train, X_test, Y_test, lamb=1.):
    # X_train, Y_train, X_test, Y_test = preprocessing(X, Y, percent=0.8)
    X_train_preprocess = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), axis=1)
    X_test_preprocess = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), axis=1)
    K = X_train_preprocess.dot(X_train_preprocess.T)
    K_test = X_test_preprocess.dot(X_train_preprocess.T)
    w = solve_svm(K, Y_train, lamb=lamb, kktreg = 1e-9)
    n = K.shape[0]
    Y_predicted = np.dot(K_test, w[:n]) > 0.
    result = ((Y_test+1.)/ 2. == np.transpose(Y_predicted))
    Y_predicted_train = np.dot(K, w[:n]) > 0.
    result_train = ((Y_train+1)/ 2 == np.transpose(Y_predicted_train))
    if np.alltrue(Y_predicted):
        print("Toute les valeurs sont TRUE")
    if np.alltrue(Y_predicted==False):
        print("Toute les valeurs sont FALSE")
    print("lambda = {}".format(lamb))
    print("test : {}".format(np.mean(result)))
    print("train : {}".format(np.mean(result_train)))

In [17]:
X_train, Y_train, X_test, Y_test = preprocessing(X_0, Y0, percent=0.8)
for lamb in [1.4, 1.5, 1.6, 1.7]:
    test_lambda(X_train, Y_train, X_test, Y_test, lamb=lamb)

     pcost       dcost       gap    pres   dres
 0:  4.9961e+00  6.7059e+00  4e+03  1e+00  2e+06
 1:  4.9969e+00 -1.4797e+02  2e+02  7e-02  7e+04
 2:  2.0543e+00 -2.5059e+01  3e+01  1e-02  1e+04
 3:  1.5281e+00 -2.4387e+00  4e+00  1e-03  1e+03
 4:  1.1122e+00  2.7107e-01  8e-01  1e-04  1e+02
 5:  5.9369e-01  5.0664e-01  9e-02  8e-06  9e+00
 6:  5.5666e-01  5.3876e-01  2e-02  1e-06  1e+00
 7:  5.4841e-01  5.4619e-01  2e-03  1e-07  1e-01
 8:  5.4733e-01  5.4714e-01  2e-04  7e-09  8e-03
 9:  5.4723e-01  5.4722e-01  8e-06  6e-10  3e-04
10:  5.4723e-01  5.4723e-01  3e-07  7e-10  8e-06
11:  5.4723e-01  5.4723e-01  1e-08  7e-10  2e-07
12:  5.4723e-01  5.4723e-01  5e-10  7e-10  2e-09
Optimal solution found.
lambda = 1.4
test : 0.7375
train : 0.855625
     pcost       dcost       gap    pres   dres
 0:  5.3066e+00  7.1081e+00  4e+03  1e+00  2e+06
 1:  5.1927e+00 -1.5405e+02  2e+02  7e-02  8e+04
 2:  2.0859e+00 -2.5865e+01  3e+01  1e-02  1e+04
 3:  1.5422e+00 -2.5156e+00  4e+00  1e-03  1e+03
 4:

In [19]:
X_train, Y_train, X_test, Y_test = preprocessing(X_1, Y1, percent=0.8)
for lamb in [1.7, 1.9, 2.1, 2.3]:
    test_lambda(X_train, Y_train, X_test, Y_test, lamb=lamb)

     pcost       dcost       gap    pres   dres
 0:  3.6114e+00  5.1712e+00  3e+03  1e+00  2e+07
 1:  3.8436e+00 -1.3358e+02  1e+02  6e-02  6e+05
 2:  1.7047e+00 -2.0705e+01  2e+01  9e-03  1e+05
 3:  1.3715e+00 -3.0647e+00  4e+00  1e-03  1e+04
 4:  1.0449e+00 -2.5857e-01  1e+00  3e-04  3e+03
 5:  4.6377e-01  3.2468e-01  1e-01  1e-09  2e-08
 6:  3.9624e-01  3.5808e-01  4e-02  9e-10  8e-09
 7:  3.7833e-01  3.6937e-01  9e-03  8e-10  3e-09
 8:  3.7391e-01  3.7237e-01  2e-03  7e-10  6e-10
 9:  3.7305e-01  3.7299e-01  7e-05  7e-10  2e-10
10:  3.7302e-01  3.7301e-01  2e-06  8e-10  4e-11
11:  3.7302e-01  3.7302e-01  4e-08  8e-10  9e-12
Optimal solution found.
lambda = 1.7
test : 0.87
train : 0.9375
     pcost       dcost       gap    pres   dres
 0:  3.9754e+00  5.6540e+00  4e+03  1e+00  2e+07
 1:  4.0634e+00 -1.4099e+02  2e+02  6e-02  7e+05
 2:  1.7385e+00 -2.1771e+01  2e+01  9e-03  1e+05
 3:  1.3964e+00 -3.2799e+00  5e+00  1e-03  2e+04
 4:  1.0674e+00 -2.4346e-01  1e+00  3e-04  3e+03
 5:  4.

In [21]:
X_train, Y_train, X_test, Y_test = preprocessing(X_2, Y2, percent=0.7)
for lamb in [1.2, 1.9, 2.5, 2.8]:
    test_lambda(X_train, Y_train, X_test, Y_test, lamb=lamb)

     pcost       dcost       gap    pres   dres
 0:  4.5135e+00  5.9504e+00  3e+03  1e+00  6e+06
 1:  4.8616e+00 -1.2577e+02  1e+02  7e-02  3e+05
 2:  2.1163e+00 -2.1309e+01  2e+01  1e-02  4e+04
 3:  1.5880e+00 -1.8123e+00  3e+00  1e-03  4e+03
 4:  1.0707e+00  5.0819e-01  6e-01  4e-05  1e+02
 5:  6.7055e-01  5.9518e-01  8e-02  5e-06  2e+01
 6:  6.2882e-01  6.1826e-01  1e-02  5e-07  2e+00
 7:  6.2312e-01  6.2200e-01  1e-03  2e-08  8e-02
 8:  6.2251e-01  6.2243e-01  8e-05  1e-09  5e-03
 9:  6.2246e-01  6.2246e-01  3e-06  7e-10  1e-04
10:  6.2246e-01  6.2246e-01  1e-07  7e-10  2e-06
11:  6.2246e-01  6.2246e-01  6e-09  7e-10  3e-08
Optimal solution found.
lambda = 1.2
test : 0.6666666666666666
train : 0.8292857142857143
     pcost       dcost       gap    pres   dres
 0:  6.8106e+00  8.8087e+00  3e+03  2e+00  6e+06
 1:  6.4093e+00 -1.6799e+02  2e+02  9e-02  3e+05
 2:  2.3631e+00 -2.8806e+01  3e+01  1e-02  5e+04
 3:  1.7121e+00 -2.4984e+00  4e+00  1e-03  5e+03
 4:  1.2116e+00  5.8457e-01  6

In [128]:
X_process = np.array(X_process)
X_train = X_process[:1600]
X_test = X_process[1600:]
Y_train = (Y0[:1600] - 0.5) * 2.
Y_test = (Y0[1600:] - 0.5) * 2.
X_train_preprocess = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), axis=1)
X_test_preprocess = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), axis=1)

# compute gram matrices from features and solve svm
K = X_train_preprocess.dot(X_train_preprocess.T)
K_test = X_test_preprocess.dot(X_train_preprocess.T)
w = solve_svm(K, Y_train, lamb=1., kktreg = 1e-9)


n = K.shape[0]
Y_predicted = np.dot(K_test, w[:n]) > 0.
result = ((Y_test+1.)/ 2. == np.transpose(Y_predicted))
Y_predicted_train = np.dot(K, w[:n]) > 0.
result_train = ((Y_train+1)/ 2 == np.transpose(Y_predicted_train))
if np.alltrue(Y_predicted):
    print("Toute les valeurs sont TRUE")
if np.alltrue(Y_predicted==False):
    print("Toute les valeurs sont FALSE")
print(np.mean(result))
print(np.mean(result_train))

     pcost       dcost       gap    pres   dres
 0:  3.8250e+00  5.2381e+00  3e+03  1e+00  3e+06
 1:  4.2335e+00 -1.5158e+02  2e+02  7e-02  1e+05
 2:  2.0568e+00 -2.8655e+01  3e+01  1e-02  2e+04
 3:  1.5721e+00 -3.8152e+00  5e+00  2e-03  3e+03
 4:  1.1877e+00  6.6894e-02  1e+00  2e-04  3e+02
 5:  5.7140e-01  4.5960e-01  1e-01  9e-06  2e+01
 6:  5.1980e-01  4.9650e-01  2e-02  1e-06  3e+00
 7:  5.0905e-01  5.0477e-01  4e-03  2e-07  4e-01
 8:  5.0693e-01  5.0642e-01  5e-04  2e-08  3e-02
 9:  5.0665e-01  5.0663e-01  3e-05  8e-10  1e-03
10:  5.0664e-01  5.0664e-01  8e-07  7e-10  3e-05
11:  5.0664e-01  5.0664e-01  3e-08  7e-10  5e-07
12:  5.0664e-01  5.0664e-01  1e-09  7e-10  7e-09
Optimal solution found.
0.75
0.8675


## TESTING


In [27]:
def test(X_train, Y_train, X_test, lamb=1.):
    # X_train, Y_train, X_test, Y_test = preprocessing(X, Y, percent=0.8)
    X_train_preprocess = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), axis=1)
    X_test_preprocess = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), axis=1)
    K = X_train_preprocess.dot(X_train_preprocess.T)
    K_test = X_test_preprocess.dot(X_train_preprocess.T)
    w = solve_svm(K, Y_train, lamb=lamb, kktreg = 1e-9)
    n = K.shape[0]
    Y_predicted = np.dot(K_test, w[:n]) > 0.
    Y_predicted = Y_predicted + 0.0
    # result = ((Y_test+1.)/ 2. == np.transpose(Y_predicted))
    Y_predicted_train = np.dot(K, w[:n]) > 0.
    result_train = ((Y_train+1)/ 2 == np.transpose(Y_predicted_train))
    if np.alltrue(Y_predicted):
        print("Toute les valeurs sont TRUE")
    if np.alltrue(Y_predicted==False):
        print("Toute les valeurs sont FALSE")
    print("lambda = {}".format(lamb))
    print("train : {}".format(np.mean(result_train)))
    return Y_predicted

In [22]:
X_test0 = to_k_fold(X_valid0, fold=128)
X_test1 = to_k_fold(X_valid1, fold=128)
X_test2 = to_k_fold(X_valid2, fold=128)

100%|██████████| 1000/1000 [00:33<00:00, 29.52it/s]
100%|██████████| 1000/1000 [00:31<00:00, 32.20it/s]
100%|██████████| 1000/1000 [00:31<00:00, 31.45it/s]


In [28]:
Y0_t = (Y0 - 0.5) *2
Y_pred0 = test(X_0, Y0_t, X_test0, lamb=1.5)

     pcost       dcost       gap    pres   dres
 0:  9.2621e+00  1.2864e+01  5e+03  2e+00  3e+06
 1:  7.2937e+00 -2.8847e+02  3e+02  1e-01  2e+05
 2:  2.5566e+00 -5.0199e+01  5e+01  2e-02  3e+04
 3:  1.6728e+00 -5.7320e+00  7e+00  2e-03  3e+03
 4:  1.3318e+00  8.2074e-02  1e+00  2e-04  3e+02
 5:  6.3867e-01  5.2630e-01  1e-01  4e-06  8e+00
 6:  5.9189e-01  5.6471e-01  3e-02  9e-07  2e+00
 7:  5.8054e-01  5.7490e-01  6e-03  1e-07  2e-01
 8:  5.7782e-01  5.7717e-01  7e-04  9e-09  2e-02
 9:  5.7752e-01  5.7743e-01  9e-05  1e-09  2e-03
10:  5.7747e-01  5.7747e-01  4e-06  7e-10  7e-05
11:  5.7747e-01  5.7747e-01  8e-08  7e-10  1e-06
12:  5.7747e-01  5.7747e-01  2e-09  7e-10  2e-08
Optimal solution found.
lambda = 1.5
train : 0.833


In [29]:
Y1_t = (Y1 - 0.5) *2
Y_pred1 = test_lambda(X_1, Y1_t, X_test1, lamb=2.)

     pcost       dcost       gap    pres   dres
 0:  6.9083e+00  1.0034e+01  5e+03  2e+00  2e+07
 1:  5.5102e+00 -2.2544e+02  2e+02  8e-02  1e+06
 2:  1.8893e+00 -2.8892e+01  3e+01  9e-03  1e+05
 3:  1.4867e+00 -3.5336e+00  5e+00  1e-03  2e+04
 4:  1.1001e+00 -8.0780e-02  1e+00  2e-04  2e+03
 5:  4.9247e-01  3.3941e-01  2e-01  1e-05  2e+02
 6:  4.2388e-01  3.8429e-01  4e-02  3e-06  4e+01
 7:  4.0657e-01  3.9766e-01  9e-03  5e-07  7e+00
 8:  4.0255e-01  4.0082e-01  2e-03  8e-08  1e+00
 9:  4.0166e-01  4.0152e-01  1e-04  5e-09  8e-02
10:  4.0159e-01  4.0158e-01  7e-06  7e-10  3e-03
11:  4.0159e-01  4.0159e-01  3e-07  8e-10  1e-04
12:  4.0159e-01  4.0159e-01  8e-09  8e-10  2e-06
13:  4.0159e-01  4.0159e-01  3e-10  8e-10  2e-08
Optimal solution found.
lambda = 2.0
train : 0.9205


In [30]:
Y2_t = (Y2 - 0.5) *2
Y_pred2 = test_lambda(X_2, Y2_t, X_test2, lamb=1.8)

     pcost       dcost       gap    pres   dres
 0:  1.5420e+01  2.1448e+01  5e+03  2e+00  6e+06
 1:  1.0812e+01 -3.7609e+02  4e+02  1e-01  5e+05
 2:  2.9521e+00 -5.2604e+01  6e+01  2e-02  6e+04
 3:  1.8370e+00 -6.9842e+00  9e+00  2e-03  8e+03
 4:  1.5149e+00  1.9806e-01  1e+00  1e-04  5e+02
 5:  8.2708e-01  6.2501e-01  2e-01  2e-05  7e+01
 6:  7.2171e-01  6.8425e-01  4e-02  3e-06  1e+01
 7:  7.0418e-01  6.9742e-01  7e-03  3e-07  1e+00
 8:  7.0081e-01  7.0005e-01  8e-04  3e-08  1e-01
 9:  7.0044e-01  7.0035e-01  9e-05  3e-09  1e-02
10:  7.0039e-01  7.0039e-01  6e-06  7e-10  7e-04
11:  7.0039e-01  7.0039e-01  2e-07  8e-10  2e-05
12:  7.0039e-01  7.0039e-01  5e-09  8e-10  3e-07
13:  7.0039e-01  7.0039e-01  2e-10  8e-10  3e-09
Optimal solution found.
lambda = 1.8
train : 0.769


In [33]:
test0 = np.transpose(Y_pred0)[:][0]
test1 = np.transpose(Y_pred1)[:][0]
test2 = np.transpose(Y_pred2)[:][0]

bound = np.concatenate((test0,test1,test2), axis=0).reshape((-1)).astype(int)
final = pd.DataFrame(np.arange(3000), columns=['Id'])
final['Bound'] = bound
final.to_csv('resultk_fold.csv', index= None)