In [1]:
import pandas as pd
import numpy as np
# from numba import njit

import matplotlib.pyplot as plt
from matplotlib import cm
from numpy import linalg as LA
from tqdm import tqdm

from kernel import *

from kernel_functions import * # gram_phi, count_kuplet_k, count_kuplet_3
from preprocessing import preprocessing

In [2]:
from itertools import product
from collections import defaultdict

def count_kuplet_k(seq, k=3):
    base_azote = ['A', 'C', 'T', 'G']
    dic_k = [''.join(i) for i in product(base_azote, repeat=k)]
    nb_feat = len(dic_k)

    k_grams_count = np.zeros(nb_feat)
    for i, e in enumerate(dic_k):
        k_grams_count[i] = seq.count(e)
    return k_grams_count

def count_kuplet_gap(seq, k_tmp, k=3, fold=5, k_grams_count=None):
    assert fold%2 == 1 or fold ==2, "fold must be odd"
    fold_size = len(bin(fold)[2:])
    fold = bin(fold)[2:]
    base_azote = ['A', 'C', 'T', 'G']
    
    if k_grams_count is None:
        k_grams_count = dict()
        
    tab = [''.join(i) for i in product(base_azote, repeat=np.sum([int(i) for i in fold]))]
    for code in tab:
        # print(''.join([code, '_', fold]))
        k_grams_count[''.join([code, '_', fold])] = 0.
        k_tmp[''.join([code, '_', fold])] = 0.
        
    for i in range(len(seq) - fold_size+1):
        l = seq[i:(i+fold_size)]
        # print(l)
        l = ''.join(l[i]*int(fold[i]) for i in range(len(fold)))
        k_grams_count[''.join([l, '_', fold])] += 1./(len(seq) - fold_size+1)
        k_tmp[''.join([l, '_', fold])] += 1.
    return k_grams_count, k_tmp

def count_k_fold(sent, k_tmp, fold = 16):
    k_grams_count = dict()
    k_tmp = dict()
    for fold in np.arange(3, fold, 2):
        k_grams_count, k_tmp = count_kuplet_gap(sent, k_tmp, k = 3, fold = fold, k_grams_count=k_grams_count)
    return np.array([k_grams_count[key] for key in sorted(k_grams_count)]), np.array([k_tmp[key] for key in sorted(k_tmp)])
    
def to_k_fold(X, fold=64):
    X_process = []
    X_count = []
    k_tmp = defaultdict(int)
    for sent in tqdm(X):
        sent_process, k_tmp = count_k_fold(sent, k_tmp, fold = fold)
        X_process.append(sent_process)
        X_count.append(k_tmp)
    
    X_process = np.array(X_process) * len(X) * len(X[0])
    sum_tmp = np.sum(np.array(X_count), axis=0)
    
    X_process = np.divide(X_process, sum_tmp, out=np.zeros_like(X_process), where=sum_tmp!=0)
    
    return X_process
    

In [3]:
def linear_kernel(x, z):
    K = np.dot(x, z.T)
    return K

def fit(K, y, lamb = 0.1): 
    NUM = x.shape[0]
    DIM = x.shape[1]
    K = 2 * K * y.reshape((-1,1)).dot(y.reshape((1,-1)))
    P = matrix(K)
    q = matrix(-np.ones((NUM, 1)))
    G = matrix(-np.eye(NUM))
    h = matrix(np.zeros(NUM))
    A = matrix(y.reshape(1, -1))
    b = matrix(np.zeros(1))
    solvers.options['show_progress'] = False
    sol = solvers.qp(P, q, G, h, A, b)
    alphas = np.array(sol['x'])
    w = np.sum(alphas * y[:, None] * x, axis = 0)
    cond = (alphas > 1e-4).reshape(-1)
    b = y[cond] - np.dot(x[cond], w)
    bias = b[0]
    norm = np.linalg.norm(w)
    return w / norm, bias / norm

def predict(w, bias, x_test):
    mat = ((np.dot(x_test,w) + bias >0.)-0.5) * 2
    return mat

def testing_lambda(X_train, Y_train, X_test, Y_test, lamb=0.1):
    K = linear_kernel(X_train, X_train)
    w, bias = fit(X_train, Y_train, lamb=lamb)
    Y_pred = predict(w, bias, X_test)
    acc_test = np.sum(Y_pred == Y_test)/Y_test.shape[0]
    Y_pred_train = predict(w, bias, X_train)
    acc_train = np.sum(Y_pred_train == Y_train)/X_train.shape[0]
    return acc_train, acc_test

# testing_lambda(X_train, Y_train, X_test, Y_test, lamb=2.)

In [24]:
K

array([[ 35150.554567  ,   3611.81827955,   3987.99942574, ...,
         -4243.14455086,  -3256.5459857 ,  -6112.84313764],
       [  3611.81827955,  49385.88482724,   6588.01782678, ...,
         -4412.59551977,  -4419.16421517,  -6354.28470252],
       [  3987.99942574,   6588.01782678,  33198.76177274, ...,
         -7668.77191137,  -6606.96865718,  -6216.65937262],
       ..., 
       [ -4243.14455086,  -4412.59551977,  -7668.77191137, ...,
         46120.79528956,   2663.90169565,   4616.64490538],
       [ -3256.5459857 ,  -4419.16421517,  -6606.96865718, ...,
          2663.90169565,  35333.74581713,   6566.93772247],
       [ -6112.84313764,  -6354.28470252,  -6216.65937262, ...,
          4616.64490538,   6566.93772247,  48274.8599448 ]])

In [27]:
np.alltrue(K2 == K)

True

In [4]:
# load all data as the numpy array type
#X = pd.read_csv('data/Xtr1_mat50.csv', sep=' ', header=None).values
X_raw0 = pd.read_csv('data/Xtr0.csv', sep= ' ', header = None).values.reshape((-1))
X_raw1 = pd.read_csv('data/Xtr1.csv', sep=' ', header=None).values.reshape((-1))
X_raw2 = pd.read_csv('data/Xtr2.csv', sep=' ', header=None).values.reshape((-1))

X_0 = to_k_fold(X_raw0, fold=32)
X_1 = to_k_fold(X_raw1, fold=32)
X_2 = to_k_fold(X_raw2, fold=32)

# transform to an array of string
X_valid0 = pd.read_csv('data/Xte0.csv', sep=' ', header=None).values.reshape((-1))
X_valid1 = pd.read_csv('data/Xte1.csv', sep=' ', header=None).values.reshape((-1))
X_valid2 = pd.read_csv('data/Xte1.csv', sep=' ', header=None).values.reshape((-1))


Y0 = pd.read_csv('data/Ytr0.csv', sep=',', header=0)['Bound'].values
Y1 = pd.read_csv('data/Ytr1.csv', sep=',', header=0)['Bound'].values
Y2 = pd.read_csv('data/Ytr2.csv', sep=',', header=0)['Bound'].values

#print('numerical features shape', X.shape)
#print('numerical features first row', X[0])
print('sequences shape: ', X_raw0.shape)
print('sequence first row: ', X_raw0[0])
print('labels shape', Y0.shape)

100%|██████████| 2000/2000 [00:27<00:00, 73.89it/s]
100%|██████████| 2000/2000 [00:28<00:00, 70.29it/s]
100%|██████████| 2000/2000 [00:27<00:00, 73.23it/s]


sequences shape:  (2000,)
sequence first row:  TCCTCAACTTTTATTGGGCCGCTGTGGCACCAGAATCTACGAATGGCGCCCTCTAGAGTTGTGTAAAGAAGTGGCGTCACCTCATTATAAATAAAAGGTTG
labels shape (2000,)


In [5]:
X_train, Y_train, X_test, Y_test = preprocessing(X_0, Y0, percent=0.8)
for lamb in [2.05, 2.1, 2.15, 2.2]:
    acc_train, acc_test = testing_lambda(X_train, Y_train, X_test, Y_test, lamb=lamb)
    print("lamb = {}, acc train = {}, acc_test = {}".format(lamb, acc_train, acc_test))

lamb = 2.05, acc train = 1.0, acc_test = 0.68
lamb = 2.1, acc train = 1.0, acc_test = 0.68
lamb = 2.15, acc train = 1.0, acc_test = 0.68
lamb = 2.2, acc train = 1.0, acc_test = 0.68


In [6]:
X_train, Y_train, X_test, Y_test = preprocessing(X_1, Y1, percent=0.8)
for lamb in [2., 2.05, 2.15, 2.2]:
    acc_train, acc_test = testing_lambda(X_train, Y_train, X_test, Y_test, lamb=lamb)
    print("lamb = {}, acc train = {}, acc_test = {}".format(lamb, acc_train, acc_test))

lamb = 2.0, acc train = 1.0, acc_test = 0.775
lamb = 2.05, acc train = 1.0, acc_test = 0.775
lamb = 2.15, acc train = 1.0, acc_test = 0.775
lamb = 2.2, acc train = 1.0, acc_test = 0.775


In [7]:
X_train, Y_train, X_test, Y_test = preprocessing(X_2, Y2, percent=0.7)
for lamb in [2.05, 2.1, 2.15, 2.2]:
    acc_train, acc_test = testing_lambda(X_train, Y_train, X_test, Y_test, lamb=lamb)
    print("lamb = {}, acc train = {}, acc_test = {}".format(lamb, acc_train, acc_test))

lamb = 2.05, acc train = 1.0, acc_test = 0.5916666666666667
lamb = 2.1, acc train = 1.0, acc_test = 0.5916666666666667
lamb = 2.15, acc train = 1.0, acc_test = 0.5916666666666667
lamb = 2.2, acc train = 1.0, acc_test = 0.5916666666666667


## TESTING


In [27]:
def test(X_train, Y_train, X_test, lamb=1.):
    # X_train, Y_train, X_test, Y_test = preprocessing(X, Y, percent=0.8)
    X_train_preprocess = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), axis=1)
    X_test_preprocess = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), axis=1)
    K = X_train_preprocess.dot(X_train_preprocess.T)
    K_test = X_test_preprocess.dot(X_train_preprocess.T)
    w = solve_svm(K, Y_train, lamb=lamb, kktreg = 1e-9)
    n = K.shape[0]
    Y_predicted = np.dot(K_test, w[:n]) > 0.
    Y_predicted = Y_predicted + 0.0
    # result = ((Y_test+1.)/ 2. == np.transpose(Y_predicted))
    Y_predicted_train = np.dot(K, w[:n]) > 0.
    result_train = ((Y_train+1)/ 2 == np.transpose(Y_predicted_train))
    if np.alltrue(Y_predicted):
        print("Toute les valeurs sont TRUE")
    if np.alltrue(Y_predicted==False):
        print("Toute les valeurs sont FALSE")
    print("lambda = {}".format(lamb))
    print("train : {}".format(np.mean(result_train)))
    return Y_predicted

In [22]:
X_test0 = to_k_fold(X_valid0, fold=128)
X_test1 = to_k_fold(X_valid1, fold=128)
X_test2 = to_k_fold(X_valid2, fold=128)

100%|██████████| 1000/1000 [00:33<00:00, 29.52it/s]
100%|██████████| 1000/1000 [00:31<00:00, 32.20it/s]
100%|██████████| 1000/1000 [00:31<00:00, 31.45it/s]


In [28]:
Y0_t = (Y0 - 0.5) *2
w, bias = fit(X_0, Y0_t, lamb=1.)
Y_pred0 = predict(w, bias, X_test0)

     pcost       dcost       gap    pres   dres
 0:  9.2621e+00  1.2864e+01  5e+03  2e+00  3e+06
 1:  7.2937e+00 -2.8847e+02  3e+02  1e-01  2e+05
 2:  2.5566e+00 -5.0199e+01  5e+01  2e-02  3e+04
 3:  1.6728e+00 -5.7320e+00  7e+00  2e-03  3e+03
 4:  1.3318e+00  8.2074e-02  1e+00  2e-04  3e+02
 5:  6.3867e-01  5.2630e-01  1e-01  4e-06  8e+00
 6:  5.9189e-01  5.6471e-01  3e-02  9e-07  2e+00
 7:  5.8054e-01  5.7490e-01  6e-03  1e-07  2e-01
 8:  5.7782e-01  5.7717e-01  7e-04  9e-09  2e-02
 9:  5.7752e-01  5.7743e-01  9e-05  1e-09  2e-03
10:  5.7747e-01  5.7747e-01  4e-06  7e-10  7e-05
11:  5.7747e-01  5.7747e-01  8e-08  7e-10  1e-06
12:  5.7747e-01  5.7747e-01  2e-09  7e-10  2e-08
Optimal solution found.
lambda = 1.5
train : 0.833


In [29]:
Y1_t = (Y1 - 0.5) *2
w, bias = fit(X_1, Y1_t, lamb=1.)
Y_pred1 = predict(w, bias, X_test1)

     pcost       dcost       gap    pres   dres
 0:  6.9083e+00  1.0034e+01  5e+03  2e+00  2e+07
 1:  5.5102e+00 -2.2544e+02  2e+02  8e-02  1e+06
 2:  1.8893e+00 -2.8892e+01  3e+01  9e-03  1e+05
 3:  1.4867e+00 -3.5336e+00  5e+00  1e-03  2e+04
 4:  1.1001e+00 -8.0780e-02  1e+00  2e-04  2e+03
 5:  4.9247e-01  3.3941e-01  2e-01  1e-05  2e+02
 6:  4.2388e-01  3.8429e-01  4e-02  3e-06  4e+01
 7:  4.0657e-01  3.9766e-01  9e-03  5e-07  7e+00
 8:  4.0255e-01  4.0082e-01  2e-03  8e-08  1e+00
 9:  4.0166e-01  4.0152e-01  1e-04  5e-09  8e-02
10:  4.0159e-01  4.0158e-01  7e-06  7e-10  3e-03
11:  4.0159e-01  4.0159e-01  3e-07  8e-10  1e-04
12:  4.0159e-01  4.0159e-01  8e-09  8e-10  2e-06
13:  4.0159e-01  4.0159e-01  3e-10  8e-10  2e-08
Optimal solution found.
lambda = 2.0
train : 0.9205


In [30]:
Y2_t = (Y2 - 0.5) *2
w, bias = fit(X_2, Y2_t, lamb=1.)
Y_pred2 = predict(w, bias, X_test2)

     pcost       dcost       gap    pres   dres
 0:  1.5420e+01  2.1448e+01  5e+03  2e+00  6e+06
 1:  1.0812e+01 -3.7609e+02  4e+02  1e-01  5e+05
 2:  2.9521e+00 -5.2604e+01  6e+01  2e-02  6e+04
 3:  1.8370e+00 -6.9842e+00  9e+00  2e-03  8e+03
 4:  1.5149e+00  1.9806e-01  1e+00  1e-04  5e+02
 5:  8.2708e-01  6.2501e-01  2e-01  2e-05  7e+01
 6:  7.2171e-01  6.8425e-01  4e-02  3e-06  1e+01
 7:  7.0418e-01  6.9742e-01  7e-03  3e-07  1e+00
 8:  7.0081e-01  7.0005e-01  8e-04  3e-08  1e-01
 9:  7.0044e-01  7.0035e-01  9e-05  3e-09  1e-02
10:  7.0039e-01  7.0039e-01  6e-06  7e-10  7e-04
11:  7.0039e-01  7.0039e-01  2e-07  8e-10  2e-05
12:  7.0039e-01  7.0039e-01  5e-09  8e-10  3e-07
13:  7.0039e-01  7.0039e-01  2e-10  8e-10  3e-09
Optimal solution found.
lambda = 1.8
train : 0.769


In [33]:
test0 = np.transpose(Y_pred0)[:][0]
test1 = np.transpose(Y_pred1)[:][0]
test2 = np.transpose(Y_pred2)[:][0]

bound = np.concatenate((test0,test1,test2), axis=0).reshape((-1)).astype(int)
final = pd.DataFrame(np.arange(3000), columns=['Id'])
final['Bound'] = bound
final.to_csv('resultk_fold.csv', index= None)