In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import cm
from numpy import linalg as LA
from tqdm import tqdm

from sklearn import svm

from kernel_functions import gram_phi, count_kuplet_k, count_kuplet_3
from preprocessing import preprocessing

In [3]:
# load all data as the numpy array type
#X = pd.read_csv('data/Xtr1_mat50.csv', sep=' ', header=None).values
X_raw0 = pd.read_csv('data/Xtr0.csv', sep= ' ', header = None).values.reshape((-1))
X_raw1 = pd.read_csv('data/Xtr1.csv', sep=' ', header=None).values.reshape((-1))
X_raw2 = pd.read_csv('data/Xtr2.csv', sep=' ', header=None).values.reshape((-1))

# transform to an array of string
X_valid0 = pd.read_csv('data/Xte0.csv', sep=' ', header=None).values.reshape((-1))
X_valid1 = pd.read_csv('data/Xte1.csv', sep=' ', header=None).values.reshape((-1))
X_valid2 = pd.read_csv('data/Xte1.csv', sep=' ', header=None).values.reshape((-1))


Y0 = pd.read_csv('data/Ytr0.csv', sep=',', header=0)['Bound'].values
Y1 = pd.read_csv('data/Ytr1.csv', sep=',', header=0)['Bound'].values
Y2 = pd.read_csv('data/Ytr2.csv', sep=',', header=0)['Bound'].values

#print('numerical features shape', X.shape)
#print('numerical features first row', X[0])
print('sequences shape: ', X_raw0.shape)
print('sequence first row: ', X_raw0[0])
print('labels shape', Y0.shape)

sequences shape:  (2000,)
sequence first row:  TCCTCAACTTTTATTGGGCCGCTGTGGCACCAGAATCTACGAATGGCGCCCTCTAGAGTTGTGTAAAGAAGTGGCGTCACCTCATTATAAATAAAAGGTTG
labels shape (2000,)


In [4]:
from kernel import *

def solve_svm_kernel(X_train, X_test, Y_train, Y_test, kernel='k_gram_gaussian', k=3, lamb=0.1, gamma=0.1, kktreg=1e-9):
    """
    kernel in [k_gram, k_gram_gaussian]
    """
    if kernel in ['k_gram', 'k_gram_gaussian']:
        if k == 3:
            X_train_process = np.array([count_kuplet_3(x) for x in X_train])
            X_test_process = np.array([count_kuplet_3(x) for x in X_test])
        else:
            X_train_process = np.array([count_kuplet_k(x,k=k) for x in X_train])
            X_test_process = np.array([count_kuplet_k(x,k=k) for x in X_test])
        
        # Adding 1 for the sake of the bias
        X_train_process = np.concatenate((X_train_process, np.ones((X_train_process.shape[0], 1))), axis=1)
        X_test_process = np.concatenate((X_test_process, np.ones((X_test_process.shape[0], 1))), axis=1)
        
    if kernel=="k_gram":
        # Computing the Gram-Matrix
        K = X_train_process.dot(X_train_process.T)
        w = solve_svm(K, Y_train, lamb=lamb, kktreg=kktreg)
        K_test = np.dot(X_test_process, np.transpose(X_train_process))
    
    if kernel=="k_gram_gaussian":
        # Computing the Gram-Matrix
        K = np.array([LA.norm(X_train_process - y, axis=1) for y in X_train_process])
        K = np.exp(-K/gamma)
        # K = X_train_process.dot(X_train_process.T)
        w = solve_svm(K, Y_train, lamb=lamb)
        K_test = np.array([LA.norm(X_train_process - y, axis=1) for y in X_test_process])
        K_test = np.exp(-K_test/gamma)
    
    n = K.shape[0]
    Y_predicted = np.dot(K_test, w[:n]) > 0.
    result = ((Y_test+1.)/ 2. == np.transpose(Y_predicted))
    Y_predicted_train = np.dot(K, w[:n]) > 0.
    result_train = ((Y_train+1)/ 2 == np.transpose(Y_predicted_train))
    if np.alltrue(Y_predicted):
        print("Toute les valeurs sont TRUE")
    if np.alltrue(Y_predicted==False):
        print("Toute les valeurs sont FALSE")
    return np.mean(result), np.mean(result_train)


In [7]:
X_train, Y_train, X_test, Y_test = preprocessing(X_raw1, Y1)
print('train shape', X_train.shape)
print('test shape', X_test.shape)

train shape (1600,)
test shape (400,)


In [8]:
X1 = X_train[:]
X2 = X_train[:]
k = min(len(X1[0]), len(X2[0]))

B = np.zeros((k, len(X1), len(X2)))



In [15]:
# With Cross validation

X_train0, Y_train0, X_test0, Y_test0 = preprocessing(X_raw0, Y0)
X_train1, Y_train1, X_test1, Y_test1 = preprocessing(X_raw1, Y1)
X_train2, Y_train2, X_test2, Y_test2 = preprocessing(X_raw2, Y2)
print('train shape', X_train0.shape)
print('test shape', X_test0.shape)

print('train shape', X_train1.shape)
print('test shape', X_test1.shape)

print('train shape', X_train2.shape)
print('test shape', X_test2.shape)


train shape (1600,)
test shape (400,)
train shape (1600,)
test shape (400,)
train shape (1600,)
test shape (400,)


In [32]:
acc_test0, acc_train0 = solve_svm_kernel(X_train0, 
                                       X_test0, 
                                       Y_train0, 
                                       Y_test0, 
                                       kernel='k_gram_gaussian', 
                                       k=3, lamb=0.001, gamma=5)

print('accuracy for train : {}'.format(acc_train0))
print('accuracy for test : {}'.format(acc_test0))

KeyboardInterrupt: 

In [9]:
acc_test1, acc_train1 = solve_svm_kernel(X_train1, 
                                       X_test1, 
                                       Y_train1, 
                                       Y_test1, 
                                       kernel='k_gram', 
                                       k=3, lamb=0.01, gamma=1.)

print('accuracy for train : {}'.format(acc_train1))
print('accuracy for test : {}'.format(acc_test1))

     pcost       dcost       gap    pres   dres
 0:  3.3586e-01  5.1306e+02  1e+04  3e+00  1e+06
 1:  2.4435e+00 -1.5721e+02  2e+02  3e-02  2e+04
 2:  2.3893e+00 -6.4279e+00  9e+00  2e-03  7e+02
 3:  1.7351e+00 -2.8486e-01  2e+00  2e-04  9e+01
 4:  6.6843e-01  4.2466e-01  2e-01  1e-05  5e+00
 5:  5.6983e-01  4.9009e-01  8e-02  3e-06  1e+00
 6:  5.4689e-01  5.0801e-01  4e-02  1e-06  6e-01
 7:  5.3331e-01  5.1829e-01  2e-02  3e-07  2e-01
 8:  5.2768e-01  5.2256e-01  5e-03  9e-08  4e-02
 9:  5.2586e-01  5.2402e-01  2e-03  3e-08  1e-02
10:  5.2510e-01  5.2461e-01  5e-04  1e-09  7e-04
11:  5.2492e-01  5.2477e-01  2e-04  9e-10  1e-04
12:  5.2485e-01  5.2483e-01  1e-05  1e-09  9e-06
13:  5.2484e-01  5.2484e-01  3e-07  1e-09  2e-07
14:  5.2484e-01  5.2484e-01  6e-09  1e-09  3e-09
Optimal solution found.
accuracy for train : 0.7875
accuracy for test : 0.74


NameError: name 'Y_train1' is not defined

In [26]:
acc_test2, acc_train2 = solve_svm_kernel(X_train2, 
                                       X_test2, 
                                       Y_train2, 
                                       Y_test2, 
                                       kernel='k_gram', 
                                       k=3, lamb=0.00001, gamma=1.)

print('accuracy for train : {}'.format(acc_train2))
print('accuracy for test : {}'.format(acc_test2))

     pcost       dcost       gap    pres   dres
 0:  4.3065e-01  6.9207e+02  1e+04  3e+00  2e+05
 1:  2.5330e+00 -1.3803e+02  1e+02  3e-02  2e+03
 2:  2.4922e+00 -3.1660e+00  6e+00  8e-04  5e+01
 3:  1.5364e+00  4.6532e-01  1e+00  6e-05  4e+00
 4:  8.3139e-01  6.8039e-01  2e-01  8e-06  5e-01
 5:  7.7428e-01  7.3890e-01  4e-02  1e-06  9e-02
 6:  7.6480e-01  7.4822e-01  2e-02  5e-07  3e-02
 7:  7.6004e-01  7.5252e-01  8e-03  2e-07  1e-02
 8:  7.5767e-01  7.5463e-01  3e-03  6e-08  4e-03
 9:  7.5671e-01  7.5549e-01  1e-03  2e-08  1e-03
10:  7.5635e-01  7.5581e-01  5e-04  6e-09  4e-04
11:  7.5614e-01  7.5599e-01  1e-04  1e-09  8e-05
12:  7.5610e-01  7.5604e-01  6e-05  8e-10  3e-05
13:  7.5607e-01  7.5606e-01  1e-05  1e-09  3e-06
14:  7.5607e-01  7.5606e-01  2e-06  1e-09  7e-07
15:  7.5607e-01  7.5607e-01  4e-08  1e-09  1e-08
Optimal solution found.
accuracy for train : 0.663125
accuracy for test : 0.6425


## Best parameters found for k_gram kernel
* Set0: lambda=1e-3, gamma = 5 (gaussian)
* Set1: lambda=0.01
* Set2: lambda= 1e-5

In [17]:
def solve_svm_test(X_train, X_test, Y_train, kernel='k_gram_gaussian', k=3, lamb=0.1, gamma=0.1, kktreg=1e-9):
    """
    kernel in [k_gram, k_gram_gaussian]
    """
    Y_train_process = (Y_train-0.5) * 2
    if kernel in ['k_gram', 'k_gram_gaussian']:
        if k == 3:
            X_train_process = np.array([count_kuplet_3(x) for x in X_train])
            X_test_process = np.array([count_kuplet_3(x) for x in X_test])
        else:
            X_train_process = np.array([count_kuplet_k(x,k=k) for x in X_train])
            X_test_process = np.array([count_kuplet_k(x,k=k) for x in X_test])
        
        # Adding 1 for the sake of the bias
        X_train_process = np.concatenate((X_train_process, np.ones((X_train_process.shape[0], 1))), axis=1)
        X_test_process = np.concatenate((X_test_process, np.ones((X_test_process.shape[0], 1))), axis=1)
        
        
    if kernel=="k_gram":
        # Computing the Gram-Matrix
        K = X_train_process.dot(X_train_process.T)
        w = solve_svm(K, Y_train_process, lamb=lamb, kktreg=kktreg)
        K_test = np.dot(X_test_process, np.transpose(X_train_process))
    
    if kernel=="k_gram_gaussian":
        # Computing the Gram-Matrix
        K = np.array([LA.norm(X_train_process - y, axis=1) for y in X_train_process])
        K = np.exp(-K/gamma)
        # K = X_train_process.dot(X_train_process.T)
        w = solve_svm(K, Y_train_process, lamb=lamb)
        K_test = np.array([LA.norm(X_train_process - y, axis=1) for y in X_test_process])
        K_test = np.exp(-K_test/gamma)
    
    n = K.shape[0]
    Y_predicted = np.dot(K_test, w[:n]) > 0.
    Y_predicted = (Y_predicted + 0.)
    Y_predicted_train = np.dot(K, w[:n]) > 0.
    result_train = (Y_train == np.transpose(Y_predicted_train))
    print('accuracy on train : {}'.format(np.mean(result_train)))
    # result = ((Y_test+1)/ 2 == np.transpose(Y_predicted))
    
    return np.transpose(Y_predicted), np.transpose(Y_predicted_train)


In [18]:
print('train shape : {}'.format(X_raw0.shape))
print('train shape 2 : {}'.format(Y0.shape))
print('test shape : {}'.format(X_valid0.shape))

Y_predict0, Y_predicted_train0 = solve_svm_test(X_raw0, 
                           X_valid0, 
                           Y0, 
                           kernel='k_gram_gaussian', 
                           k=3, lamb=.001, gamma=5.)

train shape : (2000,)
train shape 2 : (2000,)
test shape : (1000,)
     pcost       dcost       gap    pres   dres
 0:  1.9546e+00  2.9746e+00  4e+03  1e+00  2e+02
 1:  2.9161e+00 -5.7463e+01  6e+01  2e-02  2e+00
 2:  2.0943e+00 -9.6812e+00  1e+01  4e-03  4e-01
 3:  1.5247e+00  2.3207e-01  1e+00  2e-04  2e-02
 4:  8.5340e-01  7.5174e-01  1e-01  6e-07  7e-05
 5:  8.0636e-01  7.8541e-01  2e-02  1e-07  1e-05
 6:  7.9629e-01  7.9397e-01  2e-03  7e-09  8e-07
 7:  7.9516e-01  7.9504e-01  1e-04  6e-10  3e-08
 8:  7.9510e-01  7.9510e-01  3e-06  7e-10  6e-10
 9:  7.9510e-01  7.9510e-01  6e-08  7e-10  1e-11
Optimal solution found.
accuracy on train : 0.861


In [13]:
print(Y0)

[1 0 1 ..., 1 0 0]


In [None]:
print(Y_predicted_train0)

In [34]:
print('train shape : {}'.format(X_raw1.shape))
print('train shape 2 : {}'.format(Y1.shape))
print('test shape : {}'.format(X_valid1.shape))

Y_predict1 = solve_svm_test(X_raw1, 
                           X_valid1, 
                           Y1, 
                           kernel='k_gram', 
                           k=3, lamb=.001, gamma=3.)

train shape : (2000,)
train shape 2 : (2000,)
test shape : (1000,)
     pcost       dcost       gap    pres   dres
 0:  2.5061e-01  5.0175e+02  8e+03  2e+00  1e+07
 1:  1.7349e+00 -8.5085e+01  9e+01  2e-02  1e+05
 2:  1.6842e+00 -8.6288e-01  3e+00  3e-04  2e+03
 3:  6.6725e-01  4.3023e-01  2e-01  6e-06  3e+01
 4:  5.0455e-01  4.9724e-01  7e-03  2e-07  9e-01
 5:  5.0007e-01  4.9997e-01  1e-04  1e-09  9e-03
 6:  5.0001e-01  5.0001e-01  4e-06  8e-10  9e-05
 7:  5.0001e-01  5.0001e-01  2e-06  4e-10  4e-05
 8:  5.0001e-01  5.0001e-01  1e-06  3e-10  2e-05
 9:  5.0001e-01  5.0001e-01  1e-06  2e-10  1e-05
10:  5.0001e-01  5.0001e-01  6e-07  4e-10  4e-06
11:  5.0001e-01  5.0001e-01  2e-07  4e-10  5e-07
12:  5.0001e-01  5.0001e-01  6e-08  6e-10  1e-07
13:  5.0001e-01  5.0001e-01  7e-09  7e-10  1e-08
Optimal solution found.
accuracy on train : 0.5


In [37]:
print('train shape : {}'.format(X_raw2.shape))
print('train shape 2 : {}'.format(Y2.shape))
print('test shape : {}'.format(X_valid2.shape))

Y_predict2 = solve_svm_test(X_raw2, 
                           X_valid2, 
                           Y2, 
                           kernel='k_gram', 
                           k=3, lamb=.00001, gamma=3.)

train shape : (2000,)
train shape 2 : (2000,)
test shape : (1000,)
     pcost       dcost       gap    pres   dres
 0:  2.4964e-01  5.0175e+02  8e+03  2e+00  1e+07
 1:  1.7342e+00 -8.3672e+01  9e+01  2e-02  1e+05
 2:  1.6827e+00 -4.3572e-01  2e+00  2e-04  1e+03
 3:  6.0035e-01  4.3850e-01  2e-01  6e-06  3e+01
 4:  5.0132e-01  4.9918e-01  2e-03  8e-08  4e-01
 5:  5.0005e-01  4.9997e-01  8e-05  2e-09  1e-02
 6:  5.0000e-01  5.0000e-01  1e-06  1e-09  1e-04
 7:  5.0000e-01  5.0000e-01  5e-08  8e-10  1e-06
 8:  5.0000e-01  5.0000e-01  2e-08  5e-10  5e-07
 9:  5.0000e-01  5.0000e-01  2e-08  3e-10  2e-07
10:  5.0000e-01  5.0000e-01  1e-08  1e-10  7e-08
Optimal solution found.
accuracy on train : 0.5


In [42]:
test0 = Y_predict0[:]
test1 = Y_predict1[:]
test2 = Y_predict2[:]

bound = np.concatenate((test0,test1,test2), axis=1).reshape(-1).astype(int)
final = pd.DataFrame(np.arange(3000), columns=['Id'])
final['Bound'] = bound
final.to_csv('result.csv', index= None)

In [43]:
final

Unnamed: 0,Id,Bound
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1
