In [65]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import cm
from numpy import linalg as LA
from tqdm import tqdm

from sklearn import svm

from kernel_functions import gram_phi, count_kuplet_k, count_kuplet_3
from preprocessing import preprocessing

In [66]:
# load all data as the numpy array type
X = pd.read_csv('data/Xtr0_mat50.csv', sep=' ', header=None).values
X_raw = pd.read_csv('data/Xtr0.csv', sep=' ', header=None).values
# transform to an array of string
X_raw = X_raw.reshape((-1))
Y = pd.read_csv('data/Ytr0.csv', sep=',', header=0)['Bound'].values

print('numerical features shape', X.shape)
print('numerical features first row', X[0])
print('sequences shape: ', X_raw.shape)
print('sequence first row: ', X_raw[0])
print('labels shape', Y.shape)

numerical features shape (2000, 50)
numerical features first row [ 0.          0.04347826  0.          0.04347826  0.02173913  0.02173913
  0.          0.02173913  0.01086957  0.0326087   0.02173913  0.02173913
  0.04347826  0.02173913  0.04347826  0.01086957  0.02173913  0.02173913
  0.02173913  0.02173913  0.          0.          0.01086957  0.
  0.01086957  0.01086957  0.01086957  0.0326087   0.01086957  0.
  0.02173913  0.06521739  0.02173913  0.01086957  0.04347826  0.
  0.01086957  0.          0.01086957  0.0326087   0.02173913  0.0326087
  0.02173913  0.02173913  0.04347826  0.05434783  0.01086957  0.02173913
  0.01086957  0.01086957]
sequences shape:  (2000,)
sequence first row:  TCCTCAACTTTTATTGGGCCGCTGTGGCACCAGAATCTACGAATGGCGCCCTCTAGAGTTGTGTAAAGAAGTGGCGTCACCTCATTATAAATAAAAGGTTG
labels shape (2000,)


In [67]:
from kernel import *

def solve_svm_kernel(X_train, X_test, Y_train, Y_test, kernel='k_gram_gaussian', k=3, lamb=0.1, gamma=0.1, kktreg=1e-9):
    """
    kernel in [k_gram, k_gram_gaussian]
    """
    if kernel in ['k_gram', 'k_gram_gaussian']:
        if k == 3:
            X_train_process = np.array([count_kuplet_3(x) for x in X_train])
            X_test_process = np.array([count_kuplet_3(x) for x in X_test])
        else:
            X_train_process = np.array([count_kuplet_k(x,k=k) for x in X_train])
            X_test_process = np.array([count_kuplet_k(x,k=k) for x in X_test])
        
        # Adding 1 for the sake of the bias
        X_train_process = np.concatenate((X_train_process, np.ones((X_train_process.shape[0], 1))), axis=1)
        X_test_process = np.concatenate((X_test_process, np.ones((X_test_process.shape[0], 1))), axis=1)
        
    if kernel=="k_gram":
        # Computing the Gram-Matrix
        K = X_train_process.dot(X_train_process.T)
        w = solve_svm(K, Y_train, lamb=lamb, kktreg=kktreg)
        K_test = np.dot(X_test_process, np.transpose(X_train_process))
    
    if kernel=="k_gram_gaussian":
        # Computing the Gram-Matrix
        K = np.array([LA.norm(X_train_process - y, axis=1) for y in X_train_process])
        K = np.exp(-K/gamma)
        # K = X_train_process.dot(X_train_process.T)
        w = solve_svm(K, Y_train, lamb=lamb)
        K_test = np.array([LA.norm(X_train_process - y, axis=1) for y in X_test_process])
        K_test = np.exp(-K_test/gamma)
    
    n = K.shape[0]
    Y_predicted = np.dot(K_test, w[:n]) > 0.
    result = ((Y_test+1.)/ 2. == np.transpose(Y_predicted))
    Y_predicted_train = np.dot(K, w[:n]) > 0.
    result_train = ((Y_train+1)/ 2 == np.transpose(Y_predicted_train))
    if np.alltrue(Y_predicted):
        print("Toute les valeurs sont TRUE")
    if np.alltrue(Y_predicted==False):
        print("Toute les valeurs sont FALSE")
    return np.mean(result), np.mean(result_train)

    

In [None]:
# With Cross validation

X_train, Y_train, X_test, Y_test = preprocessing(X_raw, Y)
print('train shape', X_train.shape)
print('test shape', X_test.shape)

acc_test, acc_train = solve_svm_kernel(X_train, 
                                       X_test, 
                                       Y_train, 
                                       Y_test, 
                                       kernel='k_gram', 
                                       k=3, lamb=.000005, gamma=3.)

print('accuracy for train : {}'.format(acc_train))
print('accuracy for test : {}'.format(acc_test))



train shape (1600,)
test shape (400,)
     pcost       dcost       gap    pres   dres
 0:  3.9190e-01  6.3001e+02  1e+04  3e+00  2e+05
 1:  2.3523e+00 -1.1935e+02  1e+02  3e-02  2e+03
 2:  2.3054e+00 -2.4594e+00  5e+00  7e-04  5e+01
 3:  1.3713e+00  3.9868e-01  1e+00  6e-05  5e+00
 4:  7.5952e-01  6.0768e-01  2e-01  1e-05  7e-01
 5:  7.0354e-01  6.5582e-01  5e-02  3e-06  2e-01
 6:  6.8799e-01  6.6861e-01  2e-02  9e-07  6e-02
 7:  6.8178e-01  6.7381e-01  8e-03  2e-07  2e-02
 8:  6.7874e-01  6.7633e-01  2e-03  5e-08  4e-03
 9:  6.7799e-01  6.7696e-01  1e-03  2e-08  1e-03
10:  6.7761e-01  6.7728e-01  3e-04  4e-09  3e-04
11:  6.7749e-01  6.7739e-01  1e-04  1e-09  8e-05
12:  6.7747e-01  6.7740e-01  7e-05  6e-10  3e-05


In [63]:


def solve_svm_test(X_train, X_test, Y_train, kernel='k_gram_gaussian', k=3, lamb=0.1, gamma=0.1, kktreg=1e-9):
    """
    kernel in [k_gram, k_gram_gaussian]
    """
    if kernel in ['k_gram', 'k_gram_gaussian']:
        if k == 3:
            X_train_process = np.array([count_kuplet_3(x) for x in X_train])
            X_test_process = np.array([count_kuplet_3(x) for x in X_test])
        else:
            X_train_process = np.array([count_kuplet_k(x,k=k) for x in X_train])
            X_test_process = np.array([count_kuplet_k(x,k=k) for x in X_test])
        
        # Adding 1 for the sake of the bias
        X_train_process = np.concatenate((X_train_process, np.ones((X_train_process.shape[0], 1))), axis=1)
        X_test_process = np.concatenate((X_test_process, np.ones((X_test_process.shape[0], 1))), axis=1)
        
    if kernel=="k_gram":
        # Computing the Gram-Matrix
        K = X_train_process.dot(X_train_process.T)
        w = solve_svm(K, Y_train, lamb=lamb, kktreg=kktreg)
        K_test = np.dot(X_test_process, np.transpose(X_train_process))
    
    if kernel=="k_gram_gaussian":
        # Computing the Gram-Matrix
        K = np.array([LA.norm(X_train_process - y, axis=1) for y in X_train_process])
        K = np.exp(-K/gamma)
        # K = X_train_process.dot(X_train_process.T)
        w = solve_svm(K, Y_train, lamb=lamb)
        K_test = np.array([LA.norm(X_train_process - y, axis=1) for y in X_test_process])
        K_test = np.exp(-K_test/gamma)
    
    n = K.shape[0]
    Y_predicted = np.dot(K_test, w[:n]) > 0.
    Y_predicted = (Y_predicted + 0.)
    Y_predicted_train = np.dot(K, w[:n]) > 0.
    result_train = ((Y_train+1.)/ 2. == np.transpose(Y_predicted_train))
    print('accuracy on train : {}'.format(np.mean(result_train)))
    # result = ((Y_test+1)/ 2 == np.transpose(Y_predicted))
    
    return np.transpose(Y_predicted)


In [64]:
X_raw = pd.read_csv('data/Xtr0.csv', sep=' ', header=None).values.reshape((-1))
Y = pd.read_csv('data/Ytr0.csv', sep=',', header=0)['Bound'].values
Y_train = (Y[:] - 0.5) * 2
X_test = pd.read_csv('data/Xte0.csv', sep=' ', header=None).values.reshape((-1))

print('train shape : {}'.format(X_train.shape))
print('train shape 2 : {}'.format(Y_train.shape))
print('test shape : {}'.format(X_test.shape))

Y_predict0 = solve_svm_test(X_train, 
                           X_test, 
                           Y_train, 
                           kernel='k_gram', 
                           k=3, lamb=.001, gamma=3.)

train shape : (2000,)
train shape 2 : (2000,)
test shape : (1000,)
     pcost       dcost       gap    pres   dres
 0:  4.9121e-01  9.7050e+02  1e+04  3e+00  2e+05
 1:  2.2651e+00 -1.2016e+02  1e+02  3e-02  2e+03
 2:  2.2372e+00 -1.5330e+00  4e+00  5e-04  3e+01
 3:  1.2963e+00  7.7630e-01  5e-01  3e-06  2e-01
 4:  1.0060e+00  8.6479e-01  1e-01  7e-07  4e-02
 5:  9.4800e-01  8.9451e-01  5e-02  2e-07  1e-02
 6:  9.2826e-01  9.0596e-01  2e-02  8e-08  5e-03
 7:  9.1899e-01  9.1176e-01  7e-03  2e-08  1e-03
 8:  9.1611e-01  9.1362e-01  2e-03  5e-09  3e-04
 9:  9.1520e-01  9.1427e-01  9e-04  1e-09  9e-05
10:  9.1481e-01  9.1456e-01  3e-04  8e-10  2e-05
11:  9.1471e-01  9.1464e-01  7e-05  8e-10  2e-10
12:  9.1468e-01  9.1467e-01  3e-06  1e-09  6e-11
13:  9.1468e-01  9.1468e-01  5e-08  1e-09  2e-11
Optimal solution found.
accuracy on train : 0.557


In [36]:
X_raw = pd.read_csv('data/Xtr1.csv', sep=' ', header=None).values.reshape((-1))
Y = pd.read_csv('data/Ytr1.csv', sep=',', header=0)['Bound'].values
Y_train = (Y[:] - 0.5) * 2
X_test = pd.read_csv('data/Xte1.csv', sep=' ', header=None).values.reshape((-1))

print('train shape : {}'.format(X_train.shape))
print('train shape 2 : {}'.format(Y_train.shape))
print('test shape : {}'.format(X_test.shape))

Y_predict1 = solve_svm_test(X_train, 
                           X_test, 
                           Y_train, 
                           kernel='k_gram', 
                           k=3, lamb=.001, gamma=3.)

train shape : (2000,)
train shape 2 : (2000,)
test shape : (1000,)
     pcost       dcost       gap    pres   dres
 0:  3.2913e-01  6.5896e+02  2e+04  3e+00  2e+06
 1:  2.5184e+00 -1.9498e+02  2e+02  3e-02  2e+04
 2:  2.4821e+00 -9.1002e+00  1e+01  2e-03  8e+02
 3:  1.9370e+00 -6.8985e-01  3e+00  2e-04  1e+02
 4:  8.0824e-01  4.0574e-01  4e-01  1e-05  6e+00
 5:  5.9208e-01  4.8932e-01  1e-01  3e-06  1e+00
 6:  5.5668e-01  5.1376e-01  4e-02  1e-06  5e-01
 7:  5.4229e-01  5.2354e-01  2e-02  3e-07  2e-01
 8:  5.3606e-01  5.2789e-01  8e-03  1e-07  7e-02
 9:  5.3319e-01  5.2986e-01  3e-03  3e-08  2e-02
10:  5.3210e-01  5.3071e-01  1e-03  1e-08  6e-03
11:  5.3163e-01  5.3107e-01  6e-04  1e-09  8e-04
12:  5.3140e-01  5.3127e-01  1e-04  1e-09  2e-05
13:  5.3135e-01  5.3131e-01  4e-05  1e-09  7e-06
14:  5.3133e-01  5.3133e-01  5e-06  1e-09  5e-07
15:  5.3133e-01  5.3133e-01  3e-07  1e-09  8e-12
Optimal solution found.


In [37]:
X_raw = pd.read_csv('data/Xtr2.csv', sep=' ', header=None).values.reshape((-1))
Y = pd.read_csv('data/Ytr2.csv', sep=',', header=0)['Bound'].values
Y_train = (Y[:] - 0.5) * 2
X_test = pd.read_csv('data/Xte2.csv', sep=' ', header=None).values.reshape((-1))

print('train shape : {}'.format(X_train.shape))
print('train shape 2 : {}'.format(Y_train.shape))
print('test shape : {}'.format(X_test.shape))

Y_predict2 = solve_svm_test(X_train, 
                           X_test, 
                           Y_train, 
                           kernel='k_gram', 
                           k=3, lamb=.001, gamma=3.)

train shape : (2000,)
train shape 2 : (2000,)
test shape : (1000,)
     pcost       dcost       gap    pres   dres
 0:  4.8686e-01  9.7281e+02  1e+04  2e+00  6e+04
 1:  2.2059e+00 -1.1041e+02  1e+02  2e-02  6e+02
 2:  2.1801e+00 -9.0260e-01  3e+00  4e-04  9e+00
 3:  1.1754e+00  7.9545e-01  4e-01  2e-09  5e-08
 4:  9.8075e-01  8.7471e-01  1e-01  1e-09  2e-08
 5:  9.4197e-01  8.9847e-01  4e-02  1e-09  1e-08
 6:  9.2764e-01  9.0771e-01  2e-02  1e-09  5e-09
 7:  9.2159e-01  9.1147e-01  1e-02  8e-10  3e-09
 8:  9.1750e-01  9.1444e-01  3e-03  9e-10  1e-09
 9:  9.1625e-01  9.1541e-01  8e-04  9e-10  4e-10
10:  9.1591e-01  9.1570e-01  2e-04  1e-09  4e-10
11:  9.1581e-01  9.1579e-01  2e-05  1e-09  2e-10
12:  9.1580e-01  9.1580e-01  9e-07  1e-09  5e-11
Optimal solution found.


In [74]:
test0 = Y_predict0[:]
test1 = Y_predict1[:]
test2 = Y_predict2[:]
test = np.concatenate((test0,test1,test2), axis=1).reshape(-1).astype(int)
final = pd.DataFrame(test, columns=['bound'])
final.to_csv('result.csv')

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [55]:
test.shape

(3000,)

In [57]:
final = pd.DataFrame(test, columns=['bound'])