In [9]:
from svmutil import *
import numpy as np
import random
from sklearn import metrics

In [10]:
def file_read(data_dir):
    f = open(data_dir, 'r')
    lines = f.readlines()
    feature = tuple(lines[0].split('\t')[1:])
    samples = []
    data = np.zeros([100,len(feature)])
    for i, line in enumerate(lines):
        if i != 0:
            samples.append(line.split('\t')[0])
            data[i-1,:] = line.split('\t')[1:]
    f.close()
    return data, feature, samples

In [56]:
rna_data, rna_feature, samples = file_read('../../data/ADD_cor_and_rna_0.15.site.ham21.ham17.ssi')
beta_data, beta_feature, _ = file_read('../../data/ADD_cor_and_beta_0.35.site.ham21.ham17.ssi')

In [57]:
data = np.column_stack((rna_data,beta_data))
feature = rna_feature + beta_feature

In [58]:
label = np.zeros(100)
for i,sample in enumerate(samples):
    if 'sdpc' in sample:
        label[i] = 0
    else:
        label[i] = 1
print("size of label:", len(label))
print("# of MDD:", len(label) - len(np.nonzero(label)[0]))
print("# of normal sample:",len(np.nonzero(label)[0]))

('size of label:', 100)
('# of MDD:', 44)
('# of normal sample:', 56)


In [59]:
idx = np.arange(100)
np.random.shuffle(idx)
#numpy to list
x = data[idx].tolist()
y = label[idx].tolist()

In [60]:
def cross_validation(x,y,portion):
    t1 = 0.2*portion
    t2 = 0.2*(portion+1)
    train_x = x[:int(len(x)*t1)] + x[int(len(x)*(t2)):]
    train_y = y[:int(len(x)*t1)] + y[int(len(x)*(t2)):]
    test_x = x[int(len(x)*t1):int(len(x)*(t2))]
    test_y = y[int(len(x)*t1):int(len(x)*(t2))]
    return train_x, train_y, test_x, test_y

In [61]:
acc = []
auc = []
for portion in range(int(1/0.2)):
    train_x, train_y, test_x, test_y = cross_validation(x,y,portion)
    print("# of train:", len(train_x))
    print("# of test:", len(test_x))
    print(portion*20,"~", (portion+1)*20, "%")
    m = svm_train(train_y, train_x, '-s 1 -t 2 -c 4')
    p_label, p_acc, p_val = svm_predict(test_y, test_x, m)
    acc.append(p_acc[0])
    fpr, tpr, thresholds = metrics.roc_curve(test_y, p_label, pos_label=1)
    auc.append(metrics.auc(fpr, tpr))
    print("\n")


('# of train:', 80)
('# of test:', 20)
(0, '~', 20, '%')
Accuracy = 90% (18/20) (classification)


('# of train:', 80)
('# of test:', 20)
(20, '~', 40, '%')
Accuracy = 70% (14/20) (classification)


('# of train:', 80)
('# of test:', 20)
(40, '~', 60, '%')
Accuracy = 75% (15/20) (classification)


('# of train:', 80)
('# of test:', 20)
(60, '~', 80, '%')
Accuracy = 95% (19/20) (classification)


('# of train:', 80)
('# of test:', 20)
(80, '~', 100, '%')
Accuracy = 85% (17/20) (classification)




In [62]:
print(sum(acc)/len(acc))
print(sum(auc)/len(auc))

83.0
0.828278388278


# Analysis of SVM

In [7]:
portion = 0
btrain_x, btrain_y, btest_x, btest_y = cross_validation(x,y,portion)
print("# of train:", len(train_x))
print("# of test:", len(test_x))
print(portion*20,"~", (portion+1)*20)

('# of train:', 80)
('# of test:', 20)
(0, '~', 20)


In [8]:
m = svm_train(btrain_y, btrain_x, '-s 1 -t 0 -c 4')
p_label, p_acc, p_val = svm_predict(btest_y, btest_x, m)

Accuracy = 90% (18/20) (classification)


In [99]:
fpr, tpr, thresholds = metrics.roc_curve(btest_y, p_label, pos_label=1)
auc = metrics.auc(fpr, tpr)
print auc

0.85


In [102]:
np.argwhere(np.array(p_label)!=np.array(best_y))[0][0]
error_idx = np.argwhere(np.array(p_label)!=np.array(best_y))
print error_idx

[0]


In [128]:
error_samples = []
for idx in error_idx:
    error_samples.append([btest_y[int(idx[0])]] + btest_x[int(idx[0])])

# File Save

In [137]:
g = open('result_svm/error_samples.txt','w')
g.write('option: -s 1 -t 0 -c 4 nu-SVC linear SVM ')
g.write(str(20*portion) + ' ' + str(20*(portion+1))+'\n')
g.write('label\t')
for fe in feature:
    g.write(fe)
    g.write('\t')

In [138]:
for sample in error_samples:
    for x in sample:
        g.write(str(x))
        g.write('\t')
    g.write('\n')
g.close()

In [140]:
len(error_samples[1])

87