# Import and Prepare Data

Import Data

In [171]:
'''Get feature data from file as a matrix with a row per data instance'''
import sys
def getFeatureData(featureFile,n):
    x=[]
    dFile = open(featureFile, 'r')
    i = 0
    for line in dFile:
        row = line.split()
        rVec = [float(item) for item in row]
        x.append(rVec)
        print(str(int(i/n*100)) + "%",end = '\r')
        sys.stdout.flush()
        i += 1
    print('100%')
    dFile.close()
    return x

'''Get label data from file as a dictionary with key as data instance index
and value as the class index
'''
def getLabelData(labelFile,n):
    lFile = open(labelFile, 'r')
    lDict = {}
    i = 0
    for line in lFile:
        row = line.split()
        lDict[int(row[1])] = int(row[0])
        print(str(int(i/n*100)) + "%",end = '\r')
        sys.stdout.flush()
        i += 1
    print('100%')
    lFile.close()
    return lDict

In [172]:
print("Labeled Data: "),
x = getFeatureData('D:\\moconnor\\My Documents\\M.S. Data Science\\Fall 2018\\CS675\\Testing\\traindata',8000)
print ("Labels :"),
y_dict = getLabelData('D:\\moconnor\\My Documents\\M.S. Data Science\\Fall 2018\\CS675\\Testing\\trainingLabels.txt',8000)
print("Testing Data: "),
test_data = getFeatureData('D:\\moconnor\\My Documents\\M.S. Data Science\\Fall 2018\\CS675\\Testing\\testdata',2000)

Labeled Data: 
100%
Labels :
100%


Split into testing and training sets

In [173]:
import random
train_size = .90
rand_index = random.sample(range(len(x)),int(train_size*len(x)))
train_x = []
train_y = []
test_x  = []
test_y  = []
for i in range(len(x)):
    if i in rand_index:
        train_x.append(x[i])
        train_y.append(y_dict[i])
    else:
        test_x.append(x[i])
        test_y.append(y_dict[i])

# Feature Selection
## Calculate Signal to Noise Ratio
(Golub et al)

$snr = \left|\frac{(m_{c1}-m_{c2})}{(\sigma _{c1} + \sigma_{c2})}\right|$

In [174]:
class_0 = []
class_1 = []
for i in range(len(train_x)):
    if train_y[i] == 0:
        class_0.append(train_x[i])
    else:
        class_1.append(train_x[i])

In [175]:
mean_c0 = [sum(class_0[i][j] for i in range(len(class_0)))/len(class_0) for j in range(len(train_x[0]))]
mean_c1 = [sum(class_1[i][j] for i in range(len(class_1)))/len(class_1) for j in range(len(train_x[0]))]

In [176]:
var_c0 = [sum([(class_0[i][j]-mean_c0[j])**2 for i in range(len(class_0))])/len(class_0) for j in range(len(class_0[0]))]
std_c0 = [vi**(1/2) for vi in var_c0]
var_c1 = [sum([(class_1[i][j]-mean_c1[j])**2 for i in range(len(class_1))])/len(class_1) for j in range(len(class_1[0]))]
std_c1 = [vi**(1/2) for vi in var_c1]

In [177]:
snr = [[abs((mean_c0[j]-mean_c1[j])/(std_c0[j]+std_c1[j])),j] for j in range(len(train_x[0]))]
snr = sorted(snr, key = lambda row: row[0], reverse = True)
top_snr = [pi[1] for pi in snr[:100]]

## Calculate Mutual Information
$I_{xy}=\sum\limits_{y\in Y}\sum\limits_{x\in X} p(x,y)log(\frac{p(x,y)}{p(x)p(y)})$

(Cho et. al)

In [179]:
import math
def mi_score(u,v):
    mi = 0
    for ui in set(u):
        for vi in set(v):
            ui_vi = list(zip(u,v)).count((ui,vi))
            if ui_vi == 0:
                pass
            else:
                mi += (ui_vi/len(u))*math.log((len(u)*ui_vi)/(u.count(ui)*v.count(vi)))
    return mi

mi_array = [[j, mi_score([train_x[i][j] for i in range(len(train_x))], train_y)] for j in range(len(train_x[0]))]


In [181]:
mi_array = sorted(mi, key=lambda row: row[1], reverse=True)
top_mi = [pi[0] for pi in mi_array[:100]]

# Calculate Pearson Coefficient
$r_{xy}=\frac{\sum\limits_{i=1}^n(x_i-\bar x)(y_i-\bar y)}{\sqrt{\sum\limits_{i=1}^n(xi-\bar x)^2}\sqrt{\sum\limits_{i=1}^n(y_i-\bar y)^2}}$

In [182]:
def mean(u):
    return sum(u)/len(u)

def fn_a(u):
    return (len(u)*sum([ui**2 for ui in u])-sum(u)**2)**(1/2)

def pearson_coef(u,v, mu_v, fn_a_v):
    mu_u = mean(u)
    fn_a_u = fn_a(u)
    return (sum([u[i]*v[i] for i in range(len(u))])-len(u)*mu_u*mu_v)/(fn_a(u)*fn_a(v))

pearson_array = []
mu_y_train = mean(train_y)
fn_a_y_train = fn_a(train_y)
for j in range(len(train_x[0])):
    pearson_array.append([j, pearson_coef([train_x[i][j] for i in range(len(train_x))], train_y, mu_y_train, fn_a_y_train)])
    print(str(int(j/len(train_x)*100)) + "%",end = '\r')
    sys.stdout.flush()

[[0, -5.8242839733508065e-08], [1, 5.117483790325117e-07], [2, 3.4378318586352323e-07], [3, 1.0699963549924643e-07], [4, 4.2321948490701064e-07], [5, 8.904297458133807e-07], [6, -2.4600473408262734e-06], [7, -8.486771070639851e-07], [8, 1.6448224825847449e-06], [9, 2.3998415257232294e-06], [10, -1.1328373913681415e-06], [11, -2.6825877141731734e-06], [12, 1.5559393928232262e-06], [13, 1.8627626859855095e-06], [14, -2.0072536058148436e-06], [15, -2.8413745206880054e-06], [16, -3.641826547396104e-07], [17, -2.3833717455826077e-06], [18, 6.543072920639188e-07], [19, -9.109384801678499e-07]]


In [183]:
pearson = sorted(pearson_array, key = lambda row: row[1])
top_pearson = [pi[0] for pi in pearson[:100]]

## Choose best features
Find elements that are meaured as highly correlated by both standards

In [184]:
common = []
for pi in top_mi:
    if pi in top_snr and top_pearson:
        common.append(pi)

# Try different feature subsets

In [194]:
red_x_vec = []
red_x_test_vec = []
for n in [28,29,30,31,32]:    
    red_x = []
    red_x_test = []
    for i in range(len(train_x)):
        red_x.append([train_x[i][j] for j in common[:n]])
    red_x_vec.append(red_x)
    for i in range(len(test_x)):    
        red_x_test.append([test_x[i][j] for j in common[:n]])
    red_x_test_vec.append(red_x_test)


# Test Accuracy with Different Models
For each subset of features, output accuracy for each model

In [195]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

for (red_x, red_x_test) in zip(red_x_vec,red_x_test_vec):
    print("Number of Features: ", len(red_x[0]))
    print()
    print("Linear SVM")
    clf = svm.SVC(kernel = "linear", C=2.0)
    y_hat = clf.fit(red_x,train_y).predict(red_x_test)
    acc = sum([y_hat[i] == test_y[i] for i in range(len(red_x_test))])/len(red_x_test)
    print(acc)
    
    print("Poly SVM")
    clf = svm.SVC(kernel = "poly", degree = 2, C=2.0)
    y_hat = clf.fit(red_x,train_y).predict(red_x_test)
    acc = sum([y_hat[i] == test_y[i] for i in range(len(red_x_test))])/len(red_x_test)
    print(acc)
        
    print("RBF SVM")
    clf = svm.SVC(kernel = "rbf", C=2.0)
    y_hat = clf.fit(red_x,train_y).predict(red_x_test)
    acc = sum([y_hat[i] == test_y[i] for i in range(len(red_x_test))])/len(red_x_test)
    print(acc)

Number of Features:  28

Linear SVM
0.67375
Poly SVM
0.67375
RBF SVM
0.655
Number of Features:  29

Linear SVM
0.67375
Poly SVM
0.66125
RBF SVM
0.65625
Number of Features:  30

Linear SVM
0.675
Poly SVM
0.67
RBF SVM
0.65375
Number of Features:  31

Linear SVM
0.67625
Poly SVM
0.67
RBF SVM
0.655
Number of Features:  32

Linear SVM
0.66875
Poly SVM
0.67
RBF SVM
0.6575


SVM seems to be working the best, this is consistent with the literature

In [198]:
train_x_common = [[train_x[i][j] for j in common]for i in range(len(train_x))]
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 15).fit(matrixTranspose(train_x_common))

In [225]:
common_clustered = []
clusters = []
for c in range(len(common)):
    if kmeans.labels_[c] not in clusters:
        clusters.append(c)
        common_clustered.append(common[c])
common_clustered = common_clustered[:]
red_x_clustered = [[train_x[i][j] for j in common_clustered] for i in range(len(train_x))]
red_x_test_clustered = [[test_x[i][j] for j in common_clustered] for i in range(len(test_x))]

In [238]:
y_hats = []
print("Linear SVM")
clf = svm.SVC(kernel = "linear", C=2.0)
y_hat = clf.fit(red_x_clustered,train_y).predict(red_x_test_clustered)
y_hats.append(y_hat)
acc = sum([y_hat[i] == test_y[i] for i in range(len(red_x_test_clustered))])/len(red_x_test_clustered)
print(acc)

print("Poly SVM")
clf = svm.SVC(kernel = "poly", degree = 2, C=2.0)
y_hat = clf.fit(red_x_clustered,train_y).predict(red_x_test_clustered)
acc = sum([y_hat[i] == test_y[i] for i in range(len(red_x_test_clustered))])/len(red_x_test_clustered)
y_hats.append(y_hat)
print(acc)

print("RBF SVM")
clf = svm.SVC(kernel = "rbf", C=2.0)
y_hat = clf.fit(red_x_clustered,train_y).predict(red_x_test_clustered)
acc = sum([y_hat[i] == test_y[i] for i in range(len(red_x_test_clustered))])/len(red_x_test_clustered)
y_hats.append(y_hat)
print(acc)



Linear SVM
0.67
Poly SVM
0.68
RBF SVM
0.67875


In [239]:
y_hat = []
for i in range(len(red_x_test_clustered)):
    if y_hats[0][i]+y_hats[1][i]+y_hats[2][i] < 1.5:
        y_hat.append(0)
    else:
        y_hat.append(1)
acc = sum([y_hat[i] == test_y[i] for i in range(len(red_x_test_clustered))])/len(red_x_test_clustered)
print(acc)

0.6825


24
