In [1]:
import numpy as np
# Load the pre-split feature matrices and their label vectors from disk.
train_data, train_label = np.load("data/train_data1.npy"), np.load("data/train_label1.npy")
test_data, test_label = np.load("data/test_data1.npy"), np.load("data/test_label1.npy")

In [2]:
"""
accuracy in test data
"""
import sklearn
from sklearn import svm
# Baseline model: an SVC trained on every feature column of the training set.
CLASSFIER = svm.SVC(C=100., gamma=0.001)
CLASSFIER.fit(train_data, train_label)

# Accuracy = fraction of test samples whose predicted label equals the true label.
y_pred = CLASSFIER.predict(test_data)
correct_prediction = np.equal(y_pred, test_label)
accuracy = correct_prediction.astype(np.float32).mean()
print(accuracy)

0.59


In [3]:
"""
Accuracy in training data
"""
# Score the already-fitted classifier on the same data it was trained on, so the
# result can be compared with the test accuracy printed by the previous cell.
y_pred = CLASSFIER.predict(train_data)
correct_prediction = np.equal(y_pred, train_label)
accuracy = correct_prediction.astype(np.float32).mean()
print(accuracy)

1.0


In [4]:
"""
Find the best feature (column) indices to use in the classifier via k-fold cross-validation.
"""
def k_fold_cross_validation(k, X ,Y,numpySeedNumber,max_itr_count=25):
    """Run the random feature search on each of ``k`` folds and keep the best.

    The first ``k * (N // k)`` rows of ``X``/``Y`` are cut into ``k``
    contiguous, equally sized parts. Each part is used once as the validation
    set while all remaining rows (including any trailing ``N % k`` rows, which
    are never validated on) form the training set passed to ``execute``.

    Parameters
    ----------
    k : int
        Number of folds; must satisfy ``1 <= k <= X.shape[0]``.
    X : 2-D array
        Feature matrix, one row per sample.
    Y : 1-D array
        Labels aligned with the rows of ``X``.
    numpySeedNumber : int
        Seed forwarded to ``execute`` for every fold.
    max_itr_count : int, optional
        Number of candidate columns ``execute`` tries per fold (default 25,
        the value that used to be hard-coded here).

    Returns
    -------
    (max_column_indicies, max_accuracy)
        The column-index list and validation accuracy of the best fold.
        ``max_column_indicies`` is ``None`` (and ``max_accuracy`` is 0) when
        no fold reached an accuracy above 0.

    Raises
    ------
    ValueError
        If the fold size ``N // k`` is smaller than 1; every validation set
        would be empty and the accuracy undefined.
    """
    N = X.shape[0]
    part_N = int(N // k)
    if part_N < 1:
        raise ValueError(
            "k=%r does not yield non-empty folds for %d samples" % (k, N)
        )
    max_accuracy=0
    max_column_indicies=None
    for i in range(k):
        # rows [start, stop) are the i'th part, used as the validation set
        start, stop = i*part_N, (i+1)*part_N
        validation_X = X[start:stop,:]
        validation_Y = Y[start:stop]
        # everything outside [start, stop) is used for training
        training_X = np.concatenate((X[:start,:],X[stop:,:]), axis=0)
        training_Y = np.concatenate((Y[:start],Y[stop:]), axis=0)
        accuracy,column_indicies=execute(training_X,validation_X,training_Y,validation_Y,max_itr_count=max_itr_count,numpySeedNumber=numpySeedNumber)
        # strict '>' keeps the earliest fold when several folds tie
        if accuracy>max_accuracy:
            max_column_indicies=column_indicies
            max_accuracy=accuracy
    return max_column_indicies,max_accuracy


In [5]:
"""
Run up to max_itr_count random feature-selection iterations on the given train/test split.
Return the best accuracy found and the column indices that achieved it: (max_accuracy, curr_column_indicies).
"""
def execute(X_train,X_test,Y_train,Y_test,max_itr_count,numpySeedNumber):
    """Randomly grow a feature subset, keeping columns that improve accuracy.

    Each iteration draws one not-yet-accepted column of ``X_train`` via
    ``rand_col_sampling``, fits a fresh copy of the module-level ``CLASSFIER``
    on the accepted columns plus the candidate, and scores it on the same
    columns of ``X_test``. The candidate is kept only when the accuracy is
    strictly higher than the best seen so far.

    Parameters
    ----------
    X_train, X_test : 2-D arrays
        Training / evaluation feature matrices with the same column layout.
    Y_train, Y_test : 1-D arrays
        Labels aligned with the rows of ``X_train`` / ``X_test``.
    max_itr_count : int
        Maximum number of candidate columns to try.
    numpySeedNumber : int
        Seed applied to NumPy's global random generator before the search.

    Returns
    -------
    (max_accuracy, curr_column_indicies)
        Best accuracy reached (0 if no candidate scored above 0) and the list
        of accepted column indices, in acceptance order. Each entry is the
        one-element index array returned by ``rand_col_sampling``.
    """
    # Local import so this cell does not depend on an import made elsewhere.
    from sklearn.base import clone

    np.random.seed(numpySeedNumber)
    training_X=None
    curr_column_indicies=[]
    max_accuracy=0
    total_columns=X_train.shape[1]
    itr=0
    while itr<max_itr_count:
        itr+=1
        # Every column is already accepted: nothing is left to sample, and
        # asking rand_col_sampling for another unused column cannot succeed.
        if len(curr_column_indicies)>=total_columns:
            break
        # Same hyper-parameters as CLASSFIER, but an unfitted copy, so the
        # shared global estimator is not re-fitted as a side effect.
        clf = clone(CLASSFIER)
        X_new,new_col_index,curr_column_indicies=rand_col_sampling(X_old=training_X,curr_column_indicies=curr_column_indicies,data=X_train)
        clf.fit(X_new, Y_train)
        # Pick the same columns (accepted + candidate) from the evaluation data.
        test_data_new=rand_col_sampling(X_old=None,curr_column_indicies=curr_column_indicies+[new_col_index],data=X_test,only_selection=True)
        y_pred = clf.predict(test_data_new)
        correct_prediction = np.equal(y_pred, Y_test)
        accuracy = np.mean(correct_prediction.astype(np.float32))
        if accuracy>max_accuracy:
            # Candidate helped: accept it and continue growing from X_new.
            curr_column_indicies+=[new_col_index]
            max_accuracy=accuracy
            training_X=X_new

    return max_accuracy,curr_column_indicies

In [6]:
"""
Takes X_old and appends a new random column of data to it. If only_selection=True,
just selects the given column indices from data and returns them.
"""

def rand_col_sampling(X_old,curr_column_indicies,data,only_selection=False):
    """Select columns of ``data`` by index, or append one new random column.

    Two modes, chosen by ``only_selection``:

    * ``only_selection=True``: return the columns of ``data`` listed in
      ``curr_column_indicies``, in that order, as a 2-D array. ``X_old`` is
      ignored. An empty index list returns ``None``.
    * ``only_selection=False``: draw random column indices of ``data`` until
      one is found that is not in ``curr_column_indicies``, then append that
      column to ``X_old`` (or start a new matrix when ``X_old`` is ``None``).

    Parameters
    ----------
    X_old : 2-D array or None
        Matrix built so far; only used in sampling mode.
    curr_column_indicies : list
        Column indices already chosen. Entries may be plain integers or
        integer arrays (such as the one-element arrays this function returns).
    data : 2-D array
        Source matrix whose columns are selected / sampled.
    only_selection : bool, optional
        Selects the mode described above (default ``False``).

    Returns
    -------
    ``only_selection=True``
        2-D array of the selected columns, or ``None`` for an empty selection.
    ``only_selection=False``
        ``(X_new, rand, curr_column_indicies)`` where ``rand`` is a
        one-element integer array holding the sampled column index and
        ``curr_column_indicies`` is the unchanged input list.

    Raises
    ------
    ValueError
        In sampling mode, when every column of ``data`` is already listed in
        ``curr_column_indicies`` (no unused column exists to draw).
    """
    # Normalise the indices to one flat integer array so that plain ints and
    # one-element arrays are handled the same way.
    if len(curr_column_indicies) > 0:
        chosen = np.concatenate([np.ravel(ind) for ind in curr_column_indicies])
    else:
        chosen = np.empty(0, dtype=int)

    if only_selection: #if only selection then just return columns at column_indicies
        if chosen.size == 0:
            return None
        # Indexing with an index array keeps the result 2-D, one column per index.
        return data[:, chosen]

    #if not only_selection then add random column to X_old
    max_col_count=data.shape[1]
    used = set(chosen.tolist())
    # Without this guard the rejection loop below would never terminate.
    if all(col in used for col in range(max_col_count)):
        raise ValueError("all %d columns of data are already selected" % max_col_count)
    rand=np.random.randint(max_col_count, size=1)
    while int(rand[0]) in used:
        rand=np.random.randint(max_col_count, size=1)
    new_column = data[:,rand]  # shape (n_rows, 1) because rand is an array
    if X_old is None:
        X_new = new_column
    else:
        X_new = np.concatenate((X_old,new_column), axis=1)
    return X_new,rand,curr_column_indicies

In [7]:
# 10-fold cross-validation over the training set: returns the feature column
# indices of the best-scoring fold together with that fold's accuracy.
column_indicies, validation_accuracy = k_fold_cross_validation(
    k=10,
    X=train_data,
    Y=train_label,
    numpySeedNumber=0,
)

In [8]:

# Refit the classifier on the training data restricted to the selected columns.
train_data_new = rand_col_sampling(
    X_old=None,
    curr_column_indicies=column_indicies,
    data=train_data,
    only_selection=True,
)
CLASSFIER.fit(train_data_new, train_label)

# Restrict the test data to the same columns and predict its labels.
test_data_new = rand_col_sampling(
    X_old=None,
    curr_column_indicies=column_indicies,
    data=test_data,
    only_selection=True,
)
y_pred = CLASSFIER.predict(test_data_new)

# Accuracy = fraction of test samples predicted correctly.
correct_prediction = np.equal(y_pred, test_label)
accuracy = correct_prediction.astype(np.float32).mean()
print(accuracy)


0.9925
