# In [1]:
import numpy as np
from sklearn import preprocessing
import time
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
try: import cPickle as pickle
except: import pickle
from math import ceil
try: from pyflann import *
except: pass
try: from pykdtree.kdtree import KDTree
except: pass
try: from sklearn.neighbors import NearestNeighbors
except: pass
try: from scipy.spatial import cKDTree
except: pass

# In [None]:

def get_data_process(li, list_of_index):
    '''
    Pick the positions named in list_of_index out of every entry of li.

    li: iterable of indexable entries
    list_of_index: positions to extract from each entry, in output order

    Returns a numpy array built from one tuple of selected values per entry.
    '''
    selected_rows = [
        tuple(entry[position] for position in list_of_index)
        for entry in li
    ]
    return np.asarray(selected_rows)
    
def PCA_analysis(data, n_components = 2):
    '''
    Fit a randomized-solver PCA on data and project data onto it.

    Returns a pair: the transformed data, and the sum of the fitted
    explained_variance_ratio_ over the n_components kept.
    '''
    reducer = PCA(svd_solver = "randomized", n_components = n_components)
    projected = reducer.fit_transform(data)
    total_explained = sum(reducer.explained_variance_ratio_)
    return projected, total_explained
    
def PCA_analysis_desc(data,list_desc):
    '''
    Run a 2-component randomized-solver PCA on the descriptors in list_desc.

    The columns listed in list_desc are first extracted from data with
    get_data_process.

    Returns a triple: the transformed data, then the first and the second
    row of the fitted components_.
    '''
    selected = get_data_process(data, list_desc)
    model = PCA(svd_solver = "randomized", n_components = 2)
    projected = model.fit_transform(selected)
    first_axis, second_axis = model.components_[0], model.components_[1]
    return projected, first_axis, second_axis



def remove_list_from_list(a,b):
    '''
    Return the distinct elements of a that do not appear in b, as a list.

    Both inputs are converted to sets, so duplicates collapse and the order
    of the result is whatever order the resulting set iterates in.
    '''
    candidates = set(a)
    excluded = set(b)
    return list(candidates - excluded)

def remove_list_from_list2(a,b):
    '''
    Remove, in place, one occurrence from a for every element of b.

    Elements of b that are not (or no longer) present in a are skipped.
    The same list object a is returned after modification.
    '''
    for unwanted in b:
        try:
            a.remove(unwanted)
        except ValueError:
            # list.remove found nothing to delete for this element
            continue
    return a


def get_array_based_on_index(array, index_list):
    '''
    Gather array[i] for every i in index_list (in that order) into a new
    numpy array.
    '''
    picked = []
    for position in index_list:
        picked.append(array[position])
    return np.asarray(picked)
def _query_nearest_pairs(points, method):
    '''
    Query, for every row of points, its two nearest rows within points.

    Returns (distances, indices), each indexable as result[row][0] and
    result[row][1].
    Raises NotImplementedError for an unknown method.
    '''
    if method == "flann":
        # this backend's result is unpacked as (indices, distances),
        # the reverse of the other backends
        indices, distances = FLANN().nn(points, points, 2, algorithm="kmeans")
    elif method == "scipy":
        distances, indices = cKDTree(points).query(points, k=2)
    elif method == "pykdtree":
        distances, indices = KDTree(points, leafsize=6).query(points, k=2)
    elif method == "sklearn":
        nbrs = NearestNeighbors(n_neighbors=2, algorithm='kd_tree', n_jobs=-1).fit(points)
        distances, indices = nbrs.kneighbors(points)
    else:
        raise NotImplementedError("method {} not implemented".format(method))
    return distances, indices


def get_subsampling_index2(data_process, cutoff_sig = 0.02, rate = 0.3, method = "pykdtree"):
    '''
    Iteratively thin out points that lie closer than a cutoff distance to
    their nearest neighbour and return the indices of the points kept.

    data_process: array-like of samples (one row per sample); it is
        standardized with StandardScaler before any distance is computed
    cutoff_sig: the cutoff distance is cutoff_sig times the square root of
        the summed per-column variances of the standardized data
    rate: fraction (rounded up) of the flagged neighbours that is handed to
        random_subsampling for removal in each cycle
    method: nearest-neighbour backend, one of "flann", "pykdtree",
        "sklearn" or "scipy"

    Returns a list of row indices into data_process that were kept.
    Raises NotImplementedError if method is not one of the backends above.
    '''
    backend_messages = {
        "flann": "use flann backend",
        "pykdtree": "use pykdtree backend",
        "sklearn": "use slearn nearest neighbors backend",
        "scipy": "use scipy cKDTree backend",
    }
    if method not in backend_messages:
        print("method {} not impletemented".format(method))
        # NotImplemented is a comparison sentinel, not an exception class
        raise NotImplementedError("method {} not implemented".format(method))
    print(backend_messages[method])

    data_process = StandardScaler().fit_transform(np.asarray(data_process).copy())

    # sum of the per-column variances, computed in one vectorized pass
    sum_std2 = float(np.sum(np.std(data_process, axis=0) ** 2))

    #setting cutoff distance
    cutoff = cutoff_sig * np.sqrt(sum_std2)

    overall_keep_list = np.arange(len(data_process)).tolist()

    keep_going = True
    while keep_going:
        print('start total length: {}'.format(len(overall_keep_list)))
        start = time.time()

        if len(overall_keep_list) < 2:
            # a two-neighbour query needs at least two points
            break

        # fancy indexing builds the working subset as a new array
        temp_data_process = data_process[overall_keep_list]
        distances, indices = _query_nearest_pairs(temp_data_process, method)

        remove_index_li = []
        index_li = []
        for index, distance in zip(indices, distances):
            index_li.append(index[0])
            if distance[1] <= cutoff:
                # second neighbour is within the cutoff: flag it for removal
                remove_index_li.append(index[1])

        temp_num = int(ceil(float(len(remove_index_li)) * rate))
        if temp_num == 0:
            keep_going = False

        # NOTE(review): random_subsampling is not defined in the visible
        # source; presumably it picks temp_num elements of the list at
        # random -- confirm it is provided elsewhere.
        remove_index_li = random_subsampling(remove_index_li, temp_num)

        # indices here are positions in temp_data_process; map them back to
        # positions in the full data set
        temp_keep_list = remove_list_from_list(index_li, remove_index_li)
        overall_keep_list = [overall_keep_list[i] for i in temp_keep_list]
        print('end cycle. length: {}\t time:{}'.format(len(overall_keep_list), time.time()-start))

    return overall_keep_list

    
def subsampling_system(data, list_desc = [], cutoff_sig = 0.05, rate = 0.3, method = "pykdtree"):
    '''
    Subsample data with get_subsampling_index2 and return the kept entries.

    data: indexable collection of samples
    list_desc: list of dimensions/descriptors used in subsampling; when
        empty (or None) every dimension of data is used
    cutoff_sig: forwarded to get_subsampling_index2
    rate: forwarded to get_subsampling_index2
    method: nearest-neighbour backend name, forwarded to
        get_subsampling_index2

    Returns a list holding data[i] for every kept index i.
    '''
    if list_desc is None or len(list_desc) == 0:
        data_process = data
    else:
        data_process = get_data_process(data, list_desc)

    # rate must be passed on explicitly, otherwise the callee's own default
    # is used and the caller's value is silently ignored
    overall_keep_list = get_subsampling_index2(data_process, cutoff_sig = cutoff_sig, rate = rate, method = method)
    return [data[i] for i in overall_keep_list]


def subsampling_system_with_PCA(data, list_desc = [], cutoff_sig = 0.05, rate = 0.3,start_trial_component = 10, max_component = 30, target_variance = 0.999999, method = "pykdtree"):
    '''
    Reduce the data with PCA, subsample in the reduced space with
    get_subsampling_index2, and return the kept entries of data.

    The number of PCA components starts at start_trial_component and grows
    by one until the summed explained variance ratio exceeds
    target_variance or max_component is reached. If the trial number of
    components reaches the number of rows or columns of the data, PCA is
    skipped and the unreduced data is subsampled instead.

    data: indexable collection of samples
    list_desc: list of dimensions/descriptors used; when empty (or None)
        every dimension of data is used
    cutoff_sig, rate, method: forwarded to get_subsampling_index2
    start_trial_component: first number of PCA components to try
    max_component: largest number of PCA components to try
    target_variance: explained-variance sum at which the search stops

    Returns a list holding data[i] for every kept index i.
    '''
    if list_desc is None or len(list_desc) == 0:
        data_process = data
    else:
        data_process = get_data_process(data, list_desc)

    print('start PCA')
    start = time.time()

    # PCA cannot be asked for more components than the data has rows or
    # columns, so this bound is checked before every fit
    component_limit = min(len(data_process), len(data_process[0]))

    trial_component = start_trial_component
    while True:
        if trial_component >= component_limit:
            # nothing left to reduce: fall back to the unreduced data
            pca_result = data_process
            break

        pca_result, sum_explained_variance = PCA_analysis(data_process, n_components = trial_component)
        print(str(time.time()-start))

        if sum_explained_variance > target_variance:
            break

        if trial_component >= max_component:
            print("stopped PCA at {} components, total explained variance: {}".format(trial_component, sum_explained_variance))
            break

        trial_component += 1

    print('end trial PCA')
    print(str(time.time()-start))
    overall_keep_list = get_subsampling_index2(pca_result, cutoff_sig = cutoff_sig,rate = rate, method = method)
    return [data[i] for i in overall_keep_list]
