In [1]:
import pandas as pd
import numpy as np
import h5py

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, regularizers

from sklearn import ensemble, preprocessing, multiclass
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

from collections import Counter

Using TensorFlow backend.


In [68]:
# Transformation

def sqrt(col):
    """Return a list holding the square root of every element of ``col``.

    ``np.sqrt`` is applied to one element at a time, so the result is a plain
    Python list with the same length and order as ``col``.
    """
    return [np.sqrt(value) for value in col]

def freq(col):
    """Replace every value by the number of times its floor occurs in ``col``.

    Values are floored first, so 1.2 and 1.7 fall in the same group.

    Missing values (NaN) are propagated as NaN instead of being counted.
    Every NaN scalar is a distinct dictionary key, so looking one up in a
    ``Counter`` finds nothing and would put ``None`` into the result.

    Parameters
    ----------
    col : sequence of numbers
        One-dimensional column of values; may contain NaN.

    Returns
    -------
    list
        For each element of ``col``, the integer count of its floored value,
        or NaN where the element itself is NaN.
    """
    floored = np.floor(np.asarray(col, dtype=float))
    # Count only the non-missing entries; plain Python floats hash reliably.
    counts = Counter(floored[~np.isnan(floored)].tolist())
    # ``value == value`` is False only for NaN.
    return [counts[value] if value == value else np.nan for value in floored.tolist()]


In [81]:
# Globals: shared configuration and accumulators used by the cells below.

# Datasets: ids loaded from disk; each id is passed to load_dataset().
dids = np.load("datasets/indexes.npy")

# Random forest evaluation parameters
# `seed` is the random_state given to the RandomForestClassifier.
seed = 67
# Candidate feature transformations and their names (parallel lists); the
# names are used for output file names and as keys of MLP_LFE_Nets.
transformations = [sqrt, freq]
transformations_name = ["sqrt", "freq"]
# Maps a transformation function to a list of (did, feature, target) tuples.
trans2target = {}

# Compressed dataset parameters
# Rows appended by to_quantile_sketch_array(), one per call.
qsa_representation = []
# Number of bins per row of the (2, num_bin) sketch.
num_bin = 20

# Neural net parameters and variables
# Maps a transformation name to its compiled Keras model.
MLP_LFE_Nets = {}
# Passed as input_shape to the first Dense layer.
inp_shape = (2,num_bin)
# Rate passed to the Dropout layer.
dropout = 0.2
# (lower, upper) range that normalize_Rx() rescales each row into.
norm = (-10, 10)


In [4]:
#def binarize_dataset():

def load_dataset(id):
    """Load the three ``.npy`` files that make up dataset ``id``.

    The files are ``datasets/<id>-data.npy``, ``datasets/<id>-target.npy`` and
    ``datasets/<id>-categorical.npy``, read in that order.

    Returns
    -------
    tuple
        ``(X, y, categorical)``. Callers in this notebook treat
        ``categorical`` as a per-column boolean mask.
    """
    prefix = "datasets/" + str(id)
    features, labels, is_categorical = (
        np.load(prefix + suffix)
        for suffix in ("-data.npy", "-target.npy", "-categorical.npy")
    )
    return features, labels, is_categorical

    
def evaluate_model(X, y, categorical):
    """Cross-validate a random forest on the imputed, one-hot encoded data.

    Steps: impute NaN entries of ``X`` with ``Imputer``'s default strategy,
    one-hot encode the columns selected by ``categorical``, then run 10-fold
    cross-validation of a ``RandomForestClassifier`` seeded with the global
    ``seed``.

    Returns
    -------
    array
        The ten per-fold scores returned by ``cross_val_score``.
    """
    imputed = Imputer(missing_values="NaN").fit_transform(X)
    encoder = preprocessing.OneHotEncoder(categorical_features=categorical)
    encoded = encoder.fit_transform(imputed)
    forest = ensemble.RandomForestClassifier(random_state=seed)
    return cross_val_score(forest, encoded, y, cv=10)
    
def is_positive(X, y, categorical, base_score, transformation, feature, threshold=0.01):
    """Tell whether adding a transformed feature improves the model score.

    The column ``feature`` of ``X`` is passed through ``transformation``, the
    result is appended to ``X`` as a new non-categorical column, and the
    augmented data is scored with ``evaluate_model``.

    Parameters
    ----------
    X, y, categorical
        Dataset as returned by ``load_dataset``.
    base_score : float
        Mean score of the model on the untransformed data.
    transformation : callable
        Maps a 1-D column to a sequence of the same length.
    feature : int
        Index of the column of ``X`` to transform.
    threshold : float, optional
        Minimum score gain over ``base_score`` that counts as an improvement.
        Defaults to 0.01, the margin that used to be hard-coded.

    Returns
    -------
    int
        1 if the mean score improved by at least ``threshold``, else 0.
    """
    transformed_feature = np.array(transformation(X[:, feature]))
    augmented = np.c_[X, transformed_feature]
    # The new column is numeric, so it is flagged as not categorical.
    augmented_categorical = np.append(categorical, False)
    new_score = evaluate_model(augmented, y, augmented_categorical).mean()

    return 1 if base_score <= (new_score - threshold) else 0
    
    

In [13]:
# Build the target for the compressed feature:
# for every (dataset, numeric feature, transformation) record whether adding the
# transformed feature improves the cross-validated score.

# Start from empty lists so that re-running this cell does not accumulate duplicates.
for transf in transformations:
    trans2target[transf] = []

for did in dids[:2]:
    print("Start dataset number", did)

    X, y, categorical = load_dataset(did)

    # Mean score on the untransformed data, used as the reference for is_positive().
    base_score = evaluate_model(X, y, categorical).mean()

    # Find the indexes of numeric attributes (categorical flag is False)
    numerical_indexes = np.where(np.invert(categorical))[0]

    for transf in transformations:
        for feature in numerical_indexes:

            print("\tEvaluating feature " + str(feature))

            mlp_target = is_positive(X, y, categorical, base_score, transf, feature)

            print("\t\t" + str(mlp_target))

            trans2target[transf].append((did, feature, mlp_target))

Start dataset number 4
	Evaluating feature 0
		1
	Evaluating feature 1
		0
	Evaluating feature 2
		1
	Evaluating feature 3
		1
	Evaluating feature 5
		1
	Evaluating feature 7
		1
	Evaluating feature 8
		1
	Evaluating feature 10
		1
	Evaluating feature 0
		1
	Evaluating feature 1
		1
	Evaluating feature 2
		1
	Evaluating feature 3
		1
	Evaluating feature 5
		0
	Evaluating feature 7
		1
	Evaluating feature 8
		1
	Evaluating feature 10
		1
Start dataset number 8




	Evaluating feature 0
		0
	Evaluating feature 1




		0
	Evaluating feature 2
		0
	Evaluating feature 3




		0
	Evaluating feature 4
		0
	Evaluating feature 0




		0
	Evaluating feature 1
		1
	Evaluating feature 2




		0
	Evaluating feature 3
		0
	Evaluating feature 4
		0




In [14]:
# Save the result

# One file per transformation, named after it, holding its (did, feature, target) rows.
for name, transf in zip(transformations_name, transformations):
    np.save(file="datasets/transformation_ds/" + name, arr=trans2target[transf])

In [80]:
def normalize_Rx(matrix, bounds=None):
    """Rescale every row of ``matrix`` to integer steps inside ``bounds``.

    Each row is mapped linearly so that its minimum lands on the lower bound
    and its maximum on the upper bound, and the result is floored.

    A row whose values are all equal has no range to stretch, and dividing by
    its zero width would fill the row with NaN. Such rows are set to the
    lower bound instead.

    Parameters
    ----------
    matrix : array-like, 2-D
        Rows to normalise independently.
    bounds : tuple (lower, upper), optional
        Target range. Defaults to the notebook-level ``norm`` setting.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``matrix``.
    """
    lower, upper = norm if bounds is None else bounds
    matrix = np.asarray(matrix, dtype=float)
    Rxc = np.zeros(shape=matrix.shape)

    for i, row in enumerate(matrix):
        if row.size == 0:
            continue
        min_c = np.amin(row)
        max_c = np.amax(row)
        if max_c == min_c:
            # Constant row: avoid the 0/0 division, pin it to the lower bound.
            Rxc[i] = lower
            continue
        bin_width = (max_c - min_c) / (upper - lower)
        Rxc[i] = np.floor((row - min_c) / bin_width + lower)

    return Rxc

def to_quantile_sketch_array(did, col, targets, bins, t_class, index):
    """Append the two-row histogram sketch of one column to ``qsa_representation``.

    The non-missing values of ``col`` are binned into ``bins`` equal-width
    bins between the column minimum and maximum. Row 0 counts the samples
    whose target equals ``t_class``, row 1 counts all other samples. The
    counts are rescaled with ``normalize_Rx``, flattened, prefixed with
    ``[did, index]`` and appended to the global ``qsa_representation`` list.

    Parameters
    ----------
    did
        Dataset id, stored as the first entry of the appended row.
    col : 1-D array
        Feature values; NaN entries are skipped.
    targets : sequence
        Class label of every sample, aligned with ``col``.
    bins : int
        Number of bins; must be positive. It used to be ignored in favour of
        the global ``num_bin``.
    t_class
        The label treated as the "one" class in this one-vs-rest sketch.
    index
        Column index, stored as the second entry of the appended row.

    Returns
    -------
    None
    """
    max_c = np.nanmax(col)
    min_c = np.nanmin(col)
    bin_width = (max_c - min_c) / bins
    Rx = np.zeros(shape=(2, bins))

    for val, y in zip(col, targets):
        if np.isnan(val):
            continue
        if bin_width == 0:
            # Constant column: every value goes to the first bin instead of
            # dividing by zero.
            bin_value = 0
        else:
            # The maximum would land in bin `bins`; clip it into the last bin.
            bin_value = int(np.clip(np.floor((val - min_c) / bin_width), 0, bins - 1))
        my_class = 0 if t_class == y else 1
        Rx[my_class, bin_value] += 1

    Rx = normalize_Rx(Rx)

    qsa_representation.append(np.insert(Rx.flatten(), 0, [did, index]))

In [82]:
# Build the compressed dataset: one sketch row per (class, numeric feature).

# Empty the accumulator in place so that re-running this cell does not append
# duplicate rows (in-place so other references to the list stay valid).
del qsa_representation[:]

for did in dids[:1]:
    print("Start dataset number", did)

    X, y, categorical = load_dataset(did)
    numerical_indexes = np.where(np.invert(categorical))[0]

    # The appended rows do not store the class label, so their order is the only
    # link back to it. Sort the labels: set iteration order is arbitrary and, for
    # string labels, can change between interpreter sessions.
    classes = sorted(set(y))

    for t_class in classes:
        for index in numerical_indexes:
            to_quantile_sketch_array(did, X[:, index], y, num_bin, t_class, index)
        
    

Start dataset number 4


In [85]:
# Write the accumulated sketch rows to disk as a single .npy file.
np.save(file="datasets/compressed/compressed.npy", arr=qsa_representation)

array([[  4.,   0.,   0., -10., -10., -10., -10., -10., -10., -10., -10.,
        -10.,  10., -10., -10., -10., -10., -10., -10., -10., -10.,   0.,
         -5., -10., -10., -10., -10., -10., -10., -10., -10., -10.,  10.,
        -10., -10., -10., -10., -10., -10., -10., -10.,   6.],
       [  4.,   1.,  10., -10.,  -2.,  -8.,  -8., -10.,  -8., -10.,  -4.,
        -10.,  -8., -10., -10., -10., -10., -10., -10., -10., -10., -10.,
         -6., -10., -10.,  -8.,  -4., -10.,   1., -10.,  -4.,  -8.,  10.,
        -10.,   1., -10.,  -6., -10.,  -6.,  -8., -10.,  -6.],
       [  4.,   2.,   2., -10.,  10., -10.,   2., -10., -10., -10.,   2.,
        -10., -10., -10.,  -6., -10., -10., -10., -10., -10., -10., -10.,
        -10., -10.,  -8., -10.,  -6., -10.,  -8., -10.,  10.,  -6.,   2.,
         -8.,  -4.,  -8., -10.,  -8.,  -8.,  -8., -10.,  -8.],
       [  4.,   3.,  10., -10., -10.,  -4., -10., -10., -10., -10., -10.,
        -10., -10., -10., -10., -10., -10., -10., -10., -10., -10., -10

In [95]:

def load_targets(path=None):
    """Load training targets from a ``.npy`` file.

    The training cell calls this function with a file path, which the
    original zero-argument stub rejected with a ``TypeError``.

    Parameters
    ----------
    path : str, optional
        Location of the ``.npy`` file to read. When omitted, an empty list is
        returned, as the original stub did.

    Returns
    -------
    list or numpy.ndarray
        ``[]`` without a path, otherwise the array stored at ``path``.
    """
    if path is None:
        return []
    return np.load(path)


    

In [None]:
# CREATING THE NEURAL NETS
# One binary classifier per transformation name, stored in MLP_LFE_Nets.

# The sketch rows are stored flattened (to_quantile_sketch_array appends
# Rx.flatten()), so the network takes one flat vector per sample. Feeding the
# 2-D inp_shape to Dense would give an output of shape (2, 1) per sample rather
# than the single 0/1 prediction the targets require.
flat_input_dim = inp_shape[0] * inp_shape[1]

for transf in transformations_name:
    model = Sequential()

    model.add(Dense(64, input_shape=(flat_input_dim,), W_regularizer=regularizers.l2(0.01)))
    # Hidden non-linearity: softmax here would squash the 64 units into a
    # probability distribution, which is not wanted for a hidden layer.
    model.add(Activation('relu'))
    model.add(Dense(64, W_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    # binary_crossentropy expects a probability in (0, 1); relu is unbounded
    # above and can get stuck at exactly 0.
    model.add(Activation('sigmoid'))

    # For a binary classification problem
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    MLP_LFE_Nets[transf] = model
    

In [None]:
# Training the nets


# Read back the saved target table of every transformation and print it.
for transf, name in zip(transformations, transformations_name):
    k = np.load(file="datasets/transformation_ds/" + name + ".npy")
    print(k)

for id in dids[:100]:
    targets = load_targets("datasets/targets.npy")
    X = load_only_numeric_data(id)
    
    for i,trans in enumerate(transformations):
        for feature, target in zip(X.transpose(), targets):
            
    