In [32]:
import pandas as pd
import numpy as np
import h5py
import math

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, regularizers, Flatten

from sklearn import ensemble, preprocessing, multiclass
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score, train_test_split


from collections import Counter

In [33]:
# Transformation

def sqrt(col):
    return list(map(np.sqrt, col));

def freq(col):
    col = np.floor(col)
    counter = Counter(col)
    return [counter.get(elem) for elem in col]


In [34]:
# Globals

# Datasets
dids = np.load("datasets/indexes.npy")

# RF model parameters
seed = 67
transformations = [sqrt, freq]
transformations_name = ["sqrt", "freq"]
trans2target = {}

# Comrpessed Dataset paramters
qsa_representation = []
num_bin = 200
too_big = 100000

# Neural Nets Parameters and Variables
MLP_LFE_Nets = {}
inp_shape = (2,num_bin)
dropout = 0.2
norm = (-10, 10)
pred_threshold = 0.5


In [35]:
#def binarize_dataset():

def load_dataset(id):
    X = np.load("datasets/" + str(id) + "-data.npy")
    y = np.load("datasets/" + str(id) + "-target.npy")
    categorical = np.load("datasets/" + str(id) + "-categorical.npy")
    return X,y,categorical

    
def evaluate_model(X, y, categorical):
    imp = Imputer(missing_values="NaN")
    X = imp.fit_transform(X)
    enc = preprocessing.OneHotEncoder(categorical_features=categorical)
    X = enc.fit_transform(X)
    clf = ensemble.RandomForestClassifier(random_state=seed)
    #clf_ovsr = multiclass.OneVsRestClassifier(clf, n_jobs=-1)
    
    return cross_val_score(clf, X, y,cv=10)
    
def is_positive(X,y,categorical,base_score,transformation,feature):
    transformed_feature = np.array(transformation(X[:,feature]))
    X = np.c_[X,transformed_feature]
    categorical = np.append(categorical,False)
    new_score = evaluate_model(X,y,categorical).mean()
    
    return 1 if(base_score <= (new_score - 0.01)) else 0
    
    

In [36]:
# Build the target for the compressed feature
bad_datasets = []

def build_target_for_compressed(dids):

    for transf in transformations:
        trans2target[transf] = []

    
    for did in dids:
        print("Start dataset number", did)
        
        try:
            
            X, y, categorical = load_dataset(did)       

            new_indexes = []

            if(X.shape[0] > too_big):
                new_indexes = np.random.choice(X.shape[0], too_big, replace=False)
                X = X[new_indexes]
                y = y[new_indexes]

            base_score = evaluate_model(X, y, categorical).mean()

            # Find the indexes of numeric attributes
            numerical_indexes = np.where(np.invert(categorical))[0]

            for i,transf in enumerate(transformations):
                for feature in numerical_indexes:

                    print("\tEvaluating feature " + str(feature))

                    mlp_target = is_positive(X,y,categorical,base_score,transf, feature)

                    print("\t\t" + str(mlp_target))

                    trans2target[transf].append((did,feature,mlp_target))
        except:
                print("The evaluation of dataset " + str(did) + " failed")
                bad_datasets.append(did)
                continue

In [37]:
# Save the result
def save_target_for_compressed(path):

    for transf, name in zip(transformations, transformations_name):
        np.save(path + name, trans2target[transf])

In [38]:
def normalize_Rx(matrix):
    
    Rxc = np.zeros(shape=matrix.shape)
    
    for i,row in enumerate(matrix):
        max_c = np.amax(row)
        min_c = np.amin(row)
        bin_width = (max_c-min_c)/(norm[1]-norm[0])
        Rxc[i] = np.apply_along_axis(lambda x : np.floor((x-min_c)/(bin_width)+norm[0]), 0, row)
    
    return Rxc

def to_quantile_sketch_array(did, col, targets, bins, t_class, index):
    max_c = np.nanmax(col)
    min_c = np.nanmin(col)
    bin_width = (max_c-min_c)/num_bin
    Rx = np.zeros(shape=(2,num_bin))
    
    if(bin_width == 0):
        return
    
    for val,y in zip(col,targets):
        if not np.isnan(val):
            bin_value = int(np.floor((val-min_c)/bin_width))
            bin_value = np.clip(bin_value, 0, num_bin-1)
            my_class = 0 if t_class == y else 1
            Rx[my_class][bin_value] = Rx[my_class][bin_value] + 1
            
    Rx = normalize_Rx(Rx)

    qsa_representation.append(np.insert(Rx.flatten(), 0, [did,index]))

In [39]:
# Build the compressed dataset
def build_compressed_dataset(dids):

    qsa_representation = []

    for did in dids[:2]:
        print("Start dataset number", did)

        try:
            X, y, categorical = load_dataset(did)
        except:
            print("Dataset " + str(did) + " not found")
            continue;

        new_indexes = []

        if(X.shape[0] > too_big):
            new_indexes = np.random.choice(X.shape[0], too_big, replace=False)
            X = X[new_indexes]
            y = y[new_indexes]

        numerical_indexes = np.where(np.invert(categorical))[0]

        classes = set(y)

        for t_class in classes:
            for index in numerical_indexes:
                to_quantile_sketch_array(did,X[:,index], y, num_bin, t_class, index)
        
    

In [40]:
# Save the compressed datasets
def save_compressed_dataset(path):
    np.save(path + "compressed.npy", qsa_representation)

In [41]:
# CREATING THE NEURAL NETS

def initialize_MLPs():
    
    for transf in transformations_name:
        model = Sequential()

        model.add(Dense(64, input_shape=inp_shape, W_regularizer=regularizers.l2(0.01)))
        model.add(Activation('softmax'))
        model.add(Dense(64, W_regularizer=regularizers.l2(0.01)))
        model.add(Dropout(dropout))
        model.add(Flatten())
        model.add(Dense(output_dim=1))
        model.add(Activation('relu'))

        # For a binary classification problem
        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        MLP_LFE_Nets[transf] = model
    

In [42]:

def load_compressed_ds():
    data = np.load("datasets/compressed/compressed.npy")
    return pd.DataFrame(data)


def assemble_training_set(compressed, transformation_targets):
    targetDf = pd.DataFrame(transformation_targets)
    merged = pd.merge(compressed, targetDf, how='left', on=[0, 1])
    class_1 = merged.ix[:,4:num_bin + 4].values
    class_2 = merged.ix[:,num_bin + 4:-1].values
    target = np.array(merged.ix[:,-1].values)
    meta_inf = np.array(merged.ix[:,:2].values)
    meta_target = np.c_[target, meta_inf]
    X = []
    
    for c1, c2 in zip(class_1, class_2):
        X.append([c1,c2])    

    return np.array(X), np.array(meta_target)


def split_training_test():
    compressed_ds = load_compressed_ds()

    for transf, name in zip(transformations, transformations_name):
        transformation_targets = np.load("datasets/compressed/" + name + ".npy")
        X,y = assemble_training_set(compressed_ds, transformation_targets)
        
        X_s_tr, X_s_test, y_s_tr, y_s_test = train_test_split(X, y, test_size=0.2)
        
        # Dropping the meta-info from training set
        y_s_tr = y_s_tr[:,:1]
        
        np.save("datasets/training/" + name + "-data_split",X_s_tr)
        np.save("datasets/training/" + name + "-target_split",y_s_tr)
        np.save("datasets/test/" + name + "-data_split",X_s_test)
        np.save("datasets/test/" + name + "-target_split",y_s_test)

        
def load_training_set(transf):
    
    X = np.load("datasets/training/" + transf + "-data_split.npy")
    y = np.load("datasets/training/" + transf + "-target_split.npy")
    
    return X,y


In [49]:
# Training the nets

def train_MLPs():

    for transf, name in zip(transformations, transformations_name):
        
        X,y = load_training_set(name)

        MLP_LFE_Nets[name].summary()
        print ("Inputs: {}".format(MLP_LFE_Nets[name].input_shape))
        print ("Outputs: {}".format(MLP_LFE_Nets[name].output_shape))
        print ("Actual input: {}".format(X.shape))
        print ("Actual output: {}".format(y.shape))   

        MLP_LFE_Nets[name].fit(X, y)       


In [44]:

def save_MLPs():
    for transf in transformations_name:
        MLP_LFE_Nets[transf].save_weights("datasets/MLPs/" + transf + "-weights")
        model_json = MLP_LFE_Nets[transf].to_json()
        with open("datasets/MLPs/" + transf + "-net_model", "w") as f:
            f.write(model_json)


In [111]:
def load_test_set():
    
    X = np.load("datasets/test/" + transformations_name[0] + "-data_split.npy")
    y_meta = np.load("datasets/test/" + transformations_name[0] + "-target_split.npy")
    t = np.full((y_meta.shape[0],1), 0)
    y_meta = np.concatenate((y_meta, t), axis=1)
    
    for i,name in enumerate(transformations_name[1:]):
        X = np.concatenate((X, np.load("datasets/test/" + name + "-data_split.npy")), axis=0)
        y_meta_tmp = np.load("datasets/test/" + name + "-target_split.npy")
        t = np.full((y_meta_tmp.shape[0],1),i+1)
        y_meta_tmp = np.concatenate((y_meta_tmp, t), axis=1)
        y_meta = np.concatenate((y_meta, y_meta_tmp), axis=0)
    
    return X,y_meta

In [122]:
# Test the accuracy on a dataset

def evaluate_transformation_classifier():
    
    # Number of prediction on features
    num_of_prediction = 0
    # Number of correct prediction on features
    num_of_correct_prediction = 0
    # Number of dataset which received a prediction
    good_predicted_dids = set()
    num_of_predicted_dataset = 0
    
    pred_mat = []
    
    X, y_meta = load_test_set()
    
    for transf in transformations_name:
        pred_mat.append(MLP_LFE_Nets[transf].predict(X))
        
    pred_mat = np.array(pred_mat).transpose()
    
    for predictions,did,feature in zip(pred_mat,y_meta[:,1], y_meta[:,2]):
        pmax = np.amax(predictions)
        
        if pmax > pred_threshold:
            index = np.where(predictions==pmax)[0][0]
            num_of_prediction += 1
            
            # Select the target for the transformation and the dataset
            positive_example_found = len(np.where((y_meta[:,0] == 1)       &  \
                                                  (y_meta[:,1] == did)     &  \
                                                  (y_meta[:,2] == feature) &  \
                                                  (y_meta[:,3] == index)))    \
                                                                                  >0
            
            if(positive_example_found):
                good_predicted_dids.add(did) 
                num_of_correct_prediction += 1
    
    print("Number of prediction:", num_of_prediction)
    print("Number of Correct prediciton:", num_of_correct_prediction)
    print("Accuracy:", num_of_correct_prediction/num_of_prediction)
    print("Number of datasets who received a good prediction:", len(good_predicted_dids))
    


# Build and Preprocess the Dataset

In [None]:

build_target_for_compressed(dids)
save_target_for_compressed("datasets/compressed/")
    

In [None]:

build_compressed_dataset(dids)
save_compressed_dataset("datasets/compressed/")
    

# Train Test Split

In [None]:

split_training_test()



# Train the MLPs


In [50]:

initialize_MLPs()
train_MLPs()
save_MLPs()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_19 (Dense)                 (None, 2, 64)         12864       dense_input_7[0][0]              
____________________________________________________________________________________________________
activation_13 (Activation)       (None, 2, 64)         0           dense_19[0][0]                   
____________________________________________________________________________________________________
dense_20 (Dense)                 (None, 2, 64)         4160        activation_13[0][0]              
____________________________________________________________________________________________________
dropout_7 (Dropout)              (None, 2, 64)         0           dense_20[0][0]                   
___________________________________________________________________________________________


# Test the nets


In [121]:

evaluate_transformation_classifier()


Number of prediction: 1
Number of Correct prediciton: 1
Accuracy: 1.0
Number of datasets who received a good prediciton: 1
