In [1]:
#%reset
import sys
# Extend the import search path with a machine-specific Python 2.7
# site-packages directory and the local ./modules directory.
# NOTE(review): the absolute path is not portable; presumably ./modules holds
# the pipemodules / projecthandle / run_grid modules imported below - confirm.
sys.path.append('/usr/local/lib/python2.7/site-packages')
sys.path.append('./modules')

import warnings
#import re

import pandas as pd
#import numpy as np
from sklearn import metrics

## import modules to build pipelines
import pipemodules as pm
import projecthandle as proj
import run_grid as rg

#% matplotlib inline

In [2]:
import pickle
import pipemodules as pm
import pandas as pd
import numpy as np


def quality_filter(all_data, min_train_score=0.75, max_diff=0.15):
    """Filter grid-search results and re-score the surviving models.

    A row of ``all_data.eval_results`` survives only when
    ``mean_train_score > min_train_score`` and
    ``abs(mean_test_score - mean_train_score) < max_diff``.  Every surviving
    configuration is rebuilt, fitted on the development set and scored on
    both the development and the evaluation set.

    Parameters
    ----------
    all_data : object
        Must expose ``eval_results`` (a DataFrame with the columns
        ``mean_train_score``, ``mean_test_score``, ``method_ids`` and
        ``params``), ``dev_set`` and ``eval_set`` (each with ``matrix_raw``
        and ``y_raw``) and ``meth.indvals``.
    min_train_score : float
        Exclusive lower bound on ``mean_train_score``.
    max_diff : float
        Exclusive upper bound on the absolute train/test score gap.

    Returns
    -------
    pandas.DataFrame
        One row per surviving configuration with the columns
        ``dev_set_score``, ``eval_set_score``, ``method_ids``, ``parameters``,
        ``dev_evs``, ``eval_evs``, ``dev_mae``, ``eval_mae``, ``dev_mse``,
        ``eval_mse``, ``dev_median_ae``, ``eval_median_ae`` and
        ``rank_test_score`` (0 = highest ``eval_set_score``), sorted by rank.

    Notes
    -----
    ``all_data.eval_results`` is left untouched, so the function can be
    called repeatedly with different thresholds on the same object.
    The names ``pm``, ``metrics``, ``pd`` and ``np`` must already be in scope.
    """
    # reset_index() without inplace returns a new frame, so the caller's
    # eval_results keeps all of its rows and its original index.
    results = all_data.eval_results.reset_index(drop=True)

    ## filter results and eliminate poor models
    train_score = results['mean_train_score']
    test_score = results['mean_test_score']
    good_fit = (train_score > min_train_score) \
        & ((test_score - train_score).abs() < max_diff)
    results = results[good_fit].reset_index(drop=True)

    # targets do not change between configurations
    y_dev = all_data.dev_set.y_raw
    y_eval = all_data.eval_set.y_raw

    ## create analysis set
    columns = ['dev_set_score', 'eval_set_score', 'method_ids', 'parameters',
               'dev_evs', 'eval_evs', 'dev_mae', 'eval_mae',
               'dev_mse', 'eval_mse', 'dev_median_ae', 'eval_median_ae']
    records = []

    for method_id, params in zip(results['method_ids'], results['params']):
        # NOTE(review): eval() executes arbitrary code contained in the stored
        # method id string; only use result files from a trusted source.
        setup = eval(method_id)  # convert to iterable array

        # set the estimator type and initiate estimator class
        _, clf, _ = pm.search_random_forest().set_method(setup[2])

        # same feature selection for the development and evaluation sets
        feature_spec = all_data.meth.indvals[setup[0]][setup[1]]
        X_dev, _ = pm.get_X(all_data.dev_set.matrix_raw, feature_spec)
        X_eval, _ = pm.get_X(all_data.eval_set.matrix_raw, feature_spec)

        # set estimator hyper-parameters and fit on the development set
        clf.set_params(**params)
        clf.fit(X_dev, y_dev)

        dev_predict = clf.predict(X_dev)
        eval_predict = clf.predict(X_eval)

        # sklearn metrics take (y_true, y_pred); explained variance is not
        # symmetric, so the order matters.
        records.append({
            'dev_set_score': clf.score(X_dev, y_dev),
            'eval_set_score': clf.score(X_eval, y_eval),
            'method_ids': method_id,
            'parameters': params,
            'dev_evs': metrics.explained_variance_score(y_dev, dev_predict),
            'eval_evs': metrics.explained_variance_score(y_eval, eval_predict),
            'dev_mae': metrics.mean_absolute_error(y_dev, dev_predict),
            'eval_mae': metrics.mean_absolute_error(y_eval, eval_predict),
            'dev_mse': metrics.mean_squared_error(y_dev, dev_predict),
            'eval_mse': metrics.mean_squared_error(y_eval, eval_predict),
            'dev_median_ae': metrics.median_absolute_error(y_dev, dev_predict),
            'eval_median_ae': metrics.median_absolute_error(y_eval, eval_predict),
        })

    analysis_set = pd.DataFrame(records, columns=columns)

    # re-rank and sort filtered methods by test-score (r**2):
    # position in the descending argsort becomes the rank of each row
    scores = np.array(analysis_set['eval_set_score'])
    order = scores.argsort()[::-1]
    ranks = np.empty(len(scores), int)
    ranks[order] = np.arange(len(scores))
    analysis_set['rank_test_score'] = ranks

    analysis_set = analysis_set.sort_values(by='rank_test_score')
    analysis_set = analysis_set.reset_index(drop=True)

    return analysis_set

In [4]:
# Pipeline: read the input file, run the grid search, save the raw search
# output, then filter and re-score it with quality_filter.
descriptor_csv = './input_files/rdkit_descriptors.csv'
save_path = './results_test'

X, y, labels = proj.set_input(descriptor_csv)
results = rg.auto_grid(X, y, labels)
proj.save_eval(save_path, results)
analysis_set = quality_filter(results)


  positive)
  estimator.fit(X_train, y_train, **fit_params)
  best_estimator.fit(X, y, **self.fit_params)


<pipemodules.preprocess at 0x105f045d0>

In [54]:
# Scratch inspection: a bare attribute lookup that evaluates to the
# DataFrame.to_dict method object; it does not change any notebook state.
pd.DataFrame.to_dict

In [55]:
# Collect every row whose method_ids string equals that of the row directly
# above it (label-based lookup, so the index is expected to run 0..n-1).
row_count = len(evaluation_results)
to_drop = [
    nxt for nxt in range(1, row_count)
    if str(evaluation_results.method_ids[nxt - 1])
    == str(evaluation_results.method_ids[nxt])
]

# Remove the repeats in one call, then restore rank order and a clean index.
evaluation_results.drop(to_drop, inplace=True)

evaluation_results.sort_values(by='rank_test_score', inplace=True)
evaluation_results.reset_index(drop=True, inplace=True)

In [None]:
    # Drop rows whose evaluation AND development scores are both within
    # `score_tol` of the most recently kept row.  `currind` always holds the
    # index label of that last kept row, and both comparisons use it, so a
    # candidate is never compared against a row that was itself dropped.
    # Label-based lookup: the index is expected to run 0..n-1.
    score_tol = 0.01
    currind = 0
    to_drop = []
    for i in range(0, len(evaluation_results)-1):
        same_eval = abs(evaluation_results.eval_set_score[currind]
                        - evaluation_results.eval_set_score[i+1]) < score_tol
        same_dev = abs(evaluation_results.dev_set_score[currind]
                       - evaluation_results.dev_set_score[i+1]) < score_tol
        if same_eval and same_dev:
            to_drop.append(i+1)
        else:
            # row i+1 is kept and becomes the new reference row
            currind = i+1

    evaluation_results.drop(to_drop, inplace=True)

    evaluation_results.sort_values(by='rank_test_score', inplace=True)
    evaluation_results.reset_index(drop=True, inplace=True)
    
    # Collect every row whose method_ids string equals that of the row
    # directly above it (label-based lookup, index expected to run 0..n-1).
    row_count = len(evaluation_results)
    to_drop = [
        nxt for nxt in range(1, row_count)
        if str(evaluation_results.method_ids[nxt - 1])
        == str(evaluation_results.method_ids[nxt])
    ]

    # Remove the repeats in one call, then restore rank order and index.
    evaluation_results.drop(to_drop, inplace=True)

    evaluation_results.sort_values(by='rank_test_score', inplace=True)
    evaluation_results.reset_index(drop=True, inplace=True)

In [16]:
# Create a loader object from the projecthandle module (imported as `proj`).
test = proj.file_loader()

In [17]:
# Load a previously saved results file into the loader.
# NOTE(review): the '.p' suffix suggests a pickle - if so, only load files
# from a trusted source; confirm against projecthandle.file_loader.
test.load_file('./results_files/results.p')

This file contains an evaluation and analysis set
5
hi
hi


In [18]:
# Capture the class of the loaded analysis results as a string.
string = str(test.analysis_results.__class__)


In [11]:
# Display the method object attached to the loader.
test.meth

<projecthandle.method_object at 0x362786690>