# Best PREFER models VS RF

After running TestSet_Bootstrapping.ipynb, a pickle file (`.pickle`) containing the final performances for each molecular representation should have been created. This notebook then plots the best PREFER results against the random forest (RF) results.

## Imports

In [None]:
import sys
%load_ext autoreload
# path to the main directory
path_to_PREFER = 'path_to/PREFER/'
# path to submodules
path_to_cddd = 'path_to/PREFER/prefer/model_based_representations/models/cddd/'
path_to_moler = 'path_to/PREFER/prefer/model_based_representations/models/molecule-generation/'
sys.path.append(path_to_PREFER)
sys.path.append(path_to_cddd)
sys.path.append(path_to_moler)
import warnings
warnings.filterwarnings('ignore')
from prefer.utils.filtering import *
import sys

In [None]:
from prefer.utils.post_processing_and_optimization_helpers import create_heat_map
from prefer.utils.automation import merge_table_metrics, data_preparation, generate_molecular_representations, run, create_comparison_table

### File where to find the final results

In [None]:
import pickle

    
# Path of the results pickle; it is opened relative to the current working directory.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so only
# load a file that you produced yourself.
name = "final_dict_['publicSolubility', 'publicLogD'].pickle"
with open(name, 'rb') as handle:
    # dict_ is indexed below with the top-level keys 'RF' and 'autosklearn'.
    dict_ = pickle.load(handle)


In [None]:
def clean_RF_results(dict_, values_per_representation=6):
    """Keep only the RF values that belong to the FINGERPRINTS representation.

    For every assay in ``dict_['autosklearn']`` the position of the entry
    ``'FINGERPRINTS'`` is looked up, and ``dict_['RF'][assay]`` is replaced by
    the chunk of ``values_per_representation`` consecutive values found at
    that position.
    Presumably the RF values are stored as consecutive chunks, one per
    representation, in the same order as the autosklearn entries -- confirm
    against the notebook that writes the pickle.

    Parameters
    ----------
    dict_ : dict
        Results dictionary with the top-level keys ``'autosklearn'`` and
        ``'RF'``, both indexed by assay name.
    values_per_representation : int, optional
        Number of consecutive RF values stored per representation. Defaults
        to 6, the value that used to be hard-coded.

    Returns
    -------
    dict
        The same ``dict_`` object. It is modified in place, so calling this
        function twice on the same dictionary slices the already-sliced lists
        again.

    Raises
    ------
    KeyError
        If an assay that has a FINGERPRINTS entry is missing from
        ``dict_['RF']``.
    """
    reference = dict_['autosklearn']

    # Position of the fingerprints representation for each assay. Assays
    # without a FINGERPRINTS entry are skipped and left untouched.
    fingerprint_positions = {}
    for assay, representations in reference.items():
        for index, repr_ in enumerate(representations):
            if repr_ == 'FINGERPRINTS':
                fingerprint_positions[assay] = index

    for assay, position in fingerprint_positions.items():
        start = position * values_per_representation
        stop = start + values_per_representation
        dict_['RF'][assay] = dict_['RF'][assay][start:stop]
    return dict_

In [None]:
# Reduce the RF results to the fingerprints chunk for each assay.
# clean_RF_results modifies dict_ in place, so re-running this cell slices the
# already-sliced lists a second time; reload the pickle before running it again.
dict_ = clean_RF_results(dict_)

In [None]:
import numpy as np

# The rest of the notebook refers to the cleaned results as final_dict.
final_dict = dict_
rf_results = final_dict['RF']
autosklearn_results = final_dict['autosklearn']

# Assay names in the iteration order of each results table. Both lists are
# kept so that their order can be compared by eye in the next cell.
collect_names = list(rf_results)
collect_names_confirmation = list(autosklearn_results)

# Random forest baseline: one mean and one standard deviation per assay.
collect_means_RF = [np.mean(rf_results[assay]) for assay in collect_names]
collect_stds_RF = [np.std(rf_results[assay]) for assay in collect_names]

# PREFER (autosklearn) results: mean per assay for each representation.
collect_means_autosklearn_fp = [
    np.mean(autosklearn_results[assay]['FINGERPRINTS'])
    for assay in collect_names_confirmation
]
collect_means_autosklearn_2dd = [
    np.mean(autosklearn_results[assay]['DESCRIPTORS2D'])
    for assay in collect_names_confirmation
]
collect_means_autosklearn_cddd = [
    np.mean(autosklearn_results[assay]['CDDD'])
    for assay in collect_names_confirmation
]
collect_means_autosklearn_moler = [
    np.mean(autosklearn_results[assay]['MOLER'])
    for assay in collect_names_confirmation
]

# PREFER (autosklearn) results: standard deviation per assay for each representation.
collect_stds_autosklearn_fp = [
    np.std(autosklearn_results[assay]['FINGERPRINTS'])
    for assay in collect_names_confirmation
]
collect_stds_autosklearn_2dd = [
    np.std(autosklearn_results[assay]['DESCRIPTORS2D'])
    for assay in collect_names_confirmation
]
collect_stds_autosklearn_cddd = [
    np.std(autosklearn_results[assay]['CDDD'])
    for assay in collect_names_confirmation
]
collect_stds_autosklearn_moler = [
    np.std(autosklearn_results[assay]['MOLER'])
    for assay in collect_names_confirmation
]

In [None]:
# Show the assay names collected from the RF table and from the autosklearn table.
# Later cells pair RF and PREFER values by position, so both lists must be in the same order.
collect_names, collect_names_confirmation

In [None]:
representations_names = ['FINGERPRINTS', 'DESCRIPTORS2D', 'CDDD', 'MOLER']

# One column per representation, listed in the same order as representations_names.
mean_columns = [
    collect_means_autosklearn_fp,
    collect_means_autosklearn_2dd,
    collect_means_autosklearn_cddd,
    collect_means_autosklearn_moler,
]
std_columns = [
    collect_stds_autosklearn_fp,
    collect_stds_autosklearn_2dd,
    collect_stds_autosklearn_cddd,
    collect_stds_autosklearn_moler,
]

best_values_mean = []
best_values_std = []
best_representation = []
# Transposing the columns yields, for each assay, its four means and four stds.
for assay_means, assay_stds in zip(zip(*mean_columns), zip(*std_columns)):
    # The representation with the highest mean wins; its std is kept alongside.
    winner = np.argmax(assay_means)
    best_values_mean.append(assay_means[winner])
    best_values_std.append(assay_stds[winner])
    best_representation.append(representations_names[winner])

In [None]:
# Show which representation had the highest mean for each assay.
best_representation

In [None]:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import matplotlib.pylab as pylab
# Font sizes and default figure size applied to matplotlib's rcParams.
params = {
    'legend.fontsize': 'x-large',
    'figure.figsize': (25, 5),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
}
pylab.rcParams.update(params)

def plot_each_res(bars1, bars2, yer1, yer2, collect_names):
    """Draw a grouped bar chart of RandomForest vs PREFER and save it as PNG.

    NOTE: a later cell of this notebook defines another ``plot_each_res`` with
    a different signature; once that cell has run, it replaces this function.

    Parameters
    ----------
    bars1 : sequence of float
        Bar heights for the 'RandomForest' series.
    bars2 : sequence of float
        Bar heights for the 'PREFER' series.
    yer1 : sequence of float
        Error bar sizes for ``bars1``.
    yer2 : sequence of float
        Error bar sizes for ``bars2``.
    collect_names : sequence of str
        Tick labels, one per pair of bars.

    Side effects
    ------------
    Updates ``plt.rcParams`` ('figure.figsize' and 'font.size'), writes
    ``./comparison_with_baseline.png`` and shows the figure.
    """
    plt.rcParams["figure.figsize"] = (15, 10)

    # width of the bars
    barWidth = 0.3

    # x positions: the PREFER bars sit directly to the right of the RF bars
    r1 = np.arange(len(bars1))
    r2 = [x + barWidth for x in r1]

    barsRF = plt.bar(r1, bars1, width=barWidth, yerr=yer1, label='RandomForest')
    barsAS = plt.bar(r2, bars2, width=barWidth, yerr=yer2, label='PREFER')

    # Bars at positions 1, 2, 4 get the light shades and bars at 0, 3, 5 the
    # dark ones (presumably to tell the two task types apart -- confirm).
    # Positions beyond the number of plotted bars are skipped; indexing them
    # directly raised IndexError whenever fewer than six assays were plotted.
    n_bars = min(len(barsRF), len(barsAS))
    colour_groups = (
        ((1, 2, 4), '#9ecae1', '#fc9272'),
        ((0, 3, 5), '#3182bd', '#de2d26'),
    )
    for positions, rf_colour, prefer_colour in colour_groups:
        for i in positions:
            if i < n_bars:
                barsRF[i].set_color(rf_colour)
                barsAS[i].set_color(prefer_colour)

    # general layout: each tick is centred between the two bars of an assay
    plt.xticks([r + barWidth/2 for r in range(len(bars1))], collect_names, size=21)
    plt.ylabel('Evaluation Metrics', size=30)
    plt.legend()
    plt.rcParams['font.size'] = '50'
    plt.savefig('./comparison_with_baseline.png', bbox_inches='tight', transparent=True)
    # Show graphic
    plt.show()

In [None]:
# Output lists for the regression assays.
collect_names_regression = []
bar_regression_means_rf = []
bar_regression_means_prefer = []
bar_regression_stds_rf = []
bar_regression_stds_prefer = []

# Output lists for every other assay, treated as classification.
collect_names_classification = []
bar_classification_means_rf = []
bar_classification_means_prefer = []
bar_classification_stds_rf = []
bar_classification_stds_prefer = []

# Each bucket lists its targets in the same field order as the zipped record:
# (name, RF mean, PREFER mean, RF std, PREFER std).
regression_bucket = (
    collect_names_regression,
    bar_regression_means_rf,
    bar_regression_means_prefer,
    bar_regression_stds_rf,
    bar_regression_stds_prefer,
)
classification_bucket = (
    collect_names_classification,
    bar_classification_means_rf,
    bar_classification_means_prefer,
    bar_classification_stds_rf,
    bar_classification_stds_prefer,
)

for name, *values in zip(collect_names, collect_means_RF, best_values_mean, collect_stds_RF, best_values_std):
    # Only 'publicLogD' is routed to the regression lists.
    bucket = regression_bucket if name in ['publicLogD'] else classification_bucket
    for target, value in zip(bucket, [name, *values]):
        target.append(value)

In [None]:
# Apply the font-size and figure-size settings to matplotlib's rcParams.
params = {
    'legend.fontsize': 'x-large',
    'figure.figsize': (25, 5),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
}
pylab.rcParams.update(params)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import matplotlib.pylab as pylab


def plot_each_res(bar_regression_means_prefer, bar_classification_means_prefer, bar_regression_stds_prefer, bar_classification_stds_prefer, collect_names_regression, 
                 bar_regression_means_rf, bar_classification_means_rf, bar_regression_stds_rf, bar_classification_stds_rf, collect_names_classification):
    """Plot RandomForest vs PREFER in two stacked panels and save the figure.

    The upper panel shows the regression assays (y label 'R2'), the lower
    panel the classification assays (y label 'ROC_AUC'). In each panel the
    PREFER bar is drawn directly to the right of the RandomForest bar, with
    the standard deviations as error bars.

    Parameters
    ----------
    bar_regression_means_prefer, bar_regression_stds_prefer : sequence of float
        PREFER bar heights and error bar sizes for the regression panel.
    bar_classification_means_prefer, bar_classification_stds_prefer : sequence of float
        PREFER bar heights and error bar sizes for the classification panel.
    bar_regression_means_rf, bar_regression_stds_rf : sequence of float
        RandomForest bar heights and error bar sizes for the regression panel.
    bar_classification_means_rf, bar_classification_stds_rf : sequence of float
        RandomForest bar heights and error bar sizes for the classification panel.
    collect_names_regression, collect_names_classification : sequence of str
        Tick labels for the regression and the classification panel.

    Side effects
    ------------
    Updates ``plt.rcParams`` ('figure.figsize' and 'font.size'), writes
    ``./comparison_with_baseline2.png`` and shows the figure.
    """
    # width of the bars, shared by both panels
    bar_width = 0.3

    def draw_panel(ax, rf_means, rf_stds, prefer_means, prefer_stds, tick_names,
                   rf_colour, prefer_colour, metric_label, legend_loc):
        # Draw one RF/PREFER grouped bar panel on ``ax`` and style its axes.
        rf_positions = np.arange(len(rf_means))
        prefer_positions = [pos + bar_width for pos in rf_positions]

        ax.bar(rf_positions, rf_means, width=bar_width, yerr=rf_stds,
               label='RandomForest', color=rf_colour)
        ax.bar(prefer_positions, prefer_means, width=bar_width, yerr=prefer_stds,
               label='PREFER', color=prefer_colour)

        # general layout: each tick is centred between the two bars of an assay
        tick_positions = [idx + bar_width/2 for idx in range(len(rf_means))]
        ax.set_xticks(tick_positions, tick_names, size=30)
        ax.set_ylabel(metric_label, size=25)
        ax.legend(fontsize=25, loc=legend_loc)
        ax.figure.set_size_inches(15, 15)
        ax.tick_params(axis='both', which='major', labelsize=25)

    fig, axs = plt.subplots(2)

    # Assigned after the figure already exists; the size of this figure is set
    # explicitly through set_size_inches inside draw_panel.
    plt.rcParams["figure.figsize"] = (30, 25)

    # regression (dark shades)
    draw_panel(axs[0], bar_regression_means_rf, bar_regression_stds_rf,
               bar_regression_means_prefer, bar_regression_stds_prefer,
               collect_names_regression, '#3182bd', '#de2d26', 'R2', 'upper left')

    # classification (light shades)
    draw_panel(axs[1], bar_classification_means_rf, bar_classification_stds_rf,
               bar_classification_means_prefer, bar_classification_stds_prefer,
               collect_names_classification, '#9ecae1', '#fc9272', 'ROC_AUC', 'lower left')

    plt.rcParams['font.size'] = '20'
    plt.savefig(f'./comparison_with_baseline2.png', bbox_inches='tight', transparent=True)
    # Show graphic
    plt.show()

In [None]:
# Show the PREFER and RF regression means side by side (notebook cell output).
bar_regression_means_prefer, bar_regression_means_rf

In [None]:
# Draw and save the two-panel comparison (regression on top, classification below).
plot_each_res(
    bar_regression_means_prefer=bar_regression_means_prefer,
    bar_classification_means_prefer=bar_classification_means_prefer,
    bar_regression_stds_prefer=bar_regression_stds_prefer,
    bar_classification_stds_prefer=bar_classification_stds_prefer,
    collect_names_regression=collect_names_regression,
    bar_regression_means_rf=bar_regression_means_rf,
    bar_classification_means_rf=bar_classification_means_rf,
    bar_regression_stds_rf=bar_regression_stds_rf,
    bar_classification_stds_rf=bar_classification_stds_rf,
    collect_names_classification=collect_names_classification,
)