# Analysis of the small data experiments and comparison with the FS-Mol results

This notebook can be used to analyze the results obtained by running the run_PREFER_smalldata_example.ipynb notebook and to compare them with the FS-Mol results, as described [here](https://github.com/microsoft/FS-Mol). Before running the cells, please download the results of the FS-Mol paper (.csv files) stored [here](https://github.com/microsoft/FS-Mol/tree/main/baselines).

## WARNING:

In order to run this notebook, please:
1) use the prefer-environment
2) unpack the git submodules within the PREFER repo as described in the README.txt
3) change the config files as described in the README.txt

## Imports

In [None]:
import sys
import numpy as np
import pandas as pd
import os
%load_ext autoreload
# Path to the main PREFER directory; replace the placeholder with the local checkout.
path_to_PREFER = 'path_to/PREFER/'
# Paths to the git submodules; replace the placeholders with the local checkouts.
path_to_cddd = 'path_to/cddd/'
path_to_moler = 'path_to/molecule-generation/'
# Make the three directories importable before the `prefer` imports that follow.
sys.path.append(path_to_PREFER)
sys.path.append(path_to_cddd)
sys.path.append(path_to_moler)
import warnings
warnings.filterwarnings('ignore')
from prefer.utils.filtering import *
import sys

In [None]:
from prefer.utils.post_processing_and_optimization_helpers import create_heat_map
from prefer.utils.automation import merge_table_metrics, data_preparation, generate_molecular_representations, run, create_comparison_table

## UTILS FUNCTIONS

In [None]:
# Only the following assays have been used for comparison.
# The identifiers are used in this notebook both to build the csv file names
# that are loaded and to filter the merged result tables by file name.
common_assays = ['CHEMBL1243967',
 'CHEMBL1613800',
 'CHEMBL1613898',
 'CHEMBL1614027',
 'CHEMBL1614503',
 'CHEMBL1738395',
 'CHEMBL1738579',
 'CHEMBL1963715',
 'CHEMBL1963756',
 'CHEMBL1963824',
 'CHEMBL1963827',
 'CHEMBL1963969',
 'CHEMBL2218957',
 'CHEMBL2218989',
 'CHEMBL2219050',
 'CHEMBL2219070',
 'CHEMBL2219102',
 'CHEMBL2219104',
 'CHEMBL2219113',
 'CHEMBL2219115',
 'CHEMBL2219146',
 'CHEMBL2219159',
 'CHEMBL2219180',
 'CHEMBL2219194',
 'CHEMBL2219203',
 'CHEMBL2219211',
 'CHEMBL2219242',
 'CHEMBL2219244',
 'CHEMBL2219283',
 'CHEMBL2219297',
 'CHEMBL2219308',
 'CHEMBL2219363',
 'CHEMBL3214944',
 'CHEMBL3431932',
 'CHEMBL3431933',
 'CHEMBL3706128',
 'CHEMBL3707783',
 'CHEMBL641707',
 'CHEMBL657032',
 'CHEMBL819742']

In [None]:
import yaml
# Directory holding the per-assay csv files converted from the zip files with
# the extract_zipped_files.ipynb notebook.
path_to_dfs = 'path_to/fs-mol/csv_files/'
path_to_df_list = os.listdir(path_to_dfs)
# For every assay used in the comparison, record the number of rows and the
# sum of the `Property` column divided by the number of rows, rounded to two
# decimals (the positive-class fraction if `Property` is a 0/1 label -- TODO confirm).
dimensions = []
ratios = []
for common_assay in common_assays:
    # os.path.join keeps the path valid whether or not path_to_dfs ends with a separator
    df_tmp = pd.read_csv(os.path.join(path_to_dfs, f'{common_assay}.csv'))
    num_rows = df_tmp.shape[0]
    dimensions.append(num_rows)
    ratios.append(np.round(df_tmp.Property.sum() / num_rows, 2))

In [None]:
import matplotlib.pyplot as plt
# Histogram of the per-assay row counts, drawn on the current axes.
size_axes = plt.gca()
size_axes.hist(dimensions, bins='auto')
size_axes.set_title('Data dimensions distribution')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Histogram of the per-assay class ratios, drawn on the current axes.
ratio_axes = plt.gca()
ratio_axes.hist(ratios, bins='auto')
ratio_axes.set_title('Data classes ratio distribution')
plt.show()

In [None]:
# Go into the merged folders and, for every training-set size, compute the
# per-metric mean and spread across the assays used for the comparison.
import pandas as pd

import os
merged_mean = dict()
merged_std = dict()
limit_defs = ['16', '32', '64', '128', '256'] # number of samples used in the training set

path_to_mergeds_dict = {}

for limit_def in limit_defs:
    merged_folder = f'./merged_folder_limit_def_{limit_def}'
    path_to_mergeds = os.listdir(merged_folder)
    file_name_list = []
    assay_tables = []
    for merged in path_to_mergeds:
        # the assay id is the last '_'-separated token of the file name, up to the first '.'
        file_name = merged.split('_')[-1].split('.')[0]
        file_name_list.append(file_name)

        # skip hidden files and assays that are not part of the comparison
        if merged.startswith('.') or file_name not in common_assays:
            continue
        df = pd.read_csv(os.path.join(merged_folder, merged))
        # the first three rows of each merged table are dropped
        # (presumably rows that do not hold the metrics of interest -- TODO confirm)
        assay_tables.append(df.iloc[3:])

    num_folder = len(assay_tables)
    if num_folder == 0:
        # fail with an explicit message instead of an AttributeError on an empty frame
        raise ValueError(
            f'No merged table for any of the common assays was found in {merged_folder}'
        )

    # NOTE(review): this records the parsed name of every entry in the folder,
    # including the ones skipped above -- confirm this is intended.
    path_to_mergeds_list = list(file_name_list)

    # collect all the deltaAUPRC for each merged table; concatenating once avoids
    # re-copying the accumulated frame for every file (reversed to keep the row
    # order produced by prepending each table)
    df_concat = pd.concat(assay_tables[::-1])
    df_concat = df_concat.set_index('Metrics').astype(float)
    by_row_index = df_concat.groupby(df_concat.index)
    path_to_mergeds_dict[limit_def] = path_to_mergeds_list
    merged_mean[limit_def] = by_row_index.mean()
    # sample standard deviation divided by sqrt(number of tables), i.e. the standard
    # error of the mean when every metric appears once per table
    merged_std[limit_def] = by_row_index.std()/np.sqrt(num_folder)

In [None]:
# For every training-set size, build the deltaAUPRC comparison table of the
# aggregated means and of the aggregated spreads; only the second element
# returned by create_comparison_table is kept.
tmp_dict_mean, tmp_dict_std = dict(), dict()
for limit_def in limit_defs:
    for target_dict, aggregated in ((tmp_dict_mean, merged_mean), (tmp_dict_std, merged_std)):
        _, target_dict[limit_def] = create_comparison_table(
            aggregated[limit_def], metric_classification="deltaAUPRC"
        )

In [None]:
# Collect, for every training-set size, the best-performing representation
# (highest mean deltaAUPRC) together with its spread, then save the summary.
import pickle
import os

# name of the directory where the summary is stored
path = "delta_performance_folder"

# exist_ok=True lets the cell be re-run without a spurious failure message;
# genuine failures (e.g. missing permissions) raise here instead of at open() below
os.makedirs(path, exist_ok=True)

exp_names = []
metric = 'deltaAUPRC'
delta_performance = dict()
for limit_def in limit_defs:
    delta_performance[limit_def] = dict()
    # NOTE(review): every experiment writes the same keys below, so only the last
    # experiment of each training size is kept -- confirm a single one is expected.
    for exp_name in tmp_dict_mean[limit_def].keys():
        exp_names.append(exp_name)

        # reset the running best for each experiment so that values from a
        # previous experiment or training size can never leak into this one
        best_metric_value = float('-inf')
        best_metric_std = None
        best_representation = None
        for representation in tmp_dict_mean[limit_def][exp_name].keys():
            current_metric_value = tmp_dict_mean[limit_def][exp_name][representation][exp_name]
            current_metric_std = tmp_dict_std[limit_def][exp_name][representation][exp_name]
            if current_metric_value > best_metric_value:
                best_metric_value = current_metric_value
                best_metric_std = current_metric_std
                best_representation = representation

        delta_performance[limit_def]['experiment_name'] = exp_name
        delta_performance[limit_def]['metric'] = metric
        delta_performance[limit_def]['prefer_model_performance_mean'] = best_metric_value
        delta_performance[limit_def]['prefer_model_performance_std'] = best_metric_std
        delta_performance[limit_def]['prefer_model_representation'] = best_representation


with open(f'{path}/delta_performance_ALL.pkl', 'wb') as f:
    pickle.dump(delta_performance, f)

In [None]:
# Values to plot: best mean and its spread for each training-set size, in
# increasing size order.
model_name = ['16', '32', '64', '128', '256']

vect_means = [
    delta_performance[train_size]['prefer_model_performance_mean']
    for train_size in model_name
]

vect_stds = [
    delta_performance[train_size]['prefer_model_performance_std']
    for train_size in model_name
]

In [None]:
import matplotlib.pyplot as plt
# Error-bar plot of the PREFER performance against the training-set size.
plt.style.use('fivethirtyeight')
plt.rcParams["figure.figsize"] = (8,5.5)

# one x position per training size, shared by the data points and the tick
# labels so the two cannot drift apart if the list of sizes changes
x_positions = np.arange(len(model_name))
plt.errorbar(x_positions, vect_means,
             yerr=vect_stds, fmt='', linewidth=3, label=model_name)

plt.xticks(x_positions, list(model_name))
plt.grid(color = 'grey', linestyle = '--', linewidth = 0.5)

plt.ylabel('∆AUPRC', fontdict=None)
plt.ylim(0,0.4)
plt.xlabel('Training size', fontdict=None)
plt.savefig('small_data_performances2.png', bbox_inches='tight', transparent=True)



## Comparing PREFER to FS-Mol

Before running the cells, please download the results of the FS-Mol paper (.csv files) stored [here](https://github.com/microsoft/FS-Mol/tree/main/baselines).

In [None]:
# Load the FS-Mol baseline results and, for every baseline and training-set
# size, compute the mean and the standard error of (score - fraction_positive_train).
import pandas as pd
import os
limit_defs = ['16', '32', '64', '128', '256']

path_to_fsmol_res = 'path_to/fs-results/' # path to the FS-Mol results as reported in the git repo
# keep only csv files (other entries cannot be parsed) and sort them so that the
# order of the baselines, and hence their plot colours, is reproducible
path_to_fsresults = sorted(
    entry for entry in os.listdir(path_to_fsmol_res) if entry.endswith('.csv')
)
vect_list_means = dict()
vect_list_stds = dict()
for res in path_to_fsresults:
    # baseline name = file name without its extension
    name = os.path.splitext(res)[0]
    vect_list_means[name] = [] # for each limit def one elem
    vect_list_stds[name] = []
    df = pd.read_csv(os.path.join(path_to_fsmol_res, res))
    for limit_def in limit_defs:
        # Entries are parsed as strings whose part before '+' is the score
        # (presumably 'mean+/-std' -- TODO confirm); float entries are skipped
        # (presumably NaN for missing results -- TODO confirm).
        elems_per_assay_per_limit_def = [
            float(elem.split('+')[0]) - fr_train
            for elem, fr_train in zip(df[f'{limit_def}_train'].values,
                                      df.fraction_positive_train.values)
            if not isinstance(elem, float)
        ]
        num_elems = len(elems_per_assay_per_limit_def)
        vect_list_means[name].append(np.mean(elems_per_assay_per_limit_def))
        # NOTE(review): np.std defaults to the population standard deviation (ddof=0),
        # whereas pandas' .std() defaults to ddof=1 -- confirm which one is intended.
        vect_list_stds[name].append(np.std(elems_per_assay_per_limit_def)/np.sqrt(num_elems))

In [None]:
import seaborn as sns
# seaborn's "colorblind" palette; the comparison plot indexes it to give each
# FS-Mol baseline its own colour.
colors = sns.color_palette("colorblind")

In [None]:
import matplotlib.pyplot as plt
# Error-bar plot comparing every FS-Mol baseline with PREFER across training sizes.
plt.style.use('fivethirtyeight')
plt.rcParams["figure.figsize"] = (8,5.5)

model_name_sd = limit_defs
# one x position per training size, shared by all curves and by the tick labels
x_positions = np.arange(len(model_name_sd))

for index, key in enumerate(vect_list_means.keys()):
    vect_means_sd = vect_list_means[key]
    vect_stds_sd = vect_list_stds[key]
    # wrap around the palette so that more baselines than colours cannot raise IndexError
    plt.errorbar(x_positions, vect_means_sd,
                 yerr=vect_stds_sd, fmt='', linewidth=3, label=key,
                 color = colors[index % len(colors)], alpha = 0.7)

plt.errorbar(x_positions, vect_means,
             yerr=vect_stds, fmt='', linewidth=3, label = 'PREFER', color = 'black')

# axis decoration does not depend on the individual curves, so it is applied once
plt.xticks(x_positions, list(model_name_sd))
plt.grid(color = 'grey', linestyle = '--', linewidth = 0.5)
plt.legend(bbox_to_anchor=(1.6, 0.2),loc="upper right")
plt.ylabel('∆AUPRC', fontdict=None)
plt.ylim(0,0.4)
plt.xlabel('Training size', fontdict=None)
plt.savefig('PREFER4small_data_comparison.png', bbox_inches='tight', transparent=True)
plt.show()