# Run PREFER for [FS-Mol](https://github.com/microsoft/FS-Mol) test data

#### This notebook can be used to run PREFER on the FS-Mol test data, after extracting the data using the "extract_zipped_files.ipynb" notebook

## WARNING:

In order to run this notebook, please:
1) Use the prefer-environment
2) Unpack the git submodules within the PREFER repo as described in the README.txt
3) Change the config files as described in the README.txt

## IMPORTS

In [None]:
import sys
import numpy as np
%load_ext autoreload
# path to the main directory (placeholder: replace with the local location of the PREFER repo)
path_to_PREFER = 'path_to/PREFER/'
# path to submodules (placeholders: replace with the local locations of the cddd and molecule-generation checkouts)
path_to_cddd = 'path_to/cddd/'
path_to_moler = 'path_to/molecule-generation/'
# extend the module search path with the three locations above so the imports in the following cells can be resolved
sys.path.append(path_to_PREFER)
sys.path.append(path_to_cddd)
sys.path.append(path_to_moler)
import warnings
warnings.filterwarnings('ignore')
from prefer.utils.filtering import *
import sys

In [None]:
from prefer.utils.post_processing_and_optimization_helpers import create_heat_map
from prefer.utils.automation import merge_table_metrics, data_preparation, generate_molecular_representations, run, create_comparison_table

## PREPARING PATHS

In [None]:
# Placeholder paths used later to build the shell commands that compute the model-based molecular representations
path_to_cddd_model = 'path_to/cddd/default_model'
# NOTE(review): this placeholder is identical to the CDDD one above; presumably it should point to the MoLeR model instead - confirm
path_to_moler_model = 'path_to/cddd/default_model'
path_to_compute_model_based_representations = '../compute_model_based_representations.py' # path to the python script compute_model_based_representations.py
folder_path_csv = None # insert here the path to the folder containing the csv files generated by the extract_zipped_files.ipynb notebook

## UTILS FUNCTIONS

In [None]:
import yaml


def set_file(prefer_args, file_name, limit_def):
    """Rewrite the PREFER YAML config so that it targets a new dataset and training-set size.

    The file at ``prefer_args`` is loaded, three of its entries are overwritten
    and the result is dumped back to the same path:

    * ``path_to_df``      -> ``file_name`` inside the notebook-level ``folder_path_csv``
    * ``experiment_name`` -> ``small_data_<limit_def>``
    * ``limit_def``       -> ``limit_def``

    Args:
        prefer_args: path to the YAML configuration file (modified in place).
        file_name: name of the csv file, relative to ``folder_path_csv``.
        limit_def: value stored under ``limit_def`` and used in the experiment name.

    Raises:
        ValueError: if the notebook-level ``folder_path_csv`` has not been set.
    """
    import os

    if folder_path_csv is None:
        raise ValueError('folder_path_csv is not set: insert the path to the folder with the csv files')

    # Context manager so the read handle is closed before the file is reopened for writing
    with open(prefer_args) as a_yaml_file:
        parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

    # os.path.join gives the same result as plain concatenation when the folder ends
    # with a separator, and inserts the missing separator when it does not
    parsed_yaml_file['path_to_df'] = os.path.join(folder_path_csv, file_name)
    parsed_yaml_file['experiment_name'] = f'small_data_{limit_def}'
    parsed_yaml_file['limit_def'] = limit_def

    with open(prefer_args, 'w') as f:
        yaml.dump(parsed_yaml_file, f)

In [None]:

from prefer.molecule_representations.fingerprints_representations_builder import (
    FingerprintsRepresentationsBuilder,
)

from prefer.src.vector_molecule_representations import VectorMoleculeRepresentations
from prefer.molecule_representations.descriptors2D_representations_builder import (
    Descriptors2DRepresentationsBuilder,
)


def PREFER_job_smalldata(data_info, limit_def):
    """Build the molecular representations for one dataset and benchmark them with PREFER.

    2D-descriptor and fingerprint representations are computed from the prepared
    dataframe; every path listed under
    ``data_info['list_of_model_based_representations_paths']`` is loaded as an
    additional representation. All of them are handed to ``run`` and the
    resulting metrics are merged, written to ``merged.csv`` in the working
    directory and shown as a heat map.

    Args:
        data_info: dictionary describing the dataset and the experiment; the keys read
            here are ``split_type``, ``experiment_name``,
            ``list_of_model_based_representations_paths``,
            ``temporal_info_column_name``, ``problem_type`` and ``model_instance``.
        limit_def: value forwarded as ``limit_def`` to every representation builder.

    Returns:
        Tuple ``(bench_list, merged, dir_destination)``: the first and last come
        from ``run``, ``merged`` is the output of ``merge_table_metrics``.

    Raises:
        ValueError: if a temporal split is requested without a temporal column name.
    """
    df = data_preparation(data_info)
    split_type = data_info['split_type']
    experiment_name = data_info['experiment_name']
    model_based_paths = data_info['list_of_model_based_representations_paths']

    # A temporal split cannot be performed without the name of the time column
    temporal_info_column_name = data_info['temporal_info_column_name']
    if data_info['split_type'] == 'temporal' and not temporal_info_column_name:
        raise ValueError('ERROR: if time split is required then you need to provide the temporal_info_column_name')

    # 2D descriptors
    descriptors_builder = Descriptors2DRepresentationsBuilder(limit_def=limit_def)
    _2dd = descriptors_builder.build_representations(df, split_type=split_type)
    _2dd.experiment_name = experiment_name

    # Fingerprints
    fingerprints_builder = FingerprintsRepresentationsBuilder(limit_def=limit_def)
    fingerprints = fingerprints_builder.build_representations(df, split_type=split_type)
    fingerprints.experiment_name = experiment_name

    representations = {
        "FINGERPRINTS": fingerprints,
        "2DDESCRIPTORS": _2dd,
    }
    import pandas as pd

    # Model-based representations were computed beforehand and are only loaded here
    for path in model_based_paths or []:
        # The representation name is the text before the first underscore,
        # stripped of dots and slashes
        model_name = path.split("_")[0].replace(".", "").replace("/", "")
        loader = VectorMoleculeRepresentations(
            df=pd.DataFrame(), representation_name="", split_type=" ", limit_def=limit_def
        )
        loaded_representation = loader.load(path)
        loaded_representation.experiment_name = experiment_name
        loaded_representation.representation_name = model_name
        loaded_representation.split_type = split_type
        representations[model_name] = loaded_representation

    # Run PREFER
    bench_list, dir_destination = run(
        representations,
        problem_type=data_info['problem_type'],
        model_instance=data_info['model_instance'],
    )

    # Evaluate results
    merged = merge_table_metrics(bench_list)
    merged.to_csv('merged.csv')
    experiments_dict, tmp_dict = create_comparison_table(merged, metric_classification="deltaAUPRC")
    create_heat_map(experiments_dict, tmp_dict)
    return bench_list, merged, dir_destination
    
    
    
    
        
    

In [None]:

def run_prefer_all(data_info, limit_def, prefer_args):
    # Preparing the shell commands to compute the data based molecular representations
    model_name = 'CDDD'
    run_commands = f'conda activate cddd-env-prefer-light; PYTHONPATH="{path_to_cddd}:{path_to_moler}:{path_to_PREFER}:$PYTHONPATH"; export PYTHONPATH; python {path_to_compute_model_based_representations} --prefer_args {prefer_args} --path_to_model {path_to_cddd_model} --model_name {model_name}'
    !{run_commands}
    import datetime
    from os import listdir
    from os.path import isfile, join
    cdddpath = f'./{model_name}_representations_{experiment_name}'
    files = [f for f in listdir(cdddpath) if isfile(join(cdddpath, f))]
    collect_dates = []
    mapping = {}
    for file in files:
        date = file.split('_')[-1]
        date = date.replace('.pkl','')
        date = datetime.datetime.strptime(date, '%Y%m%d-%H%M%S')
        collect_dates.append(date)
        mapping[date] = file

    collect_dates.sort()
    data_info['list_of_model_based_representations_paths'].append(f'{cdddpath}/{mapping[collect_dates[-1]]}')
    model_name = 'MOLER'
    run_commands = f'conda activate moler-env-prefer-light; PYTHONPATH="{path_to_cddd}:{path_to_moler}:{path_to_PREFER}:$PYTHONPATH"; export PYTHONPATH; python {path_to_compute_model_based_representations} --prefer_args {prefer_args} --path_to_model {path_to_moler_model} --model_name {model_name}'
    !{run_commands}
    # find path to the new MOLER representation found
    import datetime
    from os import listdir
    from os.path import isfile, join
    molerpath = f'./{model_name}_representations_{experiment_name}'
    files = [f for f in listdir(molerpath) if isfile(join(molerpath, f))]
    collect_dates = []
    mapping = {}
    for file in files:
        date = file.split('_')[-1]
        date = date.replace('.pkl','')
        date = datetime.datetime.strptime(date, '%Y%m%d-%H%M%S')
        collect_dates.append(date)
        mapping[date] = file

    collect_dates.sort()
    data_info['list_of_model_based_representations_paths'].append(f'{molerpath}/{mapping[collect_dates[-1]]}')
    
    
    _, merged, dir_destination = PREFER_job_smalldata(data_info, limit_def)
    # save merged
    if (not dir_destination.endswith('/')):
        dir_destination = dir_destination+'/'
    name = data_info['experiment_name']
    file_name = data_info['path_to_data'].split('/')[-1]
    file_name = file_name.replace('.csv', '')

    merged.to_csv(f'{dir_destination}_merged_autosklearn_res_{name}_{file_name}.csv')
    
    # collect all the final merged table in one folder
    path = f"merged_folder_limit_def_{limit_def}"

    try:
        os.mkdir(path)
    except OSError:
        print ("Creation of the directory %s failed" % path)
    else:
        print ("Successfully created the directory %s " % path)

    exp_name = data_info['experiment_name']
    merged.to_csv(f'{path}/merged_autosklearn_res_{exp_name}_{file_name}.csv')

## MAIN CODE

In [None]:
# Read the yaml config file to collect the settings later copied into data_info
import yaml
import json


prefer_args = '../config_files/config_PREFER_smalldata.yaml'  # path to your yaml file. An example for 16 samples is stored in the config_files folder
# Context manager so the file handle is closed as soon as the config is parsed
with open(prefer_args) as a_yaml_file:
    parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

path_to_dfs = folder_path_csv
experiment_name = parsed_yaml_file["experiment_name"]
id_column_name = parsed_yaml_file["id_column_name"]
smiles_column_name = parsed_yaml_file["smiles_column_name"]
properties_column_name = parsed_yaml_file["properties_column_name_list"]
problem_type = parsed_yaml_file["problem_type"]
splitting_strategy = parsed_yaml_file["splitting_strategy"]

# Optional entries: None when the config does not define them
model_instance = parsed_yaml_file.get("model_instance")
temporal_info_column_name = parsed_yaml_file.get("temporal_info_column_name")

In [None]:
import os

# os.listdir(None) would silently list the current directory, so fail early with a clear message
if path_to_dfs is None:
    raise ValueError('folder_path_csv is not set: insert the path to the folder with the csv files')

try:
    path_to_df_list = os.listdir(path_to_dfs)
except OSError:
    # The path cannot be listed (e.g. it points to a file): fall back to its parent folder
    path_to_dfs = "/".join(path_to_dfs.split('/')[:-1])
    path_to_df_list = os.listdir(path_to_dfs)

# The folder is later concatenated with file names, so it must end with a separator
if not path_to_dfs.endswith('/'):
    path_to_dfs = path_to_dfs + '/'

In [None]:
# Assay identifiers used for the comparison: only a subset of the entire set of assays.
# Each identifier is turned into "<identifier>.csv" when the experiments are run.
common_assays = [
    'CHEMBL1243967',
    'CHEMBL1613800',
    'CHEMBL1613898',
    'CHEMBL1614027',
    'CHEMBL1614503',
    'CHEMBL1738395',
    'CHEMBL1738579',
    'CHEMBL1963715',
    'CHEMBL1963756',
    'CHEMBL1963824',
    'CHEMBL1963827',
    'CHEMBL1963969',
    'CHEMBL2218957',
    'CHEMBL2218989',
    'CHEMBL2219050',
    'CHEMBL2219070',
    'CHEMBL2219102',
    'CHEMBL2219104',
    'CHEMBL2219113',
    'CHEMBL2219115',
    'CHEMBL2219146',
    'CHEMBL2219159',
    'CHEMBL2219180',
    'CHEMBL2219194',
    'CHEMBL2219203',
    'CHEMBL2219211',
    'CHEMBL2219242',
    'CHEMBL2219244',
    'CHEMBL2219283',
    'CHEMBL2219297',
    'CHEMBL2219308',
    'CHEMBL2219363',
    'CHEMBL3214944',
    'CHEMBL3431932',
    'CHEMBL3431933',
    'CHEMBL3706128',
    'CHEMBL3707783',
    'CHEMBL641707',
    'CHEMBL657032',
    'CHEMBL819742',
]

In [None]:
# Numbers of training samples to evaluate; each value is passed on as limit_def
limits_def = [16, 32, 64, 128, 256]

In [None]:
# Run the full PREFER pipeline on every selected assay, once per training-set size
for limit_def in limits_def:
    print(f'>>>>>>> CURRENT NUMBER OF TRAINING SAMPLES: {limit_def}')
    for assay_id in common_assays:
        print(f'>>>>>>> CURRENT FILE : {assay_id}')
        path_to_df = f'{assay_id}.csv'
        # Point the YAML config at the current file and training-set size
        set_file(prefer_args, path_to_df, limit_def)
        data_info = dict(
            path_to_data=path_to_dfs + path_to_df,
            experiment_name=experiment_name,
            id_column_name=id_column_name,
            model_instance=model_instance,
            problem_type=problem_type,
            smiles_column_name=smiles_column_name,
            split_type=splitting_strategy,
            temporal_info_column_name=temporal_info_column_name,
            properties_column_name_list=properties_column_name,
            list_of_model_based_representations_paths=[],
        )
        # Best effort: a failing assay is reported and the loop moves on to the next one
        try:
            run_prefer_all(data_info, limit_def, prefer_args)
        except Exception as error:
            print(f'>>>>> Problem with file: {path_to_dfs+path_to_df} - in particular: {error}')

In [None]:
# Aggregate the merged tables: mean and std of every metric, separately for each training-set size
import pandas as pd

merged_mean = {}
merged_std = {}
for limit_def in limits_def:
    merged_folder = f'./merged_folder_limit_def_{limit_def}'
    # Fresh list for every limit_def, so tables from other training-set sizes are never mixed in
    frames = []
    for merged in os.listdir(merged_folder):
        # Hidden files are not result tables
        if merged.startswith('.'):
            continue
        print((f'./merged_folder_limit_def_{limit_def}/{merged}'))
        df = pd.read_csv(f'./merged_folder_limit_def_{limit_def}/{merged}')
        # The first three rows are skipped - presumably rows that are not part of the averaged metrics; TODO confirm
        frames.append(df.iloc[3:])

    if not frames:
        print(f'No merged table found in {merged_folder}')
        continue

    # Single concatenation per limit_def instead of growing the dataframe file by file
    df_concat = pd.concat(frames)
    df_concat.index = df_concat.Metrics
    df_concat.drop(columns=['Metrics'], inplace=True)
    df_concat = df_concat.astype(float)
    # Rows sharing the same metric name are aggregated across all the tables of this limit_def
    by_row_index = df_concat.groupby(df_concat.index)

    merged_mean[limit_def] = by_row_index.mean()
    merged_std[limit_def] = by_row_index.std()

In [None]:
merged_mean


In [None]:
merged_std