# Run PREFER

This notebook is used to test the benchmarking and property prediction framework.

#### WARNING:

In order to run this notebook, please:
1) use the prefer-environment
2) unpack the git submodules within the PREFER repo as described in the README.txt
3) change the config files as described in the README.txt

![caption](./prefer/docs/PREFER_scheme.png)

In [None]:
import sys
import warnings
import pickle
import numpy as np
import time


# Load the IPython autoreload extension (no `%autoreload` mode is selected in this cell).
%load_ext autoreload
# path to the main directory
# (placeholder value: replace 'path_to' with the location of your PREFER checkout)
path_to_PREFER = 'path_to/PREFER/'
# path to submodules
path_to_cddd = 'path_to/PREFER/prefer/model_based_representations/models/cddd/'
path_to_moler = 'path_to/PREFER/prefer/model_based_representations/models/molecule-generation/'
# Make PREFER and both submodules importable from this notebook.
sys.path.append(path_to_PREFER)
sys.path.append(path_to_cddd)
sys.path.append(path_to_moler)
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
from prefer.utils.post_processing_and_optimization_helpers import create_heat_map
from prefer.src.prefer_model_wrapper import PreferModelWrapper
from prefer.utils.filtering import *
from prefer.utils.automation import merge_table_metrics, data_preparation, generate_molecular_representations, run, create_comparison_table

In [None]:
def PREFER_job(data_info):
    """Run the PREFER pipeline for the experiment described by ``data_info``.

    The function validates the split configuration, prepares the data,
    generates the molecular representations, runs the benchmarking, merges
    the resulting metric tables (also written to 'merged.csv' in the current
    working directory), builds the comparison table and draws the heat map.

    Args:
        data_info: dictionary describing the experiment. The keys read here
            are 'split_type', 'experiment_name',
            'list_of_model_based_representations_paths', 'problem_type',
            'model_instance' and, optionally, 'temporal_info_column_name'
            (mandatory when 'split_type' is 'temporal'). The whole dictionary
            is also handed to ``data_preparation``.

    Returns:
        A tuple ``(bench_list, merged, dir_destination)`` where ``bench_list``
        and ``dir_destination`` are the two values returned by ``run`` and
        ``merged`` is the table returned by ``merge_table_metrics``.

    Raises:
        ValueError: if a temporal split is requested without providing
            'temporal_info_column_name'.
    """
    # Validate the split configuration first, so that an incomplete config
    # fails before the data preparation step is executed.
    # If time split you need to provide time column name
    temporal_info_column_name = data_info.get('temporal_info_column_name')
    if data_info['split_type'] == 'temporal' and not temporal_info_column_name:
        raise ValueError('ERROR: if time split is required then you need to provide the temporal_info_column_name')

    # Prepare data
    df = data_preparation(data_info)

    # Extract representations
    representations = generate_molecular_representations(
        df,
        split_type=data_info['split_type'],
        experiment_name=data_info['experiment_name'],
        list_of_model_based_representations_paths=data_info['list_of_model_based_representations_paths'],
    )

    # Run PREFER
    bench_list, dir_destination = run(
        representations,
        problem_type=data_info['problem_type'],
        model_instance=data_info['model_instance'],
    )

    # Evaluate results
    merged = merge_table_metrics(bench_list)
    merged.to_csv('merged.csv')
    experiments_dict, tmp_dict = create_comparison_table(merged)
    create_heat_map(experiments_dict, tmp_dict)
    return bench_list, merged, dir_destination

## Set Data Info

In [None]:
# read yaml config file to set the data_info
import yaml
import json

prefer_args = './config_files/config_PREFER_logD.yaml' # OR THE PATH TO YOUR CONFIG FILE
# Open the config inside a context manager so the file handle is closed as
# soon as parsing is done.
# NOTE(review): yaml.FullLoader is kept as in the original; only point this at
# config files you trust.
with open(prefer_args) as a_yaml_file:
    parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

# Mandatory settings: a missing key raises KeyError.
path_to_df = parsed_yaml_file["path_to_df"]
experiment_name = parsed_yaml_file["experiment_name"]
id_column_name = parsed_yaml_file["id_column_name"]
smiles_column_name = parsed_yaml_file["smiles_column_name"]
properties_column_name = parsed_yaml_file["properties_column_name_list"]
problem_type = parsed_yaml_file["problem_type"]
splitting_strategy = parsed_yaml_file["splitting_strategy"]

# Optional settings: fall back to None when the key is absent.
model_instance = parsed_yaml_file.get("model_instance")
temporal_info_column_name = parsed_yaml_file.get("temporal_info_column_name")

In [None]:

# Gather the settings read from the config file into the single dictionary
# that is later handed to PREFER_job. The list of model based representation
# paths starts empty and is filled by the cells below.
data_info = dict(
    path_to_data=path_to_df,
    experiment_name=experiment_name,
    id_column_name=id_column_name,
    model_instance=model_instance,
    problem_type=problem_type,
    smiles_column_name=smiles_column_name,
    split_type=splitting_strategy,
    temporal_info_column_name=temporal_info_column_name,
    properties_column_name_list=properties_column_name,
    list_of_model_based_representations_paths=[],
)

# To store the info related to model based representations
dict_commands = {}

### Compute CDDD representations

#### WARNING: Before running this please install the cddd-env-light as described in the README.txt

In [None]:
# Assemble the shell command that computes the CDDD (model based) molecular
# representations and register it in dict_commands.
path_to_cddd_model = 'path_to/cddd/default_model'
model_name = 'CDDD'
# The command string below is meant to be run as a subprocess.
# If your shell has not been configured for 'conda activate' you may get a
# CommandNotFoundError. In that case put at the beginning of the run_commands
# string: source path/to/conda.sh;
# or try: . path/to/conda.sh;
run_commands = f'conda activate cddd-env-prefer-light; PYTHONPATH="{path_to_cddd}:{path_to_moler}:{path_to_PREFER}:$PYTHONPATH"; export PYTHONPATH; python compute_model_based_representations.py --prefer_args {prefer_args} --path_to_model {path_to_cddd_model} --model_name {model_name}'
dict_commands[model_name] = {
    'run': run_commands,
    'path_to_representations': f"./{model_name}_representations_{experiment_name}",
}


In [None]:
# Remind the user of the conda workaround, then execute the prepared command
# string in a system shell through the IPython `!` escape.
print('WARNING: in case of troubles with conda activate in the run_commands please follow the instructions in the comment above')
!{run_commands}

In [None]:
# find path to the new CDDD representation found
import datetime
from os import listdir
from os.path import isfile, join
cdddpath = f'./{model_name}_representations_{experiment_name}'
files = [f for f in listdir(cdddpath) if isfile(join(cdddpath, f))]

# Map the timestamp found at the end of each file name
# (<prefix>_YYYYmmdd-HHMMSS, optionally followed by .pkl) to that file name.
mapping = {}
for file_name in files:
    stamp = file_name.split('_')[-1].replace('.pkl', '')
    try:
        date = datetime.datetime.strptime(stamp, '%Y%m%d-%H%M%S')
    except ValueError:
        # The name does not end with a timestamp (e.g. a stray file in the
        # folder): ignore it instead of aborting the whole cell.
        continue
    mapping[date] = file_name

if not mapping:
    raise FileNotFoundError(f'No timestamped representation file found in {cdddpath}')

# The most recent timestamp identifies the representation just computed.
collect_dates = sorted(mapping)
latest_cddd_path = f'{cdddpath}/{mapping[collect_dates[-1]]}'
# Re-running this cell must not register the same file twice.
if latest_cddd_path not in data_info['list_of_model_based_representations_paths']:
    data_info['list_of_model_based_representations_paths'].append(latest_cddd_path)

### Compute MOLER representations

#### WARNING: Before running this please install the moler-env as described in the README.txt

In [None]:
# Assemble the shell command that computes the MOLER (model based) molecular
# representations and register it in dict_commands.
path_to_moler_model = 'path_to/moler/'
model_name = 'MOLER'
# The command string below is meant to be run as a subprocess.
# If your shell has not been configured for 'conda activate' you may get a
# CommandNotFoundError. In that case put at the beginning of the run_commands
# string: source path/to/conda.sh;
# or try: . path/to/conda.sh;
run_commands = f'conda activate moler-env-prefer-light; PYTHONPATH="{path_to_cddd}:{path_to_moler}:{path_to_PREFER}:$PYTHONPATH"; export PYTHONPATH; python compute_model_based_representations.py --prefer_args {prefer_args} --path_to_model {path_to_moler_model} --model_name {model_name}'
dict_commands[model_name] = {
    'run': run_commands,
    'path_to_representations': f"./{model_name}_representations_{experiment_name}",
}


In [None]:
# Remind the user of the conda workaround, then execute the prepared command
# string in a system shell through the IPython `!` escape.
print('WARNING: in case of troubles with conda activate in the run_commands please follow the instructions in the comment above')
!{run_commands}

In [None]:
# find path to the new MOLER representation found
import datetime
from os import listdir
from os.path import isfile, join
molerpath = f'./{model_name}_representations_{experiment_name}'
files = [f for f in listdir(molerpath) if isfile(join(molerpath, f))]

# Map the timestamp found at the end of each file name
# (<prefix>_YYYYmmdd-HHMMSS, optionally followed by .pkl) to that file name.
mapping = {}
for file_name in files:
    stamp = file_name.split('_')[-1].replace('.pkl', '')
    try:
        date = datetime.datetime.strptime(stamp, '%Y%m%d-%H%M%S')
    except ValueError:
        # The name does not end with a timestamp (e.g. a stray file in the
        # folder): ignore it instead of aborting the whole cell.
        continue
    mapping[date] = file_name

if not mapping:
    raise FileNotFoundError(f'No timestamped representation file found in {molerpath}')

# The most recent timestamp identifies the representation just computed.
collect_dates = sorted(mapping)
latest_moler_path = f'{molerpath}/{mapping[collect_dates[-1]]}'
# Re-running this cell must not register the same file twice.
if latest_moler_path not in data_info['list_of_model_based_representations_paths']:
    data_info['list_of_model_based_representations_paths'].append(latest_moler_path)

### Run PREFER

In [None]:
# Run the whole pipeline with the settings collected in data_info and keep the
# three returned objects for the cells below.
bench_list, merged, dir_destination = PREFER_job(data_info)

### Save complete dataframe 

In [None]:
from prefer.utils.save_load import saving_procedure_autosklearn
# Call the saving helper once per benchmarking object, targeting dir_destination.
for bench in bench_list:
    saving_procedure_autosklearn(bench, dir_destination)

## Create Model Wrapper from Benchmarking object and use it to predict new samples

### For each combination of model and molecular representation create a PREFER-wrapper

In [None]:
# Build, refit and pickle one PreferModelWrapper per benchmarking object
# (i.e. per molecular representation).
timestr = time.strftime("%Y%m%d_%H%M%S")

# Normalise the destination once, before the loop, so that every wrapper
# (including the first one) records the same best_model_output_dir.
if not dir_destination.endswith('/'):
    dir_destination = dir_destination+'/'

for bench in bench_list:
    representation_name = bench.representations[0]
    print(f'Preparing wrapper for {representation_name}')

    # Reset on every iteration: otherwise the settings of a model based
    # representation would leak into the representations processed after it.
    path_to_model_dict = None
    model_based_representation = False
    # The two branches are kept lazy on purpose: each path variable only needs
    # to exist if the corresponding representation was actually computed.
    if representation_name == 'CDDD':
        path_to_model_dict = {representation_name: path_to_cddd_model}
        model_based_representation = True
    elif representation_name == 'MOLER':
        path_to_model_dict = {representation_name: path_to_moler_model}
        model_based_representation = True

    arg_dict = dict(
        datapath=path_to_df,
        friendly_model_name=experiment_name,
        id_column_name=id_column_name,
        smiles_column_name=smiles_column_name,
        properties_column_name_list=properties_column_name,
        problem_type=problem_type,  # Can be regression or classification
        best_model_output_dir=dir_destination,
        representations=[representation_name],
        path_to_model=path_to_model_dict,  # this should be set
        project_code="",
    )

    # The metadata dictionary is the argument dictionary itself, extended
    # with the information taken from the benchmarking object.
    final_meta_data = arg_dict
    bm_rep = representation_name
    final_meta_data["best_model_representation"] = bm_rep
    final_meta_data["desirability_scores"] = None

    final_meta_data["rep_model_id"] = bench.models_ids[representation_name]
    model = bench.best_estimator[representation_name]
    final_meta_data["features_scaling_type"] = bench.features_scaling_type[representation_name]
    final_meta_data["features_means_vect"] = bench.features_means_vect[representation_name]
    final_meta_data["features_stds_vect"] = bench.features_stds_vect[representation_name]

    # add info needed to compute model based representations
    if model_based_representation:
        final_meta_data["prefer_path"] = path_to_PREFER
        final_meta_data["dict_commands"] = dict_commands

    # Store info related to the probability threshold used (e.g. optimized by GHOSTml) if classification task
    if problem_type == 'classification':
        final_meta_data["probability_threshold"] = bench.metrics[representation_name]['prob_threshold']

    # take all the train and the test set to refit the autosklearn model
    Xtrain, ytrain, Xtest, ytest = bench.molecule_representations_obj_list[0].split()
    X_fin = np.concatenate((Xtrain, Xtest), 0)
    y_fin = np.concatenate((ytrain, ytest), 0)
    print("Refitting AutoSklearn model...")
    model.refit(X_fin, y_fin)
    print('Refitted!')
    wrapper = PreferModelWrapper(model=model, metadata=final_meta_data)

    # Save wrapper in final location
    metadata_name = f"{dir_destination}{experiment_name}_{representation_name}_{timestr}"
    print(f'Wrapper for {representation_name} model has been stored in {metadata_name}')
    with open(metadata_name + ".pkl", "wb") as output:
        pickle.dump(wrapper, output)

### Load one wrapper related to one combination of model and molecular representation

In [None]:
# Path (without the .pkl extension) rebuilt from the variables left over by the
# wrapper-creation loop: representation_name holds the last value it took
# there, so this points at the wrapper of the last representation processed.
one_stored_wrapper_path = f'{dir_destination}{experiment_name}_{representation_name}_{timestr}'

In [None]:
# Load the pickled wrapper back from disk.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open(f'{one_stored_wrapper_path}.pkl', 'rb') as f:
    one_stored_wrapper = pickle.load(f)

### Load samples to predict

In [None]:
# Take the first five values of the `Smiles` attribute of the table stored
# under representation_name in the last benchmarking object.
# NOTE(review): presumably a pandas DataFrame with a 'Smiles' column — confirm.
test_smiles_samples = list(bench_list[-1].df[representation_name].Smiles.values[0:5])
# Display the selected samples as the cell output.
test_smiles_samples

### Predict with wrapper

In [None]:
# Known issue: the information of the entire original dataframe is passed
# along, not just the list of SMILES you have. This needs to be fixed.
#import pandas as pd
predictions = one_stored_wrapper.predict(test_smiles_samples)

In [None]:
# Display the predictions returned by the wrapper.
predictions

## Evaluate Overall Results

In [None]:
# Display the merged metrics table returned by PREFER_job.
merged

In [None]:
# save merged (as CSV without the index, and as pickle)
# NOTE(review): the suffix is appended directly to dir_destination, so when
# dir_destination ends with '/' the files are created inside that folder with
# a leading underscore; otherwise they are created next to it — confirm this
# is intended.
merged.to_csv(f'{dir_destination}_merged_autosklearn_res.csv', index = False)
merged.to_pickle(f'{dir_destination}_merged_autosklearn_res.pkl')

In [None]:
# Display the output location used for this run.
dir_destination

In [None]:
# Call plot_res() on every benchmarking object.
for bench in bench_list:
    bench.plot_res()

## Inspect Results

### Get models with weights

In [None]:
# Display the first representation name of the first benchmarking object.
bench_list[0].representations[0]

In [None]:
from pprint import pprint
# For every benchmarking object, print a header followed by the models (with
# their weights) reported by the best estimator of its first representation.
for bench in bench_list:
    repr_ = bench.representations[0]
    print(f'-------------- Results for : {repr_} --------------')
    print('-----------------------------------------------------')
    best_for_repr = bench.best_estimator[repr_]
    pprint(best_for_repr.get_models_with_weights(), indent=4)

### Compute PREFER table

In [None]:
# Pretty-print the summary table created by each benchmarking object
# (pprint is imported in an earlier cell of this notebook).
for bench in bench_list:
    pprint(bench.create_summary_table())

### Check the ensemble models

In [None]:
# detail of the models in the ensemble
merged.loc['Prediction Model'].values

### Check feature preprocessing 

In [None]:
# detail of the feature preprocessor used 

# For every benchmarking object, print a header followed by the
# 'feature_preprocessors' column of the detailed leaderboard (top 50) of the
# best estimator of its first representation.
for bench in bench_list:
    repr_ = bench.representations[0]
    print(f'-------------- Results for : {repr_} --------------')
    print('-----------------------------------------------------')
    detailed_board = bench.best_estimator[repr_].leaderboard(detailed = True, top_k= 50)
    pprint(detailed_board['feature_preprocessors'])

### Inspect all the evaluated models

In [None]:
# Detailed leaderboard (top_k=50, ensemble_only=False) of the best estimator
# of the first benchmarking object, looked up by its first representation name.
bench_list[0].best_estimator[bench_list[0].representations[0]].leaderboard(detailed = True, ensemble_only= False, top_k= 50)

In [None]:
# Same leaderboard for the second benchmarking object.
# NOTE: raises IndexError when bench_list holds fewer than two objects.
bench_list[1].best_estimator[bench_list[1].representations[0]].leaderboard(detailed = True, ensemble_only= False, top_k= 50)