In [None]:
import pyrfume
import pandas as pd
import pyrfume.benchmarking as pbm
import itertools
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Identifier handed to pbm.fit_model_for_pickle and pbm.iterate_and_plot_models further down.
# NOTE(review): a single-space string looks like a placeholder -- presumably this should be
# set to the name of the archive being benchmarked before running; confirm.
archive = ' '

In [None]:
# Read the grid-search results table, plus the not-yet-fitted models and the
# dataset-preparation callable that were saved next to it.
# NOTE(review): `load_pickle` presumably unpickles a local file -- only use trusted artifacts.
results = pbm.gridsearch_csv_to_frame('benchmarking.csv')
prepare_dataset, models = pbm.load_pickle()


def _fit_row(row):
    """Fit the model described by a single row of `models` for the configured archive."""
    return pbm.fit_model_for_pickle(
        archive=archive,
        prepare_dataset=prepare_dataset,
        row=row,
    )


# One fitted model per row of the models table.
models['fitted_model'] = models.apply(_fit_row, axis=1)

# Key the table by (target, features, metric) for later lookups.
models = models.set_index(['target', 'features', 'metric'])

# Estimator label: the string form of `pipeline_steps` with its first character
# and its last three characters sliced off.
models['estimator'] = models['pipeline_steps'].astype(str).str[1:-3]

In [None]:
# Preview the first rows of the fitted-models table.
models.head(n=5)

In [None]:
# Reduce the full grid-search table to its best results, then preview the first rows.
best_results = pbm.get_best_results(results)
best_results.head(n=5)

# I. Landscape of scores across models and features

This section compares model scores in the overall parameter space explored, for all estimator families that were part of the gridsearch.

## 1. Heat maps of scores
This section iterates through each combination of targets and features, and shows the best scores for all model families (rows) vs. metrics (columns). The single best-performing (model, metric) pair is indicated by a blue box.

In [None]:
# Heat maps: one per (target, feature set) combination present in the best results.
best_index = best_results.index
targets = best_index.get_level_values(level='target').unique()
features = best_index.get_level_values(level='features').unique()

for target, feature in itertools.product(targets, features):
    # NOTE(review): the title is set before the heat map is drawn; this assumes
    # plot_heatmap draws onto the current axes -- confirm against pyrfume.
    plt.title(f'Score summary: feature set={feature}; targets={target}')
    scores_for_pair = best_results.xs((target, feature), level=['target','features'])
    pbm.plot_heatmap(scores_for_pair, show_rect=True)

## 2. Score report
This section iterates through each target, and shows a strip chart with the top 10 scores for all estimator classes for that target (rows). Scores for all metrics are shown (columns). Both mordred and morgan features are shown on the same axes, for easy comparison. These plots are useful for visualizing intra- and inter-model variability, as well as for discerning whether one feature set leads to systematically stronger predictions.

In [None]:
# Score columns are the ones whose name starts with 'mean_'.
valid_metrics = [column for column in results.columns if column.startswith('mean_')]
targets = results.index.get_level_values(level='target').unique()
n_top_scores = 10  # forwarded as the final argument of plot_score_report

# One report per target; `features` is the index of feature sets built in the heat-map cell.
for target in targets:
    results_by_target = results.xs(target, level='target', drop_level=False)
    pbm.plot_score_report(
        results_by_target,
        features.to_list(),
        valid_metrics,
        n_top_scores,
    )

## 3. Score distributions
A summary version of the score plot. It shows score distributions for all models (rows) vs. metrics (columns) as box-and-whisker plots, which allows for easy comparison of central tendency across models and makes outliers easy to spot.

In [None]:
# Map each score column to a display label with the 'mean_' text removed.
short_names = [metric.replace('mean_','') for metric in valid_metrics]
name_map = dict(zip(valid_metrics, short_names))

# Long format: one row per (pipeline_string, metric) pair with its score.
df = (
    results[valid_metrics]
    .reset_index()
    .melt(
        id_vars='pipeline_string',
        value_vars=valid_metrics,
        value_name='score',
        var_name='metric',
    )
)
df['metric'] = df['metric'].map(name_map)

# One box-plot panel per metric, pipelines along the y axis.
sns.catplot(
    data=df,
    x='score',
    y='pipeline_string',
    col='metric',
    kind='box',
    height=10,
    aspect=0.3,
    palette='tab10',
)

# II. Evaluation of top performing models
This section shows a more granular and detailed view of the performance of individual models. For each dataset target, it shows the best-performing model for the best feature set, across all metrics. For classification tasks, ROC curves and confusion matrices are plotted. For regression tasks, fit residuals and actual-vs-predicted plots are shown.

In [None]:
# Hand the fitted `models` table to pyrfume's plotting routine, together with the same
# dataset-preparation callable and archive identifier that were used when fitting.
# Presumably this renders the per-model evaluation plots described above -- confirm against pyrfume.
pbm.iterate_and_plot_models(models, prepare_dataset, archive)