### Filtering Evaluation Results

After running the `eval` module of TarPass on a specific model, this notebook can be used to perform stepwise filtering by specifying a particular target and selecting specific evaluation metrics.


In [None]:
from analysis.collect_eval import collect_readable
from pathlib import Path
from module.screen import screen_with_stats

#### Specifying Model and Target

In [None]:
# Which generative model and which target to analyse; both names are also
# reused further down to label output files and the plot.
model = 'Drugflow'
target = 'JAK2'

# Evaluation output is read from '<model>/<target>/results', relative to the
# notebook's working directory.
results_dir = Path(f'{model}/{target}/results')
results = collect_readable(results_dir)

#### Specifying Filtering Criteria

Filtering conditions can be provided either via a JSON file (uncomment and run the first two cells below) or directly as a dictionary in the third cell.  
Please note that a custom JSON file must strictly follow the format shown in the third cell: each metric maps to an `[operator, threshold]` pair, and a metric name may appear only once within its subclass.

In [None]:
# from utils.io import dump_json, load_json

# di = {}
# for cls in list((r:=results[0]).keys())[1:]:
#     di[cls] = {}
#     subclses = r[cls]
#     for subcls, metrics in subclses.items():
#         di[cls][subcls] = {}
#         for metric in metrics.keys():
#             di[cls][subcls][metric] = ()

# dump_json('filter_conditions.json', di)

In [None]:
# # Modify 'filter_conditions.json' to set your filtering conditions
# conditions = load_json('filter_conditions.json')

In [None]:
# Filtering conditions, organised as class -> subclass -> metric.
# Each metric maps to an [operator, threshold] pair, e.g. ['<=', -8.0].
# An empty subclass dict ({}) specifies no condition for that subclass.
conditions = {
    # Redock results filter
    'Dock': {
        'numerical': {
            'score': ['<=', -8.0],
            'fully_matched': ['==', True],
        },
        'interactions': {},
    },

    # Rescore results filter (only available for 3D models)
    'Score': {
        'numerical': {
            'fully_matched': ['==', True],
        },
        'interactions': {},
    },

    # Properties filter
    'Prop': {
        'Structural': {
            'all_common': ['==', True],
            'completeness': ['==', True],
            'largest ring': ['<=', 7],
        },
        'Descriptors': {
            # NOTE(review): 'molwt' used to be listed twice in this dict
            # (['>=', 250] followed by ['<=', 750]). A dict literal keeps only
            # the LAST value for a repeated key, so the lower bound was
            # silently discarded and only the upper bound was ever applied.
            # The shadowed entry is kept below as a comment so the intent is
            # not lost. TODO: confirm how the screening module expresses a
            # two-sided range and restore the >= 250 bound that way.
            # 'molwt': ['>=', 250],
            'molwt': ['<=', 750],
        },
        'Alerts': {
            'sa_score': ['<=', 4.0],
            'qed': ['>=', 0.4],
            'lipinski': ['>', 3],
            'PAINS_alert': ['==', 0],
            'SureChEMBL_alert': ['==', 0],
            'Glaxo_alert': ['==', 0],
        },
    },
}

In [None]:
# Run the screening with the function this notebook actually imports
# (`from module.screen import screen_with_stats`); the previously used name
# `filter_stats` is not defined anywhere in the notebook and raised NameError
# on a fresh kernel.
# `filtered` is pickled in the post-processing section; `stats` is consumed
# there as a nested mapping {class: {metric: count}}.
filtered, stats = screen_with_stats(results, conditions)

#### Post-processing

In [None]:
from utils.io import write_pkl

# Persist the filtered entries next to the notebook; the file name encodes the
# model and target so runs for different combinations do not overwrite each other.
filtered_pkl_name = f'{model}_{target}_filtered.pkl'
write_pkl(filtered_pkl_name, filtered)

In [None]:
import pandas as pd

# Flatten the nested stats ({class: {metric: count}}) into one ordered mapping
# keyed '<class>-<metric>', preceded by the number of entries before filtering.
df_di = {'Initial': len(results)}
df_di.update(
    (f'{cls}-{metric}', count)
    for cls, subclses in stats.items()
    for metric, count in subclses.items()
)

# One row per filtering step, one column named after the model.
df = pd.DataFrame.from_dict(df_di, orient='index', columns=[model])
# Optional export of the table:
# df.to_csv(f'{model}_{target}_filter_stats.csv')
df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the count recorded after each filtering step (rows of `df`, in order).
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(12, 6), dpi=600)

sns.lineplot(x=range(len(df)),
             y=df[model],
             marker='o',
             linewidth=2.5,
             markersize=8,
             ax=ax)

# Rows are plotted by position; label each position with its step name.
ax.set_xticks(range(len(df)))
ax.set_xticklabels(df.index, rotation=45, ha='right')

ax.set_xlabel('Filter Criteria (Applied Sequentially)', fontsize=12)
ax.set_ylabel('Remaining Compounds', fontsize=12)

# Keep the original fixed top of 1025 for small runs, but grow the axis (with
# 5% headroom) when a run has more compounds, so that the line and its value
# labels are not clipped.
y_top = max(1025, df[model].max() * 1.05)
ax.set_ylim(0, y_top)

# Print the exact count just above every marker.
for i, v in enumerate(df[model]):
    ax.annotate(f'{v}', (i, v), textcoords="offset points",
                xytext=(0, 10), ha='center', fontsize=9, fontweight='bold')

ax.legend([model])
fig.tight_layout()