# Plot performances for each molecular representation

After running `TestSet_Bootstrapping.ipynb`, a `.pickle` file containing the final performances of each molecular representation should have been created. This notebook plots those results and performs a statistical analysis on them.

## Imports

In [None]:
import sys
# Load IPython's autoreload extension (no `%autoreload` mode is selected in this cell).
%load_ext autoreload
# Path to the main PREFER directory.
# NOTE(review): 'path_to/...' looks like a placeholder -- replace it with the local checkout location.
path_to_PREFER = 'path_to/PREFER/'
# Paths to the cddd and molecule-generation submodules under PREFER's model directory.
path_to_cddd = 'path_to/PREFER/prefer/model_based_representations/models/cddd/'
path_to_moler = 'path_to/PREFER/prefer/model_based_representations/models/molecule-generation/'
# Put PREFER and both submodules on the import search path.
sys.path.append(path_to_PREFER)
sys.path.append(path_to_cddd)
sys.path.append(path_to_moler)
import warnings
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')
# NOTE(review): star import -- the names it adds to the notebook namespace cannot be seen from here.
from prefer.utils.filtering import *
# NOTE(review): redundant -- sys is already imported at the top of this cell.
import sys

In [None]:
from prefer.utils.post_processing_and_optimization_helpers import create_heat_map
from prefer.utils.automation import merge_table_metrics, data_preparation, generate_molecular_representations, run, create_comparison_table

### Load the saved performances

In [None]:
import pickle

# Results file expected to come from the bootstrapping notebook. The loaded
# object is indexed downstream as dict1['autosklearn'][<experiment>][<representation>].
name = "final_dict_['publicSolubility', 'publicLogD'].pickle"
with open(name, mode='rb') as pickle_file:
    # NOTE(review): unpickling can execute arbitrary code -- only load files you produced yourself.
    dict1 = pickle.load(pickle_file)


In [None]:
import pandas as pd

# Flatten the 'publicSolubility' results into a long-format table:
# one row per (representation, score) pair.
solubility_scores = dict1['autosklearn']['publicSolubility']
repr_vect = [
    repr_
    for repr_, scores in solubility_scores.items()
    for _ in scores  # repeat the label once per score
]
value_vect = [score for scores in solubility_scores.values() for score in scores]
df1 = pd.DataFrame()
df1['Representation'] = repr_vect
df1['∆AUPRC'] = value_vect

In [None]:
import pandas as pd

# Flatten the 'publicLogD' results into a long-format table:
# one row per (representation, score) pair.
logd_scores = dict1['autosklearn']['publicLogD']
repr_vect = [
    repr_
    for repr_, scores in logd_scores.items()
    for _ in scores  # repeat the label once per score
]
value_vect = [score for scores in logd_scores.values() for score in scores]
df2 = pd.DataFrame()
df2['Representation'] = repr_vect
df2['R2'] = value_vect

In [None]:
from scipy import stats

# Friedman test per experiment, comparing the score samples of ALL molecular
# representations. The groups are unpacked rather than indexed 0..3, so the
# cell works for any number of representations instead of exactly four
# (SciPy itself raises if fewer than three groups are supplied).
# NOTE(review): the Friedman test assumes paired measurements, i.e. the i-th
# score of every representation comes from the same resample -- confirm
# against TestSet_Bootstrapping.ipynb.
collect_stats = []
for exper, scores_by_repr in dict1['autosklearn'].items():
    print(exper)
    collect_group = list(scores_by_repr.values())
    collect_stats.append(stats.friedmanchisquare(*collect_group))

# One test result per experiment, in the iteration order of dict1['autosklearn'].
collect_stats

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Violin plot of the ∆AUPRC distribution for each molecular representation.
plt.style.use('fivethirtyeight')
# Only one panel is plotted, so create a single Axes rather than a 1x3 grid
# whose two unused panels would appear empty in the displayed and saved figure.
fig, ax = plt.subplots(figsize=(10, 5))
fig.suptitle('Performances', size=15)
sns.set(font_scale=0.8)
sns.violinplot(ax=ax, x='Representation', y='∆AUPRC', data=df1.sort_values('Representation'));
# NOTE(review): df1 is built from the 'publicSolubility' entry but the panel is
# titled 'LE-MDCK' -- confirm that this title is intended.
ax.set_title('LE-MDCK', size=15)
ax.tick_params(axis='x', rotation=45)

# Save before plt.show() so the figure is written while it is still current.
plt.savefig('classification_results_with_mean_and_std.png', bbox_inches='tight', transparent=True)

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Violin plot of the R2 distribution for each molecular representation.
plt.style.use('fivethirtyeight')
# Only one panel is plotted, so create a single Axes rather than a 1x3 grid
# whose two unused panels would appear empty in the displayed and saved figure.
fig, ax = plt.subplots(figsize=(10, 5))
fig.suptitle('Performances', size=15)
sns.set(font_scale=0.8)
sns.violinplot(ax=ax, x='Representation', y='R2', data=df2.sort_values('Representation'));
ax.set_title('logD', size=15)
ax.tick_params(axis='x', rotation=45)

# Save before plt.show() so the figure is written while it is still current.
plt.savefig('regression_results_with_mean_and_std.png', bbox_inches='tight', transparent=True)

plt.show()

In [None]:
# Imported here because no other visible cell imports f_oneway; without it
# this cell only runs if a star import or leftover kernel state provides it.
from scipy.stats import f_oneway

# One-way ANOVA on ∆AUPRC across representations: one array of scores per
# representation, in order of first appearance (rows with a missing
# representation label are excluded by groupby).
auprc_groups = [
    scores.to_numpy()
    for _, scores in df1.groupby('Representation', sort=False)['∆AUPRC']
]
stat, p_value = f_oneway(*auprc_groups)
print(f"F Test: statistic={stat:.4f}, p-value={p_value:.4f}")

In [None]:
# Imported here because no other visible cell imports f_oneway; without it
# this cell only runs if a star import or leftover kernel state provides it.
from scipy.stats import f_oneway

# One-way ANOVA on R2 across representations: one array of scores per
# representation, in order of first appearance (rows with a missing
# representation label are excluded by groupby).
r2_groups = [
    scores.to_numpy()
    for _, scores in df2.groupby('Representation', sort=False)['R2']
]
stat, p_value = f_oneway(*r2_groups)
print(f"F Test: statistic={stat:.4f}, p-value={p_value:.4f}")