In [1]:
from tdt_autopsy.logregs_extra_experiments import tidy_results
from ipythonme import *
import pandas as pd
# Notebook-wide pandas display settings.
# The fully qualified option name is required: on recent pandas versions the
# bare 'precision' pattern also matches 'styler.format.precision', and
# set_option then raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 2)
# Generous limits so that long / wide result tables are shown in full
# instead of being elided.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [2]:
# Let's load the very anecdotal results from the competition benchmark:
#   - Train on the whole training set with the best individual logreg
#   - Test on the tiny competition set (1056 mols, around 100 positives)
#     Anecdotal, but it gives us the prize and agrees with more serious cross-val + OOB evaluations.
# NOTE(review): recompute=True presumably rebuilds the results table instead of reusing a cached copy - confirm.
df = tidy_results(recompute=True)
# Summarize the loaded table (number of entries, column names and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1568 entries, 0 to 1567
Data columns (total 33 columns):
data_name                    1568 non-null object
model_name                   1568 non-null object
zero_columns                 1568 non-null object
binarize_threshold           1568 non-null float64
input_is_binary              1568 non-null bool
is_binary                    1568 non-null bool
is_binary_counts             1568 non-null bool
is_regular_counts            1568 non-null bool
allow_unseen_in_folding      1568 non-null bool
fold_size                    1568 non-null int64
fold_seed                    1568 non-null object
num_fold_seeds               1568 non-null int64
min_radius                   1568 non-null float64
max_radius                   1568 non-null float64
row_normalizer               1568 non-null object
scale                        1568 non-null bool
auc                          1568 non-null float64
enrichment_1                 1568 non-null float64
en

In [3]:
# Narrow the table down in two chained steps:
#   1. Keep only the L2-regularized models (training with an L1 penalty
#      takes much longer and gives worse results).
#   2. Drop the "binarize + fold counts" variants. Those are binarized
#      matrices that become counts on folding (each bucket counts the number
#      of substructures falling into it, without taking their own counts
#      into account). It works well, but I do not think it is common
#      practice, so for the time being they are left out.
df = (
    df
    .query('model_name == "tdtl2"')
    .query('is_binary or is_regular_counts')
)

In [4]:
# A peek at the table: the five rows with the highest AUC.
show_df(
    df.sort_values(by='auc', ascending=False)
      .head(n=5)
)

data_name,model_name,zero_columns,binarize_threshold,input_is_binary,is_binary,is_binary_counts,is_regular_counts,allow_unseen_in_folding,fold_size,fold_seed,num_fold_seeds,min_radius,max_radius,row_normalizer,scale,auc,enrichment_1,enrichment_5,enrichment_10,bedroc20,rie20,pre_column_manipulation_s,fit_s,predict_s,eval_s,total_s,id,model_sparsity,model,folder,fold_as_binary,unfolded
competition-external,tdtl2,transductive,0.0,True,True,False,False,False,16383,"(3, 4, 5)",3,-inf,inf,none,False,0.84,3.8,4.93,4.54,0.49,4.41,0.04,435.18,3.61,0.0118,438.84,f073e276e79b504c08bb0bd74d3f99233a9ad542,0.0,"Pipeline(memory=None,steps=[('zero_columns',ZeroColumns(invert=True,origin='transductive')),('binarize',FastBinarizer(copy=True)),('fold',MultiFolder(as_binary=True,fold_size=16383,safe=True,seeds=(3,4,5))),('model',logreg(C=5,class_weight='balanced',dual=False,fit_intercept=True,intercept_scaling=1,max_iter=100,multi_class='ovr',n_jobs=1,penalty='l2',random_state=0,solver='liblinear',tol=0.0001,verbose=0,warm_start=False))])","MultiFolder(as_binary=True,fold_size=16383,safe=True,seeds=(3,4,5))",True,False
competition-external,tdtl2,none,0.0,True,True,False,False,True,16383,"(3, 4, 5)",3,-inf,4.0,none,False,0.84,4.75,4.93,4.14,0.49,4.34,41.1,228.16,0.14,0.0106,269.41,2575f208e526e34c300446f684d201f7d4d0d68d,0.0,"Pipeline(memory=None,steps=[('binarize',FastBinarizer(copy=True)),('fold',MultiFolder(as_binary=True,fold_size=16383,safe=True,seeds=(3,4,5))),('model',logreg(C=5,class_weight='balanced',dual=False,fit_intercept=True,intercept_scaling=1,max_iter=100,multi_class='ovr',n_jobs=1,penalty='l2',random_state=0,solver='liblinear',tol=0.0001,verbose=0,warm_start=False))])","MultiFolder(as_binary=True,fold_size=16383,safe=True,seeds=(3,4,5))",True,False
competition-external,tdtl2,from_train,inf,False,False,False,True,False,4294967295,nofold,0,-inf,inf,none,False,0.84,3.8,5.52,4.34,0.51,4.53,0.02,291.0,4.7,0.0135,295.73,3913df9df8492ab4d358036ca1d2962bb4f97157,0.74,"Pipeline(memory=None,steps=[('zero_columns',ZeroColumns(invert=True,origin='from_train')),('model',logreg(C=5,class_weight='balanced',dual=False,fit_intercept=True,intercept_scaling=1,max_iter=100,multi_class='ovr',n_jobs=1,penalty='l2',random_state=0,solver='liblinear',tol=0.0001,verbose=0,warm_start=False))])",none,False,True
competition-external,tdtl2,none,inf,False,False,False,True,True,16383,"(3, 4, 5)",3,-inf,4.0,none,False,0.84,4.75,4.54,4.14,0.46,4.07,41.71,420.46,0.24,0.00601,462.41,30c9117e7e40f9e5ddb1a7231e7b5221a77a5aed,0.0,"Pipeline(memory=None,steps=[('fold',MultiFolder(as_binary=False,fold_size=16383,safe=True,seeds=(3,4,5))),('model',logreg(C=5,class_weight='balanced',dual=False,fit_intercept=True,intercept_scaling=1,max_iter=100,multi_class='ovr',n_jobs=1,penalty='l2',random_state=0,solver='liblinear',tol=0.0001,verbose=0,warm_start=False))])","MultiFolder(as_binary=False,fold_size=16383,safe=True,seeds=(3,4,5))",False,False
competition-external,tdtl2,none,0.0,True,True,False,False,False,8191,"(0, 1)",2,-inf,3.0,none,False,0.84,3.8,4.73,4.93,0.48,4.27,46.6,160.23,0.09,0.0111,206.93,868da2ea145a980318cf7d1a185c5e17b92d7f7d,0.0,"Pipeline(memory=None,steps=[('binarize',FastBinarizer(copy=True)),('fold',MultiFolder(as_binary=True,fold_size=8191,safe=True,seeds=(0,1))),('model',logreg(C=5,class_weight='balanced',dual=False,fit_intercept=True,intercept_scaling=1,max_iter=100,multi_class='ovr',n_jobs=1,penalty='l2',random_state=0,solver='liblinear',tol=0.0001,verbose=0,warm_start=False))])","MultiFolder(as_binary=True,fold_size=8191,safe=True,seeds=(0,1))",True,False


In [5]:
# Let's look at:
#   * counts vs binary (remember, counts are still not implemented for folding)
#       => irrelevant when not folding (most values are 1 anyway);
#          results might differ enormously once we allow folding counts
#   * whether we allow unseen features to add noise in folding
#       => large effect when folding to the usual sizes (1024, 2048, 4096)
#          N.B. should we see any small effect of the folding seed when not
#          folding, it would be due to liblinear's uncontrollable seed:
#            https://github.com/scikit-learn/scikit-learn/issues/365
#       => the effect gets smaller as the fold size grows
#   * how we normalize each row
#       => showed no effect, so to make things easier to grasp it has been
#          moved from the groupby to the query (look at it again when we add
#          counts to folding)
(
    df
    .query('not scale and '
           'max_radius > 1000000 and '
           'zero_columns == "none" and '
           'row_normalizer == "none" and '
           'num_fold_seeds < 2')
    .groupby(['fold_size', 'is_binary', 'allow_unseen_in_folding'])['auc']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
fold_size,is_binary,allow_unseen_in_folding,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1023,False,False,2.0,0.72,0.000689,0.72,0.72,0.72,0.72,0.72
1023,False,True,2.0,0.71,0.0127,0.7,0.71,0.71,0.71,0.72
1023,True,False,2.0,0.72,0.00348,0.71,0.72,0.72,0.72,0.72
1023,True,True,2.0,0.7,0.00249,0.7,0.7,0.7,0.7,0.7
2047,False,False,2.0,0.73,0.011,0.72,0.72,0.73,0.73,0.74
2047,False,True,2.0,0.72,0.0163,0.71,0.71,0.72,0.72,0.73
2047,True,False,2.0,0.72,0.0166,0.71,0.72,0.72,0.73,0.73
2047,True,True,2.0,0.7,0.0212,0.69,0.69,0.7,0.71,0.72
4091,False,False,2.0,0.78,0.0129,0.78,0.78,0.78,0.79,0.79
4091,False,True,2.0,0.74,0.0157,0.73,0.74,0.74,0.75,0.75


In [6]:
# Or, more focused: a single fold size (4091), mean AUC split only by
# whether unseen features are allowed in folding.
(
    df
    .query('max_radius > 1000000 and '
           'fold_size == 4091 and '
           'zero_columns == "none" and '
           'num_fold_seeds < 2 and '
           'not scale and '
           'row_normalizer == "none"')
    .groupby('allow_unseen_in_folding')['auc']
    .mean()
)


allow_unseen_in_folding
False    0.78
True     0.74
Name: auc, dtype: float64

In [7]:
# What about the max radius?
#  - For small fold sizes, larger radii have no effect or a detrimental one
#    => due to a growing collision rate?
#  - For large fold sizes and no folding, the model is able to figure out how
#    not to overfit too much => clear gains
(
    df
    .query('row_normalizer == "none" and '
           'zero_columns == "none" and '
           'num_fold_seeds < 2 and '
           'not scale')
    .groupby(['fold_size',
              'is_binary',
              'allow_unseen_in_folding',
              'max_radius'])['auc']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
fold_size,is_binary,allow_unseen_in_folding,max_radius,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1023,False,False,2.0,2.0,0.71,0.00577,0.7,0.7,0.71,0.71,0.71
1023,False,False,3.0,2.0,0.75,0.00738,0.74,0.75,0.75,0.75,0.75
1023,False,False,4.0,2.0,0.73,0.00232,0.73,0.73,0.73,0.74,0.74
1023,False,False,inf,2.0,0.72,0.000689,0.72,0.72,0.72,0.72,0.72
1023,False,True,2.0,2.0,0.7,3.67e-05,0.7,0.7,0.7,0.7,0.7
1023,False,True,3.0,2.0,0.75,0.00887,0.74,0.75,0.75,0.75,0.75
1023,False,True,4.0,2.0,0.72,0.00305,0.72,0.72,0.72,0.72,0.72
1023,False,True,inf,2.0,0.71,0.0127,0.7,0.71,0.71,0.71,0.72
1023,True,False,2.0,2.0,0.69,0.0129,0.68,0.69,0.69,0.69,0.7
1023,True,False,3.0,2.0,0.73,0.00301,0.73,0.73,0.73,0.74,0.74


In [8]:
# What about scaling?
# What about removing duplicates ("representative columns")?
# What about hashing each substructure to different columns?
# What about L1 regularization (note that, as usual, it works worse than L2)?
# What about the same analysis with our original SMARTS feats? (computation needs to be better)
#   - What about completing with other substructures (e.g. our not-so-clever idea of adding FCFP to the mix)?
# What about just using FCFP itself?
# What about RF, now that sklearn supports CSR for building them?
# What about using the malaria dataset I mined out of chembl, so we can trust a bit more the results?
# What about speed differences between folded at different folds and unfolded?
#  - (might be surprising, as sparsity diminishes with smaller folds).

In [9]:
# Let's read the old results we have access to.
# In particular, we are missing the workflow1 model used for screening the commercial dataset
# (fingerprints with 4096 buckets, using their original monkey-patching weighting - which was detrimental).
from tdt_autopsy.eval import results_df
# Best AUC first, with a fresh 0..n-1 index.
rdf = (
    results_df()
    .sort_values(by='auc', ascending=False)
    .reset_index(drop=True)
)
# Substitute the old codes wherever they occur in the model names:
# 'ccl' -> 'workflow2', then 'sg' -> 'workflow1'.
rdf['model'] = (
    rdf['model']
    .str.replace('ccl', 'workflow2')
    .str.replace('sg', 'workflow1')
)
rdf

Unnamed: 0,model,dataset,auc,enrichment_1,enrichment_5,enrichment_10,bedroc20,rie20
0,workflow2_logregs_lastFold_linr,competition_benchmark,0.83,7.6,4.14,3.95,0.47,4.16
1,workflow2_logregs_avg_linr,competition_benchmark,0.83,7.6,4.73,3.55,0.47,4.16
2,workflow2_workflow1_blend,competition_benchmark,0.81,4.75,4.14,3.95,0.42,3.73
3,workflow1_moderner_4096,competition_benchmark,0.8,3.8,3.16,3.65,0.39,3.44
4,workflow1_modern,competition_benchmark,0.79,4.75,3.95,3.25,0.39,3.48
5,workflow1_moderner,competition_benchmark,0.79,4.75,3.95,3.25,0.39,3.48
6,workflow2_final_lastFold_avg,competition_benchmark,0.79,4.75,4.54,3.75,0.45,3.98
7,workflow2_final_avg_avg,competition_benchmark,0.79,4.75,4.34,3.95,0.44,3.96
8,workflow2_final_lastFold_linr,competition_benchmark,0.79,3.8,4.34,3.65,0.42,3.78
9,workflow2_logregs_lastFold_avg,competition_benchmark,0.79,7.6,4.34,3.65,0.45,3.99
