## Predict BBB permeability of holdout set (636 compounds)

-

### Modules

In [55]:
import os
import rdkit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import scipy
import sklearn

In [56]:
from os import path
from __future__ import print_function
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors3D
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn import model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.externals import joblib

### Get and Set working directory

In [58]:
# Make the project root the working directory so the relative 'Data/...' and
# './Models/...' paths used by later cells resolve.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
os.chdir('/Users/fabienplisson/Desktop/PLISSON LAB/RESEARCH/Publications/12_Marine-derived_kinase_inhibitors_BBB_model/')

### Path to save figures

In [59]:
# Directory that receives the PDF figures written by the plt.savefig calls below.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
figpath = '/Users/fabienplisson/Desktop/PLISSON LAB/RESEARCH/Publications/12_Marine-derived_kinase_inhibitors_BBB_model/Figures/'

### Set seed

In [60]:
# Random seed passed as random_state to train_test_split so the split is repeatable.
seed = 123

### Load datasets

In [61]:
# Read the normalised descriptor table (first CSV column becomes the row index)
# and replace missing descriptor values with 0 in the same expression.
normalized_df = pd.read_csv(
    'Data/DESCRIPTORS/datasetNormalizedDescrs.csv',
    index_col=0,
).fillna(0)
normalized_df.head(10)

Unnamed: 0,qed,MinAbsPartialCharge,NumRadicalElectrons,FpDensityMorgan2,FpDensityMorgan3,FpDensityMorgan1,HeavyAtomMolWt,MaxAbsEStateIndex,MaxAbsPartialCharge,MaxEStateIndex,MinPartialCharge,ExactMolWt,MolWt,NumValenceElectrons,MinEStateIndex,MinAbsEStateIndex,MaxPartialCharge,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA6,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,HeavyAtomCount,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_N,fr_Ar_NH,fr_Ar_OH,fr_COO,fr_COO2,fr_C_O,fr_C_O_noCOO,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_Nhpyrrole,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amide,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_barbitur,fr_benzene,fr_benzodiazepine,fr_bicyclic,fr_diazo,fr_dihydropyridine,fr_epoxide,fr_ester,fr_ether,fr_furan,fr_guanido,fr_halogen,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_isothiocyan,fr_ketone,fr_ketone_Topliss,fr_lactam,fr_lactone,fr_methoxy,fr_morpholin
e,fr_nitrile,fr_nitro,fr_nitro_arom,fr_nitro_arom_nonortho,fr_nitroso,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_phos_acid,fr_phos_ester,fr_piperdine,fr_piperzine,fr_priamide,fr_prisulfonamd,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
CPSM001,0.809004,0.206972,0.0,0.839161,0.890909,0.676923,0.1792,0.588648,0.504591,0.588648,0.495409,0.170258,0.170523,0.094891,0.791722,0.066742,0.215594,0.666508,0.100648,0.104419,0.091502,0.13277,0.106195,0.085605,0.12009,0.073313,0.11567,0.062331,0.09244,0.052051,0.07841,0.639712,3.04634e-39,0.098103,0.069539,0.001295,0.121164,0.031405,0.050267,0.064624,0.0,0.0,0.0,0.079962,0.0,0.0,0.0,0.04605,0.184554,0.107299,0.054529,0.022659,0.345722,0.0,0.072546,0.0,0.0,0.0,0.218162,0.0,0.072095,0.0,0.0,0.1,0.357731,0.030008,0.0,0.032478,0.0,0.183017,0.5,0.167853,0.0,0.041864,0.0,0.032131,0.0,0.022564,0.171735,0.042821,0.0,0.144028,0.24854,0.039245,0.153067,0.0,0.677392,0.0,0.0,0.0,0.0,0.0,0.0,0.005958,0.108029,0.0,0.097561,0.055556,0.04,0.0,0.0,0.0,0.142857,0.2,0.25,0.051282,0.058824,0.068966,0.0,0.0,0.0,0.0,0.133333,0.569689,0.135387,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CPSM002,0.712181,0.419505,0.0,0.639118,0.647934,0.536364,0.213784,0.727425,0.394492,0.727425,0.605508,0.205733,0.206332,0.173723,0.328707,0.000673,0.363072,0.667715,0.202541,0.184565,0.158408,0.191389,0.174119,0.139526,0.201177,0.124923,0.197592,0.100089,0.152278,0.087963,0.122252,0.559648,1.359329e-37,0.18282,0.124495,0.002636,0.178498,0.092709,0.0,0.0,0.1414,0.0,0.075234,0.222919,0.146556,0.0,0.0,0.0921,0.027447,0.0,0.29301,0.06001,0.382915,0.0,0.145091,0.0,0.0,0.318679,0.243493,0.0,0.0,0.104365,0.149754,0.0,0.242749,0.127577,0.059292,0.0,0.0,0.117446,1.0,0.169862,0.0,0.155889,0.07787,0.113301,0.0,0.130727,0.0,0.083961,0.032579,0.0,0.0,0.07849,0.373512,0.0,0.703434,0.0,0.0,0.0,0.0,0.0,0.0,0.085224,0.145818,0.272727,0.170732,0.166667,0.16,0.0,0.0,0.0,0.142857,0.2,0.25,0.128205,0.176471,0.189655,0.08,0.0,0.0,0.0,0.133333,0.451581,0.19247,0.0,0.071429,0.076923,0.0,0.0,0.285714,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.166667,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.142857,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CPSM003,0.374274,0.279071,0.0,0.931419,0.9311,0.747368,0.194875,0.518832,0.355767,0.518832,0.644233,0.190903,0.191521,0.147445,0.830614,0.182105,0.265625,0.500595,0.097518,0.154312,0.151399,0.18543,0.158896,0.139485,0.185594,0.099191,0.15194,0.07464,0.121302,0.058205,0.097172,0.582866,4.976433e-38,0.171752,0.202852,0.005716,0.16665,0.065397,0.0,0.0,0.17144,0.0,0.0,0.079962,0.0,0.252794,0.086069,0.0,0.158296,0.326672,0.036133,0.0,0.257991,0.243188,0.227325,0.045416,0.035924,0.253227,0.188451,0.0,0.077658,0.071952,0.0,0.0,0.289725,0.088578,0.03403,0.104207,0.043442,0.208514,0.0,0.0,0.0,0.092398,0.0,0.0,0.0,0.0,0.072537,0.276362,0.0,0.189778,0.165274,0.287722,0.069427,0.0,0.628126,0.0,0.0,0.0,0.0,0.0,0.0,0.007018,0.148276,0.416667,0.146341,0.111111,0.1,0.0,0.0,0.0,0.0,0.2,0.125,0.102564,0.117647,0.12069,0.12,0.0,0.0,0.0,0.066667,0.52576,0.188415,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.375,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060606,0.0
CPSM004,0.545128,0.34716,0.0,0.559441,0.685315,0.330769,0.209253,0.764368,0.506382,0.764368,0.493618,0.203497,0.20382,0.185401,0.72372,0.029385,0.312872,0.408193,0.245665,0.200538,0.192216,0.202924,0.216116,0.18927,0.198217,0.164579,0.173793,0.142718,0.136416,0.128684,0.109607,0.424339,2.9732469999999998e-36,0.176581,0.138383,0.00268,0.20692,0.091189,0.157166,0.0,0.0,0.189389,0.0,0.076923,0.0,0.0,0.0,0.166452,0.217302,0.096422,0.070313,0.043677,0.172528,0.0,0.072546,0.0,0.0,0.0,0.583105,0.0,0.35581,0.069669,0.0,0.3,0.121375,0.030008,0.0,0.0,0.0,0.582082,0.5,0.339154,0.0,0.078772,0.015601,0.062298,0.0,0.044397,0.401516,0.0,0.0,0.333673,0.263063,0.039245,0.215567,0.0,0.67715,0.0,0.0,0.0,0.0,0.0,0.0,0.003586,0.204128,0.0,0.203252,0.111111,0.08,0.0,0.0,0.0,0.428571,0.2,0.5,0.076923,0.117647,0.086207,0.06,0.0,0.0,0.0,0.266667,0.656664,0.23821,0.0,0.0,0.0,0.0,0.0,0.142857,0.25,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428571,0.0,0.071429,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CPSM005,0.588441,0.371039,0.0,0.761905,0.822727,0.507143,0.212047,0.74645,0.494849,0.74645,0.505151,0.212718,0.212767,0.211679,0.743584,0.076045,0.329441,0.329269,0.203165,0.219087,0.229969,0.229969,0.2339,0.221904,0.221904,0.186947,0.183622,0.156429,0.141319,0.129723,0.104223,0.425941,6.64886e-36,0.212446,0.202801,0.004773,0.22116,0.092482,0.052389,0.0,0.083914,0.189389,0.0,0.236848,0.0,0.07401,0.0,0.02408,0.225993,0.396599,0.08125,0.021018,0.045603,0.0,0.290182,0.0,0.092405,0.25496,0.500782,0.0,0.072095,0.105645,0.156625,0.1,0.0,0.099893,0.075963,0.062986,0.121732,0.357471,0.0,0.0,0.0,0.117286,0.0,0.030168,0.0,0.021817,0.218268,0.385646,0.0,0.201945,0.270499,0.198845,0.0625,0.0,0.607273,0.0,0.0,0.0,0.0,0.0,0.0,0.003586,0.220035,0.333333,0.219512,0.111111,0.14,0.0,0.0,0.0,0.0,0.6,0.375,0.153846,0.117647,0.12069,0.18,0.0,0.0,0.0,0.2,0.557709,0.252044,0.0,0.0,0.0,0.0,0.0,0.571429,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.166667,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060606,0.0
CPSM006,0.607608,0.44253,0.0,0.824242,0.776364,0.73,0.154597,0.680872,0.47788,0.680872,0.52212,0.159558,0.159597,0.167883,0.662249,0.034366,0.379049,0.644826,0.072884,0.167572,0.176558,0.176558,0.163234,0.168147,0.168147,0.143522,0.140969,0.128067,0.115697,0.109188,0.087724,0.607686,6.176765e-38,0.185954,0.175931,0.004035,0.158487,0.124931,0.0,0.0,0.0,0.0,0.040391,0.085302,0.083474,0.0,0.0,0.054968,0.114644,0.173455,0.158523,0.085969,0.045764,0.0,0.07739,0.05216,0.16267,0.186007,0.077038,0.0,0.0,0.074773,0.0,0.0,0.0,0.146337,0.084402,0.0,0.113997,0.087391,0.0,0.0,0.0,0.118576,0.016751,0.030168,0.0,0.095332,0.22791,0.102093,0.0,0.10361,0.0,0.150903,0.26803,0.0,0.607273,0.0,0.0,0.0,0.0,0.0,0.0,0.003586,0.18397,0.785714,0.154472,0.222222,0.12,0.111111,0.0,0.083333,0.0,0.0,0.0,0.128205,0.176471,0.103448,0.16,0.0,0.0,0.0,0.066667,0.475179,0.172592,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.071429,0.0,0.0,0.0,0.0,0.0,0.083333,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CPSM007,0.545778,0.371039,0.0,0.727273,0.781818,0.485714,0.251062,0.746724,0.35573,0.746724,0.64427,0.248105,0.24869,0.211679,0.744809,0.072411,0.329441,0.328758,0.209839,0.220895,0.228444,0.25091,0.23143,0.221604,0.240376,0.194447,0.216462,0.159421,0.157827,0.131238,0.121549,0.480384,5.437216e-36,0.219642,0.201841,0.005072,0.233646,0.06335,0.0,0.0,0.083914,0.189389,0.0,0.236848,0.0,0.07401,0.0,0.02408,0.327119,0.490777,0.0,0.0,0.167732,0.0,0.290182,0.0,0.112026,0.15941,0.527035,0.0,0.0,0.073593,0.156625,0.0,0.166667,0.078749,0.075963,0.125971,0.140886,0.345516,0.0,0.0,0.0,0.10562,0.0,0.030168,0.0,0.021817,0.218268,0.375495,0.029878,0.097386,0.219018,0.426567,0.0,0.0,0.607273,0.0,0.0,0.0,0.0,0.0,0.0,0.006919,0.204091,0.333333,0.219512,0.111111,0.12,0.0,0.0,0.0,0.0,0.6,0.375,0.128205,0.117647,0.12069,0.16,0.0,0.0,0.0,0.2,0.600723,0.266431,0.0,0.0,0.0,0.0,0.0,0.571429,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.166667,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060606,0.0
CPSM008,0.465948,0.408537,0.0,0.580349,0.590083,0.454545,0.255407,0.834297,0.481159,0.834297,0.518841,0.258465,0.258513,0.261314,0.635538,0.021451,0.355462,0.603686,0.206602,0.273799,0.281572,0.281572,0.266309,0.263308,0.263308,0.24961,0.245169,0.187499,0.169388,0.173119,0.139088,0.47478,2.682193e-35,0.289663,0.252925,0.006366,0.264678,0.123348,0.053006,0.0,0.0,0.0,0.040391,0.156886,0.076438,0.0,0.0,0.206334,0.158789,0.281158,0.196274,0.129749,0.092346,0.0,0.072546,0.0,0.201737,0.09555,0.38802,0.0,0.139524,0.0,0.115604,0.0,0.0,0.135579,0.095464,0.05292,0.257835,0.22762,0.0,0.264843,0.0,0.126248,0.069029,0.122054,0.0,0.094477,0.080416,0.267456,0.097789,0.198945,0.120097,0.109038,0.195638,0.0,0.607273,0.0,0.0,0.0,0.0,0.0,0.0,0.048877,0.246917,0.461538,0.260163,0.166667,0.12,0.0,0.0,0.0,0.142857,0.2,0.25,0.128205,0.176471,0.12069,0.22,0.0,0.0,0.0,0.133333,0.637807,0.295033,0.2,0.142857,0.153846,0.0,0.0,0.142857,0.0,0.0,0.2,0.2,0.071429,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CPSM009,0.112022,0.54474,0.0,0.535354,0.588636,0.3625,0.390499,0.825394,0.443765,0.825394,0.556235,0.391087,0.391226,0.372263,0.65624,0.02762,0.449973,0.316719,0.333642,0.381975,0.394823,0.417954,0.395195,0.38165,0.423299,0.350444,0.395971,0.280447,0.315343,0.238343,0.239624,0.353082,1.135981e-31,0.399053,0.387612,0.010484,0.401352,0.156069,0.060201,0.065003,0.0,0.0,0.08204,0.156886,0.253721,0.0,0.390892,0.295772,0.195598,0.286826,0.290777,0.107499,0.311119,0.0,0.371193,0.053836,0.23723,0.094715,0.690794,0.0,0.0,0.071952,0.252489,0.0,0.237222,0.187701,0.210142,0.053836,0.228051,0.583284,0.0,0.0,0.0,0.16906,0.068139,0.122633,0.0,0.145915,0.307993,0.212313,0.087199,0.290856,0.572787,0.271258,0.0625,0.0,0.642006,0.0,0.0,0.0,0.0,0.0,0.0,0.003586,0.351422,0.416667,0.382114,0.166667,0.2,0.0,0.0,0.0,0.285714,0.4,0.5,0.230769,0.176471,0.206897,0.34,0.0,0.0,0.0,0.266667,0.697694,0.447767,0.0,0.071429,0.076923,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.214286,0.214286,0.0,0.0,0.0,0.375,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.230769,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
CPSM010,0.330905,0.260294,0.0,0.527629,0.63262,0.329412,0.268284,0.832089,0.507966,0.832089,0.492034,0.266423,0.266529,0.252555,0.759592,0.02856,0.252595,0.31194,0.274594,0.260352,0.267249,0.278815,0.28407,0.275365,0.294694,0.244656,0.269576,0.220982,0.237887,0.200127,0.205536,0.415532,2.387314e-34,0.244016,0.207715,0.004239,0.276215,0.091942,0.217367,0.065003,0.0,0.0,0.0,0.155537,0.0,0.0,0.195446,0.025487,0.444417,0.302311,0.0,0.08761,0.208578,0.0,0.071322,0.0,0.054588,0.352661,0.514884,0.0,0.347204,0.032052,0.0,0.3,0.118611,0.140179,0.0,0.0,0.121138,0.500604,0.0,0.316014,0.0,0.08848,0.0,0.094429,0.0,0.067824,0.215848,0.364995,0.164332,0.333673,0.210451,0.038583,0.0625,0.0,0.624349,0.0,0.0,0.0,0.0,0.0,0.0,0.003586,0.255746,0.25,0.268293,0.111111,0.1,0.0,0.1,0.083333,0.428571,0.2,0.5,0.153846,0.117647,0.103448,0.14,0.0,0.125,0.1,0.333333,0.686174,0.31902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428571,0.0,0.071429,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0


In [62]:
# Positional split of the descriptor table: the first 332 rows are used for
# modelling; rows at positions 332 up to (not including) 968 form the holdout set.
model_set = normalized_df.iloc[:332]
holdout_set = normalized_df.iloc[332:968]

In [63]:
# Load the logBB values (first CSV column becomes the row index).
logBB_df = pd.read_csv('Data/logBB_values.csv', index_col=0)

# Binary label: 1 when logBB is above 0.1, otherwise 0. The label column is
# added to logBB_df itself; BBclass_df is the same table without the raw logBB column.
logBB_df['BBclass'] = np.where(logBB_df['logBB'] > 0.1, 1, 0)
BBclass_df = logBB_df.drop(['logBB'], axis=1)
BBclass_df.head()

Unnamed: 0,BBclass
CPSM001,0
CPSM002,0
CPSM003,0
CPSM004,0
CPSM005,0


In [64]:
# Hold back 25% of the modelling set for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    model_set,
    BBclass_df,
    test_size=0.25,
    random_state=seed,
)

### Load models

In [None]:
# Saved estimators, one pickle file per model.
# joblib.load unpickles the file, so only load model files from a trusted source.
model_files = (
    './Models/model1.pkl',
    './Models/model2.pkl',
    './Models/model3.pkl',
    './Models/model4.pkl',
    './Models/model5.pkl',
    './Models/model6.pkl',
)
model1, model2, model3, model4, model5, model6 = (
    joblib.load(model_file) for model_file in model_files
)

In [None]:
# Refit every loaded estimator on the training split.
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for column vectors, so flatten it once here.
y_train_1d = y_train.values.ravel()
for estimator in (model1, model2, model3, model4, model5, model6):
    estimator.fit(x_train, y_train_1d)

  if __name__ == '__main__':


In [None]:
# NOTE(review): only the first three models are looped over here, while the
# cells below handle all six individually - confirm whether that is intended.
models = [model1, model2, model3]

# One prediction table per model, collected in a list and concatenated once
# after the loop (the previous version overwrote `result` on every pass and
# joined the predictions with the training descriptors instead of the holdout set).
holdout_predictions = []
model_labels = []

for position, m in enumerate(models, start=1):
    # Flatten the single-column target so scikit-learn receives a 1-D array.
    m.fit(x_train, y_train.values.ravel())

    # Index both tables by compound so rows stay tied to the holdout compounds.
    class_df = pd.DataFrame(m.predict(holdout_set),
                            index=holdout_set.index,
                            columns=['pred_class'])
    probs_df = pd.DataFrame(m.predict_proba(holdout_set),
                            index=holdout_set.index,
                            columns=['prob 0', 'prob 1'])

    model_df = class_df.join(probs_df)

    holdout_predictions.append(model_df)
    model_labels.append('model{}'.format(position))

# Columns become a two-level index: (model label, pred_class / prob 0 / prob 1).
result = pd.concat(holdout_predictions, axis=1, keys=model_labels)

result


In [None]:
# model1: predicted class and class-membership probabilities for the holdout compounds.
class_df = pd.DataFrame(model1.predict(holdout_set))
probs_df = pd.DataFrame(model1.predict_proba(holdout_set))

# Align the two tables on their (positional) row index, then give the columns readable names.
model1_df = pd.merge(class_df, probs_df, how='outer', left_index=True, right_index=True)
model1_df.columns = ['pred_class', 'prob 0', 'prob 1']

In [None]:
# model2: predicted class and class-membership probabilities for the holdout compounds.
class_df = pd.DataFrame(model2.predict(holdout_set))
probs_df = pd.DataFrame(model2.predict_proba(holdout_set))

# Align the two tables on their (positional) row index, then give the columns readable names.
model2_df = pd.merge(class_df, probs_df, how='outer', left_index=True, right_index=True)
model2_df.columns = ['pred_class', 'prob 0', 'prob 1']

In [None]:
# model3: predicted class and class-membership probabilities for the holdout compounds.
class_df = pd.DataFrame(model3.predict(holdout_set))
probs_df = pd.DataFrame(model3.predict_proba(holdout_set))

# Align the two tables on their (positional) row index, then give the columns readable names.
model3_df = pd.merge(class_df, probs_df, how='outer', left_index=True, right_index=True)
model3_df.columns = ['pred_class', 'prob 0', 'prob 1']


In [None]:
# model4: predicted class and class-membership probabilities for the holdout compounds.
class_df = pd.DataFrame(model4.predict(holdout_set))
probs_df = pd.DataFrame(model4.predict_proba(holdout_set))

# Align the two tables on their (positional) row index, then give the columns readable names.
model4_df = pd.merge(class_df, probs_df, how='outer', left_index=True, right_index=True)
model4_df.columns = ['pred_class', 'prob 0', 'prob 1']

In [None]:
# model5: predicted class and class-membership probabilities for the holdout compounds.
class_df = pd.DataFrame(model5.predict(holdout_set))
probs_df = pd.DataFrame(model5.predict_proba(holdout_set))

# Align the two tables on their (positional) row index, then give the columns readable names.
model5_df = pd.merge(class_df, probs_df, how='outer', left_index=True, right_index=True)
model5_df.columns = ['pred_class', 'prob 0', 'prob 1']

In [None]:
# model6: predicted class and class-membership probabilities for the holdout compounds.
class_df = pd.DataFrame(model6.predict(holdout_set))
probs_df = pd.DataFrame(model6.predict_proba(holdout_set))

# Align the two tables on their (positional) row index, then give the columns readable names.
model6_df = pd.merge(class_df, probs_df, how='outer', left_index=True, right_index=True)
model6_df.columns = ['pred_class', 'prob 0', 'prob 1']

In [None]:
# Per-model prediction tables, in model order; each has the columns
# 'pred_class', 'prob 0' and 'prob 1'.
models_df = [model1_df, model2_df, model3_df, model4_df, model5_df, model6_df]

In [None]:
# Place the six prediction tables side by side (three columns per model, so
# column labels repeat), keeping only row labels present in every table.
results = pd.concat(models_df, axis=1, join='inner')
# Replace the positional row labels with the holdout compound identifiers.
results.index = holdout_set.index
results

#### Comparative heatmap of the 6 models against 636 compounds (divided in 3 groups - CPSMs, KDs and MDKIs)

One way of doing it is to select columns "prob 1" for each model, keep the indices and create a heatmap based on this subset dataframe. Rename column names. Colour the gradient from blue (0) to red (1) based on probability values.

In [None]:
# Widen the pandas display limits so the full probability table can be shown.
# The former 'display.height' option is no longer a valid pandas option and
# raises OptionError, so it is not set here; 'display.max_rows' controls the
# number of rows shown.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# `results` holds three columns per model (pred_class, prob 0, prob 1), so
# positions 2, 5, ..., 17 are the 'prob 1' columns of models 1 to 6.
subset = results.iloc[:,[2,5,8,11,14,17]]
subset.columns = ['model1', 'model2', 'model3', 'model4', 'model5', 'model6']
subset

In [None]:
import matplotlib as mpl
def reverse_colourmap(cmap, name = 'my_cmap_r'):
    """Build a reversed copy of a segment-data based colormap.

    Parameters
    ----------
    cmap : colormap object exposing ``_segmentdata``
        A mapping from channel name to either a sequence of ``(x, y0, y1)``
        rows or a callable of ``x``.
    name : str
        Name given to the new colormap.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        Colormap whose channels are the mirror image of those in ``cmap``.

    Notes
    -----
    Each row ``(x, y0, y1)`` becomes ``(1 - x, y1, y0)``: the position is
    mirrored and the values on either side of the anchor swap places.
    Rows are emitted in reverse order rather than re-sorted, because sorting
    the mirrored tuples can reorder rows that share the same ``x``.
    Callable channels are wrapped so they are evaluated at ``1 - x``.
    NOTE(review): matplotlib colormaps also offer ``reversed()`` and built-in
    ``*_r`` variants - confirm availability in the installed version before
    replacing this helper.
    """
    reversed_segments = {}

    for channel_name, channel in cmap._segmentdata.items():
        if callable(channel):
            # Bind the channel as a default argument so each wrapper keeps its own function.
            reversed_segments[channel_name] = (
                lambda x, _forward=channel: _forward(1 - x)
            )
        else:
            reversed_segments[channel_name] = [
                (1 - x, y1, y0) for x, y0, y1 in list(channel)[::-1]
            ]

    return mpl.colors.LinearSegmentedColormap(name, reversed_segments)

In [None]:
# Reverse matplotlib's RdBu colormap with the helper above; the result is
# passed as `cmap` to the class-1 probability heatmaps.
cmap = mpl.cm.RdBu
BuRd = reverse_colourmap(cmap)

In [None]:
# Heatmap of the class-1 probabilities (rows: holdout compounds, columns: models),
# written to the figure directory as a PDF and then closed.
fig, ax = plt.subplots(figsize=(10,20))
sns.heatmap(subset, vmin=0, vmax=1, cmap=BuRd, ax=ax)

ax.set_xlabel('Models')
fig.suptitle('Probability for BBB permeability (positive logBB value)', x=0.42, y =0.90, fontsize=14)
fig.savefig(path.join(figpath,"Figure4.1_Probability for BBB permeability (positive logBB value).pdf"))
plt.close(fig)

In [None]:
# Split the class-1 probability table into the three compound groups by row position.
cpsms = subset.iloc[0:116]
kds = subset.iloc[116:165]
mdkis = subset.iloc[165:637]

In [None]:
# Heatmap for the `cpsms` slice, written to the figure directory as a PDF and then closed.
fig, ax = plt.subplots(figsize=(10,20))
sns.heatmap(cpsms, vmin=0, vmax=1, cmap=BuRd, ax=ax)

ax.set_xlabel('Models')
fig.suptitle('Probability for BBB permeability (positive logBB value)', x=0.42, y =0.90, fontsize=14)
fig.savefig(path.join(figpath,"Figure4.2_Probability for BBB permeability (positive logBB value)_CPSMs.pdf"))
plt.close(fig)

In [None]:
# Heatmap for the `kds` slice, written to the figure directory as a PDF and then closed.
fig, ax = plt.subplots(figsize=(10,20))
sns.heatmap(kds, vmin=0, vmax=1, cmap=BuRd, ax=ax)

ax.set_xlabel('Models')
fig.suptitle('Probability for BBB permeability (positive logBB value)', x=0.42, y =0.90, fontsize=14)
fig.savefig(path.join(figpath,"Figure4.2_Probability for BBB permeability (positive logBB value)_KDs.pdf"))
plt.close(fig)

In [None]:
# Heatmap for the `mdkis` slice (taller figure), written to the figure directory as a PDF and then closed.
fig, ax = plt.subplots(figsize=(10,30))
sns.heatmap(mdkis, vmin=0, vmax=1, cmap=BuRd, ax=ax)

ax.set_xlabel('Models')
fig.suptitle('Probability for BBB permeability (positive logBB value)', x=0.42, y =0.90, fontsize=14)
fig.savefig(path.join(figpath,"Figure4.2_Probability for BBB permeability (positive logBB value)_MDKIs.pdf"))
plt.close(fig)

In [None]:
# `results` holds three columns per model (pred_class, prob 0, prob 1), so every
# third position starting at 1 (1, 4, ..., 16) is a 'prob 0' column.
prob0_positions = list(range(1, 18, 3))
subset2 = results.iloc[:, prob0_positions]
subset2.columns = ['model1', 'model2', 'model3', 'model4', 'model5', 'model6']
subset2

In [None]:
# Heatmap of the class-0 probabilities (rows: holdout compounds, columns: models),
# written to the figure directory as a PDF and then closed.
fig, ax = plt.subplots(figsize=(10,20))
sns.heatmap(subset2, vmin=0, vmax=1, cmap="RdBu", ax=ax)

ax.set_xlabel('Models')
fig.suptitle('Probability for BBB permeability (negative logBB value)', x=0.42, y =0.90, fontsize=14)
fig.savefig(path.join(figpath,"Figure4.1_Probability for BBB permeability (negative logBB value).pdf"))
plt.close(fig)

In [None]:
# Split the class-0 probability table into the three compound groups by row position.
# Note: this rebinds cpsms / kds / mdkis, which previously held the class-1 slices.
cpsms = subset2.iloc[0:116]
kds = subset2.iloc[116:165]
mdkis = subset2.iloc[165:637]

In [None]:
# Heatmap for the `cpsms` slice, written to the figure directory as a PDF and then closed.
fig, ax = plt.subplots(figsize=(10,20))
sns.heatmap(cpsms, vmin=0, vmax=1, cmap="RdBu", ax=ax)

ax.set_xlabel('Models')
fig.suptitle('Probability for BBB permeability (negative logBB value)', x=0.42, y =0.90, fontsize=14)
fig.savefig(path.join(figpath,"Figure4.3_Probability for BBB permeability (negative logBB value)_CPSMs.pdf"))
plt.close(fig)

In [None]:
# Heatmap for the `kds` slice, written to the figure directory as a PDF and then closed.
fig, ax = plt.subplots(figsize=(10,20))
sns.heatmap(kds, vmin=0, vmax=1, cmap="RdBu", ax=ax)

ax.set_xlabel('Models')
fig.suptitle('Probability for BBB permeability (negative logBB value)', x=0.42, y =0.90, fontsize=14)
fig.savefig(path.join(figpath,"Figure4.3_Probability for BBB permeability (negative logBB value)_KDs.pdf"))
plt.close(fig)

In [None]:
# Heatmap for the `mdkis` slice (taller figure), written to the figure directory as a PDF and then closed.
fig, ax = plt.subplots(figsize=(10,30))
sns.heatmap(mdkis, vmin=0, vmax=1, cmap="RdBu", ax=ax)

ax.set_xlabel('Models')
fig.suptitle('Probability for BBB permeability (negative logBB value)', x=0.42, y =0.90, fontsize=14)
fig.savefig(path.join(figpath,"Figure4.3_Probability for BBB permeability (negative logBB value)_MDKIs.pdf"))
plt.close(fig)

In general, all 6 models yield the same probability results for the 636 compounds, with a few uncertain predictions around 0.50. We observed that CPSMs are predicted to have either a positive or a negative logBB value, while KDs and MDKIs are mostly predicted not to cross the blood-brain barrier, with a moderate to high probability of belonging to class 0 (blue, negative logBB value), with a few exceptions:

- (+) CPSM335, 337, 343, 344, 347, 350, 351, 354, 356, 362, 364, 367, 385, 391, 392, 395, 410, 413, 429, 430, 438, 440, 441, 443, 445 
- (-) CPSM333, 334, 336, 347, 361, 366, 368, 375
- (+) KD22 
- (-) KD01, 02, 03, 04, 17, 33, 37, 40, 42, 49
- (+) MDKI29, 44, 52, 89, 90, 91, 92, 96, 97, 150, 151, 197, 243, 268, 269, 273, 274, 275, 340, 343, 421, 470 
- (-) MDKI03, 06, 09, 15, 22, 388, 430

#### Projection of probabilities to pass blood-brain barrier (class 1) or not (class 0)  onto principal component analysis

In [None]:
# Export 'subset' (per-model 'prob 1' columns for the holdout compounds) as a CSV file
subset.to_csv('./Data/holdout_set_probs_class1.csv')

In [None]:
# Export 'subset2' (per-model 'prob 0' columns for the holdout compounds) as a CSV file
subset2.to_csv('./Data/holdout_set_probs_class0.csv')