In [1]:
import rdkit
import pandas as pd
from rdkit import Chem
import plotly.express as px
from rdkit.Chem import PandasTools
from scopy.ScoDruglikeness import molproperty

In [2]:
# Read the ligand table from the CSV in the working directory, then preview
# its first two rows as the cell output.
df = pd.read_csv(filepath_or_buffer='ligprep_oral_bioavailability_unique_out.csv')
df.head(n=2)

Unnamed: 0,smiles,NAME_,F,pref_target_name
0,[H]C([H])([H])N(C(=NC(N([H])[H])=[N+]([H])[H])...,CHEMBL1431,43.0,Rattus norvegicus
1,[H]C1([H])[C@]([H])([C@@]([H])([C@@]([H])(C([H...,CHEMBL80254,89.0,Rattus norvegicus


In [3]:
# Derive descriptor columns from each SMILES string with scopy's `molproperty`
# helpers. Every SMILES is parsed exactly once and the resulting RDKit Mol is
# reused for all six descriptors; previously each SMILES was re-parsed six
# times (once per column).
mols = df['smiles'].map(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a SMILES it cannot parse. Fail early and
# name the offending rows instead of hitting an opaque error inside one of the
# descriptor functions below.
unparsed = mols.isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} SMILES could not be parsed by RDKit; "
        f"first offending index values: {list(df.index[unparsed][:5])}"
    )

# Column name -> descriptor function. Insertion order is the order in which
# the columns are appended to `df` (same order as before).
descriptor_funcs = {
    'MW': molproperty.CalculateMolWeight,
    'logP': molproperty.CalculateLogP,
    'logD': molproperty.CalculateLogD,
    'TPSA': molproperty.CalculateTPSA,
    'HBD': molproperty.CalculateNumHDonors,
    'pKa': molproperty.CalculatepKa,
}
for column, func in descriptor_funcs.items():
    df[column] = mols.map(func)

# NOTE(review): the recorded preview of this frame shows a row with pKa = inf
# and logD around -3.8e12, and the null count shows many missing pKa values.
# These look like numerical failures of the pKa/logD estimators; confirm and
# decide whether to mask such rows before plotting or modelling.

In [12]:
# Per-column count of missing values, used to spot descriptors that could not
# be calculated.
df.isna().sum()

smiles                 0
NAME_                  0
F                      0
pref_target_name       0
MW                     0
logP                   0
logD                   0
TPSA                   0
HBD                    0
pKa                 1308
dtype: int64

In [46]:
# Show the first ten rows, now including the calculated descriptor columns.
df.head(n=10)

Unnamed: 0,smiles,NAME_,F,pref_target_name,MW,logP,logD,TPSA,HBD,pKa
0,[H]C([H])([H])N(C(=NC(N([H])[H])=[N+]([H])[H])...,CHEMBL1431,43.0,Rattus norvegicus,130.11,-3.06,-1.498713,93.23,3,
1,[H]C1([H])[C@]([H])([C@@]([H])([C@@]([H])(C([H...,CHEMBL80254,89.0,Rattus norvegicus,134.08,-3.35,-1.484285,77.3,4,
2,[H]c1c([H])c([H])nc([H])c1C([O-])=O,CHEMBL573,100.0,Mus musculus,122.02,-0.55,-0.9549405,53.02,0,7.212303
3,[H]c1c(C(N([H])N([H])[H])=O)c([H])c([H])nc1[H],CHEMBL64,17.36,Rattus norvegicus,137.06,-0.31,0.4855731,68.01,2,
4,[H]c1c([H])c(nc([H])c1C(N([H])[H])=O)N([H])C([...,CHEMBL4218848,19.0,Mus musculus,151.07,0.22,0.7298126,68.01,2,
5,[H]C([H])(C([H])([H])[C@@](C([O-])=O)(C([H])(F...,CHEMBL830,62.3,Rattus norvegicus,182.09,-2.28,-1.832335,93.79,2,
6,[H]C([H])(C([H])([H])[C@](C([O-])=O)(C([H])(F)...,CHEMBL222838,41.0,Rattus norvegicus,182.09,-2.28,-1.832335,93.79,2,
7,[H]C([H])([H])C1=C(C([H])([H])OC(C([H])([H])[H...,CHEMBL4283547,33.0,Rattus norvegicus,172.04,0.6,1.385633,69.65,0,
8,[H]C1([H])C(=C(F)F)[C@]([H])([C@@]([H])(C([O-]...,CHEMBL2203661,79.0,Rattus norvegicus,177.06,-1.09,-1.216587,67.77,1,7.870565
9,[H]C([H])([H])C([H])([H])OP(C(P([O-])([O-])=O)...,CHEMBL1160483,37.9,Rattus norvegicus,268.9,-0.42,-3761763000000.0,112.55,0,inf


In [32]:
# Scatter of the F column against molecular weight (MW).
# The former `df.sort_values(by=['MW'])` line was removed: sort_values returns
# a new frame rather than sorting in place, and its result was discarded, so
# it never affected `df` or the plot. Row order is irrelevant to a scatter.
fig = px.scatter(df, x="MW", y="F", title="F vs. MW")
fig.show()

In [45]:
# Distribution of molecular weight (MW) across the data set.
fig = px.histogram(
    data_frame=df,
    x="MW",
)
fig.show()

In [35]:
# Scatter of the F column against the calculated logD.
# The former `df.sort_values(by=['logD'])` line was removed: sort_values
# returns a new frame rather than sorting in place, and its result was
# discarded, so it never affected `df` or the plot.
fig = px.scatter(df, x="logD", y="F", title="F vs. logD")
fig.show()

In [40]:
# Scatter of the F column against the calculated logP.
# The former `df.sort_values(by=['logP'])` line was removed: sort_values
# returns a new frame rather than sorting in place, and its result was
# discarded, so it never affected `df` or the plot.
fig = px.scatter(df, x="logP", y="F", title="F vs. logP")
fig.show()

In [17]:
# Scatter of the F column against the hydrogen-bond donor count (HBD).
# The former `df.sort_values(by=['HBD'])` line was removed: sort_values
# returns a new frame rather than sorting in place, and its result was
# discarded, so it never affected `df` or the plot.
fig = px.scatter(df, x="HBD", y="F", title="F vs. HBD")
fig.show()

In [18]:
# Scatter of the F column against the calculated TPSA.
# The former `df.sort_values(by=['TPSA'])` line was removed: sort_values
# returns a new frame rather than sorting in place, and its result was
# discarded, so it never affected `df` or the plot.
fig = px.scatter(df, x="TPSA", y="F", title="F vs. TPSA")
fig.show()

In [24]:
# 3-D scatter: MW on x, TPSA on y, F on z; markers are coloured by F as well.
fig = px.scatter_3d(
    data_frame=df,
    x='MW',
    y='TPSA',
    z='F',
    color='F',
)
fig.show()

In [25]:
# Pairwise scatter matrix over five calculated descriptors, coloured by F.
fig = px.scatter_matrix(
    data_frame=df,
    dimensions=["MW", "TPSA", "HBD", "logD", "logP"],
    color="F",
)
fig.show()

In [31]:
# Ternary scatter with F, MW and TPSA on the a/b/c axes, coloured by F.
# NOTE(review): ternary plots are normally used for parts of a whole; F, MW
# and TPSA are on different scales, so confirm this view is meaningful.
fig = px.scatter_ternary(
    data_frame=df,
    a="F",
    b="MW",
    c="TPSA",
    color="F",
)
fig.show()