# chembl websource

In [194]:
from chembl_webresource_client.new_client import new_client
from IPython.display import SVG
import pandas as pd
import csv
import time
import json

available_resources = [resource for resource in dir(new_client) if not resource.startswith('_')]
print(available_resources)



### Filtering for assays done on human proteins:

In [2]:
clearance_assays1 = new_client.assay.filter(assay_organism = 'Homo sapiens', description__icontains = 'clearance')
clearance_assays2 = new_client.assay.filter(assay_organism = 'Homo sapiens', description__icontains = 'human liver microsome')
# assays = new_client.assay.filter(assay_organism = 'Homo sapiens', description__icontains = 'clearance', assay_subcellular_fraction = 'Microsome')

# can't filter on bao-label to get single protein format, for some reason?

print(
    len(clearance_assays1),
    len(clearance_assays2)
     )

4576 12165


#### *According to this filtering, there are 2542 assays with 'intrinsic clearance' in description. 4576 assays with "clearance" in description. 12165 assays with 'human liver microsome' in description.*

#### converting QuerySet object to DataFrame using this method is painfully slow (about 1 min for 2542 rows):

pd.DataFrame.from_records(QuerySet)

This way seems to work faster:

df = pd.DataFrame(
    list(QuerySet), columns=["desired column", "another desired column"]
)

In [3]:
start_time = time.time()
# --------------
chemblid1 = clearance_assays1.only(['clearance_assays_assay_chembl_id'])
chemblid2 = clearance_assays2.only(['clearance_assays_assay_chembl_id'])

df1 = pd.DataFrame(
    list(clearance_assays1), columns=["assay_chembl_id"]
)
df2 = pd.DataFrame(
    list(clearance_assays2), columns=["assay_chembl_id"]
)

assayIDs1 = df1["assay_chembl_id"].tolist()
assayIDs2 = df2["assay_chembl_id"].tolist()
assayIDs = list(set(assayIDs1 + assayIDs2))
# ------------------
print("There are", len(assayIDs), "assays.")
print("This took", time.time() - start_time, "to run.")

There are 14584 assays.
This took 3.17113995552063 to run.


#### Then, for each of those assay's chembl_id, find the activity values. This doesn't take long.

##### Why not just skip the chembl_id step? Because you can't filter activity set by assay_description, for some reason.

In [12]:
# Method 1 -- filtering by assay_chembl_id, only those that we found to include intrinsic clearance in the description, and on Homo sapiens
start_time = time.time()

activities = new_client.activity.filter( assay_chembl_id__in = assayIDs)

# ------------------
# print("There are", len(activities), "clearance values for", len(assayIDs), "assays.")
print("This took", time.time() - start_time, "to run.")

This took 0.009608745574951172 to run.


In [13]:
start_time = time.time()
def add_to_list(df, i):
    df.loc[len(df)] = (activities[i])

activities_df = pd.DataFrame(activities[0])

for i in range(len(activities)):
    add_to_list(activities_df, i)

activities_df = activities_df.loc[activities_df.astype(str).drop_duplicates().index]

print("This took", time.time() - start_time, "to run")
activities_df.to_csv("activities_values_to_filter_in_R_test.csv", index = False)
activities_df.head()

This took 28385.659880638123 to run


Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,33031,[],CHEMBL695026,In vitro oxidative metabolic stability measure...,F,,,BAO_0002759,...,Homo sapiens,Homo sapiens,9606,,,Clint,ml min-1 kg-1,,,47.7
1,,,34270,[],CHEMBL695026,In vitro oxidative metabolic stability measure...,F,,,BAO_0002759,...,Homo sapiens,Homo sapiens,9606,,,Clint,ml min-1 kg-1,,,4.9
2,,,34274,[],CHEMBL695026,In vitro oxidative metabolic stability measure...,F,,,BAO_0002759,...,Homo sapiens,Homo sapiens,9606,,,Clint,ml min-1 kg-1,,,16.5
3,,,34484,[],CHEMBL708969,Rate of Oxidation in human liver microsomes is...,A,,,BAO_0000179,...,Homo sapiens,Cytochrome P450 3A4,9606,,,Rate of oxidation,,,,0.41
4,,,35507,[],CHEMBL695026,In vitro oxidative metabolic stability measure...,F,,,BAO_0002759,...,Homo sapiens,Homo sapiens,9606,,,Clint,ml min-1 kg-1,,,8.0


With 78104 records, this takes 7.9 hours.

In [28]:
df = pd.read_csv('activities_values_to_filter_in_R.csv', index_col = False, low_memory=False)
assay = len(df['molecule_chembl_id'].tolist())
chems = len(set(df['molecule_chembl_id'].tolist()))

print(f"There are {assay} assays for {chems} unique chemicals.")

There are 78104 assays for 47578 unique chemicals.


#### Next, searching for inchi key given the molecule_chembl_ids in the activities df.

Iterate through all the Molecule IDs in df['molecule_chembl_id'], filter molecules to get the respective inchi_key

Put it all together with list comprehension.

In [209]:
import numpy as np
import math

df = pd.read_csv('activities_values_to_filter_in_R.csv', index_col = False, low_memory=False)[0:10000]

list_df = np.array_split(df, math.ceil(len(df)/1000))
# see for help: https://stackoverflow.com/questions/44729727/pandas-slice-large-dataframe-into-chunks

inchi_keys = []

start_time = time.time()
for i in list_df:
    for chembl_id in i['molecule_chembl_id'].tolist():
        try:
            inchi_keys += [new_client.molecule.filter(molecule_chembl_id = chembl_id)[0]["molecule_structures"]["standard_inchi_key"]]
        except TypeError:
            inchi_keys += ["NA"]
print("This took", time.time() - start_time, "to run")

df['standard_inchi_key'] = inchi_keys
df1 = df
df1.to_csv("activities_with_inchikeys_to_filter_in_R_p1.csv", index = False)

# running the chunk on the full df returns a TypeError:
# due to some chemicals, like df.loc[11669], CHEMBL1200457, which do not have an inchikey in chembl's database.

  return bound(*args, **kwds)


This took 34.76613116264343 to run


In [211]:
import numpy as np
import math

df = pd.read_csv('activities_values_to_filter_in_R.csv', index_col = False, low_memory=False)[10000:20000]

list_df = np.array_split(df, math.ceil(len(df)/1000))
# see for help: https://stackoverflow.com/questions/44729727/pandas-slice-large-dataframe-into-chunks

inchi_keys = []

start_time = time.time()
for i in list_df:
    for chembl_id in i['molecule_chembl_id'].tolist():
        try:
            inchi_keys += [new_client.molecule.filter(molecule_chembl_id = chembl_id)[0]["molecule_structures"]["standard_inchi_key"]]
        except TypeError:
            inchi_keys += ["NA"]
print("This took", time.time() - start_time, "to run")

df['standard_inchi_key'] = inchi_keys
df2 = df
df2.to_csv("activities_with_inchikeys_to_filter_in_R_p2.csv", index = False)

# running the chunk on the full df returns a TypeError:
# due to some chemicals, like df.loc[11669], CHEMBL1200457, which do not have an inchikey in chembl's database.

  return bound(*args, **kwds)


This took 3273.262047767639 to run


In [240]:
import numpy as np
import math

df = pd.read_csv('activities_values_to_filter_in_R.csv', index_col = False, low_memory=False)[20000:30000]

list_df = np.array_split(df, math.ceil(len(df)/1000))
# see for help: https://stackoverflow.com/questions/44729727/pandas-slice-large-dataframe-into-chunks

inchi_keys = []

start_time = time.time()
for i in list_df:
    for chembl_id in i['molecule_chembl_id'].tolist():
        try:
            inchi_keys += [new_client.molecule.filter(molecule_chembl_id = chembl_id)[0]["molecule_structures"]["standard_inchi_key"]]
        except TypeError:
            inchi_keys += ["NA"]
print("This took", time.time() - start_time, "to run")

df['standard_inchi_key'] = inchi_keys
df3 = df
df3.to_csv("activities_with_inchikeys_to_filter_in_R_p2.csv", index = False)

# running the chunk on the full df returns a TypeError:
# due to some chemicals, like df.loc[11669], CHEMBL1200457, which do not have an inchikey in chembl's database.

  return bound(*args, **kwds)


This took 1706.0359268188477 to run


In [241]:
# this chunk gets a timeout error

import numpy as np
import math

df = pd.read_csv('activities_values_to_filter_in_R.csv', index_col = False, low_memory=False)[30000:40000]

list_df = np.array_split(df, math.ceil(len(df)/1000))
# see for help: https://stackoverflow.com/questions/44729727/pandas-slice-large-dataframe-into-chunks

inchi_keys = []

start_time = time.time()
for i in list_df:
    for chembl_id in i['molecule_chembl_id'].tolist():
        try:
            inchi_keys += [new_client.molecule.filter(molecule_chembl_id = chembl_id)[0]["molecule_structures"]["standard_inchi_key"]]
        except TypeError:
            inchi_keys += ["NA"]
print("This took", time.time() - start_time, "to run")

df['standard_inchi_key'] = inchi_keys
df4 = df
df4.to_csv("activities_with_inchikeys_to_filter_in_R_p3.csv", index = False)

# running the chunk on the full df returns a TypeError:
# due to some chemicals, like df.loc[11669], CHEMBL1200457, which do not have an inchikey in chembl's database.

  return bound(*args, **kwds)


This took 85.80057764053345 to run


In [242]:
import numpy as np
import math

df = pd.read_csv('activities_values_to_filter_in_R.csv', index_col = False, low_memory=False)[40000:50000]

list_df = np.array_split(df, math.ceil(len(df)/1000))
# see for help: https://stackoverflow.com/questions/44729727/pandas-slice-large-dataframe-into-chunks

inchi_keys = []

start_time = time.time()
for i in list_df:
    for chembl_id in i['molecule_chembl_id'].tolist():
        try:
            inchi_keys += [new_client.molecule.filter(molecule_chembl_id = chembl_id)[0]["molecule_structures"]["standard_inchi_key"]]
        except TypeError:
            inchi_keys += ["NA"]
print("This took", time.time() - start_time, "to run")

df['standard_inchi_key'] = inchi_keys
df5 = df
df5.to_csv("activities_with_inchikeys_to_filter_in_R_p4.csv", index = False)

# running the chunk on the full df returns a TypeError:
# due to some chemicals, like df.loc[11669], CHEMBL1200457, which do not have an inchikey in chembl's database.

  return bound(*args, **kwds)


This took 93.60196495056152 to run


In [243]:
import numpy as np
import math

df = pd.read_csv('activities_values_to_filter_in_R.csv', index_col = False, low_memory=False)[50000:60000]

list_df = np.array_split(df, math.ceil(len(df)/1000))
# see for help: https://stackoverflow.com/questions/44729727/pandas-slice-large-dataframe-into-chunks

inchi_keys = []

start_time = time.time()
for i in list_df:
    for chembl_id in i['molecule_chembl_id'].tolist():
        try:
            inchi_keys += [new_client.molecule.filter(molecule_chembl_id = chembl_id)[0]["molecule_structures"]["standard_inchi_key"]]
        except TypeError:
            inchi_keys += ["NA"]
print("This took", time.time() - start_time, "to run")

df['standard_inchi_key'] = inchi_keys
df6 = df
df6.to_csv("activities_with_inchikeys_to_filter_in_R_p5.csv", index = False)

# running the chunk on the full df returns a TypeError:
# due to some chemicals, like df.loc[11669], CHEMBL1200457, which do not have an inchikey in chembl's database.

  return bound(*args, **kwds)


This took 87.48066544532776 to run


In [244]:
import numpy as np
import math

df = pd.read_csv('activities_values_to_filter_in_R.csv', index_col = False, low_memory=False)[60000:70000]

list_df = np.array_split(df, math.ceil(len(df)/1000))
# see for help: https://stackoverflow.com/questions/44729727/pandas-slice-large-dataframe-into-chunks

inchi_keys = []

start_time = time.time()
for i in list_df:
    for chembl_id in i['molecule_chembl_id'].tolist():
        try:
            inchi_keys += [new_client.molecule.filter(molecule_chembl_id = chembl_id)[0]["molecule_structures"]["standard_inchi_key"]]
        except TypeError:
            inchi_keys += ["NA"]
print("This took", time.time() - start_time, "to run")

df['standard_inchi_key'] = inchi_keys
df7 = df
df7.to_csv("activities_with_inchikeys_to_filter_in_R_p6.csv", index = False)

# running the chunk on the full df returns a TypeError:
# due to some chemicals, like df.loc[11669], CHEMBL1200457, which do not have an inchikey in chembl's database.

  return bound(*args, **kwds)


This took 79.25466585159302 to run


In [245]:
import numpy as np
import math

df = pd.read_csv('activities_values_to_filter_in_R.csv', index_col = False, low_memory=False)[70000:]

list_df = np.array_split(df, math.ceil(len(df)/1000))
# see for help: https://stackoverflow.com/questions/44729727/pandas-slice-large-dataframe-into-chunks

inchi_keys = []

start_time = time.time()
for i in list_df:
    for chembl_id in i['molecule_chembl_id'].tolist():
        try:
            inchi_keys += [new_client.molecule.filter(molecule_chembl_id = chembl_id)[0]["molecule_structures"]["standard_inchi_key"]]
        except TypeError:
            inchi_keys += ["NA"]
print("This took", time.time() - start_time, "to run")

df['standard_inchi_key'] = inchi_keys
df8 = df
df8.to_csv("activities_with_inchikeys_to_filter_in_R_p7.csv", index = False)

# running the chunk on the full df returns a TypeError:
# due to some chemicals, like df.loc[11669], CHEMBL1200457, which do not have an inchikey in chembl's database.

  return bound(*args, **kwds)


This took 66.02663087844849 to run


In [246]:
frames = [df1, df2, df3, df4, df5, df6, df7, df8]
df = pd.concat(frames)
df.to_csv("activities_with_inchikeys_to_filter_in_R_all.csv", index = False)

In [247]:
len(df)

78104

In [253]:
df.loc['standard_inchi_key']

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value,standard_inchi_key
0,,,33031,[],CHEMBL695026,In vitro oxidative metabolic stability measure...,F,,,BAO_0002759,...,Homo sapiens,9606.0,,,Clint,ml min-1 kg-1,,,47.70,KFTPAHYDNCYGGH-SMBZHLNDSA-N
1,,,34270,[],CHEMBL695026,In vitro oxidative metabolic stability measure...,F,,,BAO_0002759,...,Homo sapiens,9606.0,,,Clint,ml min-1 kg-1,,,4.90,CGQLBSSYXLRPDT-JZWVFAODSA-N
2,,,34274,[],CHEMBL695026,In vitro oxidative metabolic stability measure...,F,,,BAO_0002759,...,Homo sapiens,9606.0,,,Clint,ml min-1 kg-1,,,16.50,QWILIPYTZQNZQV-ZONZVIQZSA-N
3,,,34484,[],CHEMBL708969,Rate of Oxidation in human liver microsomes is...,A,,,BAO_0000179,...,Cytochrome P450 3A4,9606.0,,,Rate of oxidation,,,,0.41,LROMWZWIANWJPU-VKAVYKQESA-N
4,,,35507,[],CHEMBL695026,In vitro oxidative metabolic stability measure...,F,,,BAO_0002759,...,Homo sapiens,9606.0,,,Clint,ml min-1 kg-1,,,8.00,ZSWNXMHSWWRAMR-JZWVFAODSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78099,,,25112883,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5265546,Inhibition of CYP3A4 in human liver microsomes...,A,,,BAO_0000201,...,Cytochrome P450 3A4,9606.0,,,INH,%,UO_0000187,,0.00,JRLOPMVTMMPZMV-UHFFFAOYSA-N
78100,,,25112884,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5265547,Inhibition of CYP2D6 in human liver microsomes...,A,,,BAO_0000201,...,Cytochrome P450 2D6,9606.0,,,INH,%,UO_0000187,,25.80,JRLOPMVTMMPZMV-UHFFFAOYSA-N
78101,"{'action_type': 'INHIBITOR', 'description': 'N...",,25112885,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5265548,Inhibition of CYP2C19 in human liver microsome...,A,,,BAO_0000201,...,Cytochrome P450 2C19,9606.0,,,INH,%,UO_0000187,,48.60,JRLOPMVTMMPZMV-UHFFFAOYSA-N
78102,,,25112887,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5265550,Inhibition of CYP1A2 in human liver microsomes...,A,,,BAO_0000201,...,Cytochrome P450 1A2,9606.0,,,INH,%,UO_0000187,,16.60,JRLOPMVTMMPZMV-UHFFFAOYSA-N
