In [1]:
import pandas as pd
import numpy as np
import scanpy as sc

In [2]:
# paths
import yaml

# Read the project configuration inside a context manager so the file handle
# is closed as soon as parsing finishes (a bare open() would leak it for the
# lifetime of the kernel).
with open('../config.yaml', "r") as config_file:
    config = yaml.safe_load(config_file)

# root directory under which each dataset has its own sub-directory
datadir = config['DOWNDIR']
# directory holding the drug-name -> ChEMBL mapping tables
chembldir = '../metadata/drug_mapping/'

In [3]:
# Load the drug-name -> ChEMBL mapping tables, one table per matching outcome.
def _read_mapping(filename):
    """Read one mapping CSV from `chembldir`, using its first column as the index."""
    return pd.read_csv(chembldir + filename, index_col=0)

# names for which no match could be found at all
no_match = _read_mapping('missing_normalized_drug_names_NONE.csv')

# names whose ChEMBL ID could be assigned directly
exact_match = _read_mapping('normalized_drug_names_EXACT.csv')

# names that needed light processing before an ID could be assigned:
# entries of the form `NAME (CODE_NAME_1, CODE_NAME_2, ...)`
substring_match = _read_mapping('normalized_drug_names_SUBSTRING.csv')

# partial matches caused by typos; the ID was added manually after looking up
# the corrected name on the ChEMBL web page
manual_partial_match = _read_mapping('manual_normalized_drug_names_PARTIAL.csv')

# names without any automatic match that could still be identified manually
manual_match = _read_mapping('manual_normalized_drug_names_NONE.csv')

In [98]:
# Stack all mapping tables into a single frame; reset_index() turns the
# per-table index into an ordinary column and renumbers the rows.
mapping_tables = [
    no_match,
    exact_match,
    substring_match,
    manual_partial_match,
    manual_match,
]
drugs = pd.concat(mapping_tables).reset_index()

# Drugs across datasets

In [111]:
# which IDs appear twice
# `seen` holds every ChEMBL ID encountered in earlier rows; `doubles` collects
# (once each, in order of first repetition) the IDs that show up in more than
# one row of `drugs`.
seen = set()
doubles = []
for chembl in drugs['ChEMBL']:
    # A cell may hold several ';'-separated IDs. Missing values stringify to
    # 'nan'; drop those (and empty fragments) up front so that a missing value
    # can never hide a genuine repeated ID found in the same row.
    entries = {
        entry for entry in str(chembl).split(';')
        if entry and entry != 'nan'
    }
    # Set intersection keeps each row's check cheap instead of re-scanning an
    # ever-growing list; sorted() keeps the result order deterministic.
    for cid in sorted(entries & seen):
        if cid not in doubles:
            doubles.append(cid)
    seen |= entries

In [129]:
# add names
# Look up the preferred name of every repeated ChEMBL ID via the ChEMBL web
# service client and collect the hits in a DataFrame.
from chembl_webresource_client.new_client import new_client

molecule = new_client.molecule
hits = (
    molecule
    .filter(molecule_chembl_id__in=list(doubles))
    .only(['molecule_chembl_id', 'pref_name'])
)
df = pd.DataFrame(hits)

In [133]:
# in which datasets do they overlap
# Build, in a single pass over `drugs`, a lookup from ChEMBL ID to the datasets
# that mention it. A dict is used as an insertion-ordered set so datasets are
# kept unique and in order of first appearance.
datasets_by_id = {}
for chembl, dataset in zip(drugs['ChEMBL'], drugs['Dataset']):
    for entry in str(chembl).split(';'):
        datasets_by_id.setdefault(entry, {})[dataset] = None

df['appears_in_1'] = None
df['appears_in_2'] = None
for cid in df.molecule_chembl_id:
    ds = list(datasets_by_id.get(cid, {}))
    # only molecules shared by at least two datasets are recorded
    if len(ds) > 1:
        for j, d in enumerate(ds):
            column = f'appears_in_{j+1}'
            # a molecule may occur in more than two datasets; add further
            # columns on demand instead of failing on a missing one
            if column not in df.columns:
                df[column] = None
            # single .loc assignment: chained indexing (df[col][mask] = ...)
            # may write to a temporary copy and leave `df` unchanged
            df.loc[df.molecule_chembl_id == cid, column] = d
# keep only molecules for which an overlap was actually recorded
df = df[~pd.isna(df.appears_in_1)]

In [135]:
# write the overlap table (index included) to the supplement folder
df.to_csv(path_or_buf='../supplement/Drug_overlap_table.csv')

# Add ChEMBL IDs to the datasets

In [12]:
# dataset names present in the drug table, ordered by how many rows each has
# (value_counts sorts by descending count)
dsets = drugs.Dataset.value_counts().index.values

In [102]:
def add_chembl(dset, drugs=drugs, datadir=datadir):
    """Annotate one dataset's cells with ChEMBL IDs and write it back to disk.

    The rows of `drugs` belonging to `dset` are turned into a
    ``Name -> ChEMBL`` lookup, which is mapped over ``adata.obs['perturbation']``
    to create the ``'chembl-ID'`` column. Perturbations without an entry in the
    lookup get a missing value.

    Side effects: overwrites ``<datadir>/<dset>/<dset>.h5ad`` (gzip-compressed)
    and ``<datadir>/<dset>/obs.csv`` in place, and prints ``<dset>_done``.

    Parameters
    ----------
    dset : str
        Dataset name as it appears in ``drugs['Dataset']``.
    drugs : pandas.DataFrame
        Mapping table with at least the columns ``Dataset``, ``Name`` and
        ``ChEMBL``. Defaults to the notebook-level table as it was when this
        function was defined.
    datadir : str
        Directory containing one sub-directory per dataset.

    Returns
    -------
    pandas.DataFrame
        The updated ``adata.obs`` table.
    """
    import os  # local import keeps this cell self-contained

    drg = drugs[drugs['Dataset'] == dset]
    drugdict = dict(zip(drg['Name'], drg['ChEMBL']))

    # The drug table labels this dataset differently from the name used for
    # its directory and file, so translate before building any path.
    if dset == "McFarlandTshemiak2020_all_expts_combined":
        dset = "McFarlandTsherniak2020"

    # os.path.join yields the same paths as plain concatenation when `datadir`
    # ends with a separator, and still works when it does not.
    dset_dir = os.path.join(datadir, dset)
    h5ad_path = os.path.join(dset_dir, dset + '.h5ad')

    adata = sc.read_h5ad(h5ad_path)
    adata.obs['chembl-ID'] = adata.obs['perturbation'].map(drugdict)
    # write csv obs
    adata.obs.to_csv(os.path.join(dset_dir, 'obs.csv'))
    # write adata WITH GZIP
    adata.write_h5ad(h5ad_path, compression='gzip')
    print(dset+'_done')
    return adata.obs

In [103]:
# Annotate every listed dataset with ChEMBL IDs via add_chembl.
# NOTE(review): the slice starts at index 1, so the first entry of `dsets` is
# never processed by this loop — confirm that this is intentional and not a
# leftover from resuming an interrupted run.
for dset in dsets[1:]:
    add_chembl(dset)

McFarlandTsherniak2020_done
SrivatsanTrapnell2020_sciplex4_done
ZhaoSims2021_done
SrivatsanTrapnell2020_sciplex2_done
ChangYe2021_done
AissaBenevolenskaya2021_done
GehringPachter2019_done


In [96]:
# NOTE(review): bare display of the loop variable `dset`; looks like leftover
# debugging and only works after a loop has bound `dset` — consider removing.
dset

'McFarlandTshemiak2020_all_expts_combined'