# Collect Driver Genes from Various Sources

Collect the various driver genes from the various sources listed in the README.md, placing them into
the predifined json-schema outlined in "driversValidationSchemaLight.json". Start
at the top of the list in the README, and just collect...

When collecting, try to stay with a particular column naming

"gene_symbol","driver_type","pmid","source_name"

In [1]:
import pandas as pd
import os
import datetime
import numpy as np

In [2]:
import lxml.etree as ET
import codecs
import subprocess
from rdkit import Chem
import gzip
import requests
from functools import reduce

In [3]:
def combine_cols(series, colname):
    ret_val = np.nan
    for col in colname:
        if not pd.isnull(series[col]):
            ret_val = series[col]
            break
    return ret_val

In [4]:
#set data path with your own
data_path = "/Users/bilges/Desktop/abi_tuebingen/clinical_reporting/DockerStation/clinicalReporting_DB_RESTAPI/data"

In [5]:
%cd data/

/Users/bilges/Desktop/abi_tuebingen/clinical_reporting/DockerStation/clinicalReporting_DB_RESTAPI/data


# Download gene data

In [6]:
# %cd /Users/Heisenberg/DockerStation/clinicalReporting_01/Data

In [7]:
HGNC_DOWNLOAD = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt"
UNIPROT_DOWNLOAD = "http://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:9606&format=tab&compress=yes&columns=id,entry%20name,reviewed,protein%20names,genes,organism,length,sequence,ec,feature(ACTIVE%20SITE),feature(BINDING%20SITE),comment(TISSUE%20SPECIFICITY),go(biological%20process),go(molecular%20function),go(cellular%20component),go-id,comment(BIOTECHNOLOGY),comment(DISEASE),comment(PHARMACEUTICAL),feature(MUTAGENESIS),comment(DISRUPTION%20PHENOTYPE),last-modified,created,database(CCDS),database(EMBL),database(RefSeq),database(HGNC)"

In [8]:
genes = pd.read_csv(HGNC_DOWNLOAD, sep="\t").rename(columns={'symbol': 'gene_symbol'})


  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
genes["hgnc_id"] = genes["hgnc_id"].str.replace("HGNC:", "")
#genes

In [10]:
gene_synonyms = [(s["hgnc_id"], x)
                 for i, s in genes[["hgnc_id", "alias_symbol"]].dropna(subset=["alias_symbol"]).iterrows()
                 for x in s["alias_symbol"].split("|") if x not in genes["gene_symbol"].tolist()]

In [11]:
#gene_synonyms

In [12]:
gene_synonyms += [(s["hgnc_id"], x)
                  for i, s in genes[["hgnc_id", "prev_symbol"]].dropna(subset=["prev_symbol"]).iterrows()
                  for x in s["prev_symbol"].split("|") if x not in genes["gene_symbol"].tolist()]

In [13]:
#gene_synonyms

In [14]:
gene_synonym_df = pd.concat([genes[["hgnc_id", "gene_symbol"]],
                             pd.DataFrame(gene_synonyms, columns=["hgnc_id", "gene_symbol"])])

In [15]:
#len(gene_synonym_df['hgnc_id'])

In [16]:
all_genes = genes[['hgnc_id', 'gene_symbol', 'name', 'status', 'locus_group', 'locus_type',
                   'gene_family', 'alias_symbol', 'prev_symbol', 'location', 'uniprot_ids',
                   'entrez_id', 'ensembl_gene_id', 'date_approved_reserved', 'date_modified',
                   'date_symbol_changed']].set_index('hgnc_id', drop=False).fillna('null').to_dict('index')

In [17]:
genes2entrez = genes[["hgnc_id", "entrez_id"]].dropna()

In [18]:
genes2uniprot = genes[["hgnc_id", "uniprot_ids"]].dropna()

In [19]:
#name some global variables
columns_to_db = ["hgnc_id", "gene_symbol","driver_type","pmid","source_name"]
source_container = {}

In [24]:
genes.locus_group.unique()

array(['protein-coding gene', 'non-coding RNA', 'pseudogene', 'withdrawn',
       'other', 'phenotype'], dtype=object)

# Vogelstein et al.

In [25]:
path_name = os.path.join(data_path,'Vogelstein_CancerGenomeLandscape_Science_2013.csv')
vogelstein_df = pd.read_csv(path_name,delimiter=';')
vogelstein_pmid = '23539594'
vogelstein_name = "Vogelstein"
vogelstein_df = vogelstein_df.assign(pmid=vogelstein_pmid,source_name=vogelstein_name)
# correct names for later mergin'
vogelstein_df = vogelstein_df.rename(index=str,columns={"Gene Symbol":"gene_symbol","Classification*":"driver_type"})
vogelstein_df = vogelstein_df.merge(gene_synonym_df)

vogelstein_df=vogelstein_df[columns_to_db]
vogelstein_df['driver_type'].unique()

source_container[vogelstein_name]=vogelstein_df

In [46]:

vogelstein_df.loc[vogelstein_df['driver_type'] == 'TSG'].nunique()

hgnc_id        72
gene_symbol    71
driver_type     1
pmid            1
source_name     1
dtype: int64

In [47]:
vogelstein_df.driver_type.unique()

array(['Oncogene', 'TSG'], dtype=object)

# Rubio-Perez et al.

In [48]:
path_name = os.path.join(data_path,'Drivers_type_role.tsv')
rubiop_df = pd.read_csv(path_name,delimiter='\t',comment='#')
rubiop_pmid = "25759023"
rubiop_name = "Rubio-Perez"
rubiop_df = rubiop_df.assign(pmid=rubiop_pmid,source_name=rubiop_name)
rubiop_df = rubiop_df.rename(index=str,columns={"geneHGNCsymbol":"gene_symbol","Driver_type":"driver_type"})
rubiop_df = rubiop_df.merge(gene_synonym_df)



In [49]:
rubiop_df['Role'].unique()

array(['Activating', 'Loss of function', 'No class', 'A'], dtype=object)

In [50]:
rubiop_df['Role'].unique()
rubiop_df['driver_type'].unique()
mapper = dict(zip(rubiop_df['Role'].unique(),["Oncogene","TSG","Unknown","Oncogene"]))
rubiop_df["driver_type"]=rubiop_df["Role"].map(mapper)
# rubiop_df.loc[rubiop_df['Role']=='A']

rubiop_df= rubiop_df[columns_to_db]

source_container[rubiop_name]=rubiop_df

In [52]:
rubiop_df.head(n=20)

Unnamed: 0,hgnc_id,gene_symbol,driver_type,pmid,source_name
0,76,ABL1,Oncogene,25759023,Rubio-Perez
1,77,ABL2,Oncogene,25759023,Rubio-Perez
2,84,ACACA,Oncogene,25759023,Rubio-Perez
3,87,ACAD8,Oncogene,25759023,Rubio-Perez
4,117,ACO1,TSG,25759023,Rubio-Perez
5,3570,ACSL3,Unknown,25759023,Rubio-Perez
6,16496,ACSL6,TSG,25759023,Rubio-Perez
7,132,ACTB,Oncogene,25759023,Rubio-Perez
8,144,ACTG1,Oncogene,25759023,Rubio-Perez
9,145,ACTG2,Oncogene,25759023,Rubio-Perez


In [55]:
rubiop_df.loc[rubiop_df['driver_type'] == 'Unknown'].nunique()

hgnc_id        83
gene_symbol    83
driver_type     1
pmid            1
source_name     1
dtype: int64

In [69]:
pd.concat(g for _, g in rubiop_df.groupby(['hgnc_id']) if len(g) > 1)

Unnamed: 0,hgnc_id,gene_symbol,driver_type,pmid,source_name
68,1100,BRCA1,TSG,25759023,Rubio-Perez
69,1100,BRCA1,TSG,25759023,Rubio-Perez
70,1101,BRCA2,TSG,25759023,Rubio-Perez
71,1101,BRCA2,TSG,25759023,Rubio-Perez
422,11100,SMARCA4,TSG,25759023,Rubio-Perez
423,11100,SMARCA4,TSG,25759023,Rubio-Perez
424,11103,SMARCB1,TSG,25759023,Rubio-Perez
425,11103,SMARCB1,TSG,25759023,Rubio-Perez
442,11389,STK11,TSG,25759023,Rubio-Perez
443,11389,STK11,TSG,25759023,Rubio-Perez


# Uniprot

In [29]:
oncogene_path_name = os.path.join(data_path,'uniprot-keyword%3A%22Proto-oncogene+%5BKW-0656%5D%22.tab')
tsg_path_name = os.path.join(data_path,'uniprot-keyword%3A%22Tumor+suppressor+%5BKW-0043%5D%22.tab')

uniprot_oncogene_df = pd.read_csv(oncogene_path_name,'\t')
uniprot_oncogene_df["driver_type"]="Oncogene"
uniprot_tsg_df = pd.read_csv(tsg_path_name,'\t')
uniprot_tsg_df["driver_type"]="TSG"

uniprot_df = pd.concat([uniprot_tsg_df,uniprot_oncogene_df],axis=0)
uniprot_df["gene_symbol"]=uniprot_df["Gene names"].apply(lambda x: x.split(" ")[0])

uniprot_df = uniprot_df.merge(genes2uniprot, left_on="Entry", right_on="uniprot_ids").drop("uniprot_ids", axis=1)

uniprot_name = "Uniprot"
uniprot_pmid = "14681372"
uniprot_df = uniprot_df.assign(pmid=uniprot_pmid,source_name=uniprot_name)

uniprot_df = uniprot_df[columns_to_db]

source_container[uniprot_name] = uniprot_df

In [75]:
uniprot_df.loc[uniprot_df['driver_type'] == 'TSG'].nunique()

hgnc_id        174
gene_symbol    174
driver_type      1
pmid             1
source_name      1
dtype: int64

In [76]:
pd.concat(g for _, g in uniprot_df.groupby(['hgnc_id']) if len(g) > 1)

Unnamed: 0,hgnc_id,gene_symbol,driver_type,pmid,source_name
117,23145,MAFA,TSG,14681372,Uniprot
118,23145,MAFA,Oncogene,14681372,Uniprot
52,3657,FES,TSG,14681372,Uniprot
53,3657,FES,Oncogene,14681372,Uniprot
114,6408,MAFB,TSG,14681372,Uniprot
115,6408,MAFB,Oncogene,14681372,Uniprot
120,6776,MAF,TSG,14681372,Uniprot
121,6776,MAF,Oncogene,14681372,Uniprot
148,9113,PML,TSG,14681372,Uniprot
149,9113,PML,Oncogene,14681372,Uniprot


# Cosmic Census

In [30]:
path_name = os.path.join(data_path,'Census_allTue Mar 14 14_33_17 2017.tsv')
cosmic_df = pd.read_csv(path_name,"\t")
cosmic_name = "Cosmic"
cosmic_pmid = "14993899"
cosmic_df = cosmic_df.assign(source_name=cosmic_name,pmid=cosmic_pmid)

cosmic_df["Role in Cancer"].unique()
mapper = dict(zip(cosmic_df["Role in Cancer"].unique(),["TSG","Oncogene","Unknown","Oncogene/TSG"]))

cosmic_df["driver_type"]=cosmic_df["Role in Cancer"].map(mapper)
cosmic_df = cosmic_df.rename(index=str, columns={"Gene Symbol":"gene_symbol"})
cosmic_df = cosmic_df.merge(genes2entrez, left_on="Entrez GeneId", right_on="entrez_id").drop("entrez_id", axis=1)
cosmic_df = cosmic_df[columns_to_db]

source_container[cosmic_name]=cosmic_df

In [83]:
cosmic_df.nunique()

hgnc_id        613
gene_symbol    614
driver_type      4
pmid             1
source_name      1
dtype: int64

In [85]:
pd.concat(g for _, g in cosmic_df.groupby(['hgnc_id']) if len(g) > 1)

Unnamed: 0,hgnc_id,gene_symbol,driver_type,pmid,source_name
96,1787,CDKN2A,TSG,14993899,Cosmic
97,1787,CDKN2A(p14),Unknown,14993899,Cosmic


In [82]:
cosmic_df.loc[cosmic_df['driver_type'] == 'Oncogene/TSG'].nunique()


hgnc_id        39
gene_symbol    39
driver_type     1
pmid            1
source_name     1
dtype: int64

# TSgene

In [31]:
tsgene_pathname = os.path.join(data_path,'Human_TSGs.txt')
tsgene_df = pd.read_csv(tsgene_pathname,delimiter="\t")
tsgene_df[:1]

tsgene_name = "TSgene"
tsgene_pmid = "23066107"

tsgene_df= tsgene_df.assign(source_name=tsgene_name,pmid=tsgene_pmid,driver_type="TSG")
tsgene_df = tsgene_df.rename(index=str,columns={"GeneSymbol":"gene_symbol"})
tsgene_df = tsgene_df.merge(genes2entrez, left_on="GeneID", right_on="entrez_id").drop("entrez_id", axis=1)
tsgene_df = tsgene_df[columns_to_db]

source_container[tsgene_name]=tsgene_df

In [87]:
tsgene_df.driver_type.unique()

array(['TSG'], dtype=object)

# Putting it all together

In [88]:
# check the keys to see if it all went down according to plan
source_container.keys()

dict_keys(['Vogelstein', 'Rubio-Perez', 'Uniprot', 'Cosmic', 'TSgene'])

In [89]:
# put the containers together 
df_final = reduce(lambda left,right: pd.concat([left,right],ignore_index=True), source_container.values())
# some duplicates showed up??
df_final.drop_duplicates(inplace=True)

# this seems a little too hacked for my tast. Don't know, feel free to change it and let me know what you did
df_scored = pd.merge(df_final,df_final.groupby("gene_symbol").count()["pmid"].to_frame(),left_on="gene_symbol",right_index=True)
df_scored = df_scored.rename(index=str,columns=dict(zip(["pmid_x","pmid_y"],["pmid","score"])))

In [94]:
df_final.nunique()

hgnc_id        1998
gene_symbol    2008
driver_type       4
pmid              5
source_name       5
dtype: int64

In [90]:
group_cols = ["hgnc_id"]
exclude_cols = group_cols + ["gene_symbol"]
groups = df_scored.groupby(["hgnc_id"])
for n, g in groups:
    all_genes[str(n)]['cancer'] = g[[c for c in g.columns if c not in exclude_cols]].to_dict('records')

# Do a little "speed vali-dating"

In [30]:
# only have 5 pmids i.e. can't be seven like when there were duplicates still in the dataframe
df_scored.head(n=2)

Unnamed: 0,hgnc_id,gene_symbol,driver_type,pmid,source_name,score
0,76,ABL1,Oncogene,23539594,Vogelstein,4
126,76,ABL1,Oncogene,25759023,Rubio-Perez,4


In [31]:
# df_final["gene_symbol"].unique()
df_final.head(n=2)

Unnamed: 0,hgnc_id,gene_symbol,driver_type,pmid,source_name
0,76,ABL1,Oncogene,23539594,Vogelstein
1,172,ACVR1B,TSG,23539594,Vogelstein


In [34]:
# typical query
df_scored.loc[df_scored["gene_symbol"]=="EGFR"]


Unnamed: 0,hgnc_id,gene_symbol,driver_type,pmid,source_name,score


In [126]:
all_genes['3236']
#all_genes['24301']

{'hgnc_id': '3236',
 'gene_symbol': 'EGFR',
 'name': 'epidermal growth factor receptor',
 'status': 'Approved',
 'locus_group': 'protein-coding gene',
 'locus_type': 'gene with protein product',
 'gene_family': 'Erb-b2 receptor tyrosine kinases',
 'alias_symbol': 'ERBB1',
 'prev_symbol': 'ERBB',
 'location': '7p11.2',
 'uniprot_ids': 'P00533',
 'entrez_id': 1956.0,
 'ensembl_gene_id': 'ENSG00000146648',
 'date_approved_reserved': '1986-01-01',
 'date_modified': '2018-06-30',
 'date_symbol_changed': 'null',
 'cancer': [{'driver_type': 'Oncogene',
   'pmid': '23539594',
   'source_name': 'Vogelstein',
   'score': 4},
  {'driver_type': 'Oncogene',
   'pmid': '25759023',
   'source_name': 'Rubio-Perez',
   'score': 4},
  {'driver_type': 'Oncogene',
   'pmid': '14681372',
   'source_name': 'Uniprot',
   'score': 4},
  {'driver_type': 'Unknown',
   'pmid': '14993899',
   'source_name': 'Cosmic',
   'score': 4}],
 'drugs': [{'drugbank_id': 'DB00002',
   'drug_name': 'cetuximab',
   'target_ac

# Target stuff

In [36]:
drug_data_containers = {}

## DrugBank

In [37]:
DRUGBANK_DOWNLOAD = "http://www.drugbank.ca/releases/latest/downloads/all-full-database"
DRUGBANK_STRUCTURES_DOWNLOAD = "http://www.drugbank.ca/releases/latest/downloads/all-structures"
DRUGBANK_XREFS_DOWNLAOD = "http://www.drugbank.ca/releases/latest/downloads/all-drug-links"
DRUGBANK_USER = "c.schaerfe@gmail.com"
DRUGBANK_PASSWORD = "themirrorcracked"

In [38]:
DRUGBANK_FILE = os.path.join(data_path, "drugbank.xml.zip")
DRUGBANK_SDF_FILE = os.path.join(data_path, "drugbank_molecules.sdf.zip")
DRUGBANK_XREF_FILE = os.path.join(data_path, "drugbank_xrefs.csv.zip")

In [39]:
DRUGBANK_FILE

'/Users/bilges/Desktop/abi_tuebingen/clinical_reporting/DockerStation/clinicalReporting_DB_RESTAPI/data/drugbank.xml.zip'

In [41]:
i = 1
exit_code_xml, exit_code_sdf, exit_code_xref = 1, 1, 1
while exit_code_xml + exit_code_sdf + exit_code_xref > 0 and i < 11:
    print("trying download now for round number {}".format(i))
    
    if exit_code_xml > 0:
        exit_code_xml = subprocess.call(['curl', '-L', '-H', 
                                         '"Accept: application/xml"', 
                                         '-o', DRUGBANK_FILE,
                                         '-u', DRUGBANK_USER + ':' + DRUGBANK_PASSWORD,
                                         DRUGBANK_DOWNLOAD])
 
        print("xml", exit_code_xml)
    if exit_code_sdf > 0:
        exit_code_sdf = subprocess.call(['curl', '-L', 
                                         '-o', DRUGBANK_SDF_FILE,
                                         '-u', DRUGBANK_USER + ':' + DRUGBANK_PASSWORD,
                                         DRUGBANK_STRUCTURES_DOWNLOAD])
        print("sdf", exit_code_sdf)
    if exit_code_xref > 0:
        exit_code_xref= subprocess.call(['curl', '-L',
                                         '-o', DRUGBANK_XREF_FILE,
                                         '-u', DRUGBANK_USER + ':' + DRUGBANK_PASSWORD,
                                         DRUGBANK_XREFS_DOWNLAOD])
 
        print("xref", exit_code_xref)
    i += 1
    

trying download now for round number 1


KeyboardInterrupt: 

In [43]:
if exit_code_xml == 0:
    exit_code_xml = subprocess.call("unzip -p " + DRUGBANK_FILE +" \*.xml | cat > " + DRUGBANK_FILE.replace(".zip", ""), shell=True)
if exit_code_sdf == 0:
    exit_code_sdf = subprocess.call("unzip -p " + DRUGBANK_SDF_FILE +" \*.sdf | cat > " + DRUGBANK_SDF_FILE.replace(".zip", ""), shell=True)
if exit_code_xref == 0:
    exit_code_xref = subprocess.call("unzip -p " + DRUGBANK_XREF_FILE +" \*.csv | cat > " + DRUGBANK_XREF_FILE.replace(".zip", ""), shell=True)

In [44]:
print(exit_code_sdf)

1


In [45]:
# i = 0
# exit_code_xml, exit_code_sdf, exit_code_xref = 1, 1, 1
# while exit_code_sdf  > 0 or i < 10:
#     if exit_code_sdf > 0:
#         exit_code_sdf = subprocess.call(['curl', '-L', 
#                                          '-o', DRUGBANK_SDF_FILE,
#                                          '-u', DRUGBANK_USER + ':' + DRUGBANK_PASSWORD,
#                                          DRUGBANK_STRUCTURES_DOWNLOAD])
#         if exit_code_sdf == 0:
#             exit_code_sdf = subprocess.call("unzip -p " + DRUGBANK_SDF_FILE +" \*.sdf | cat > " + DRUGBANK_SDF_FILE.replace(".zip", ""), shell=True)
#         print("sdf", exit_code_sdf)
#     i += 1
#     print("trying now for round number {}".format(i))

In [46]:
print( exit_code_xml, exit_code_sdf, exit_code_xref )

1 1 1


Reading the drug molecules

In [47]:
DRUGBANK_FILE = DRUGBANK_FILE.replace(".zip", "")
DRUGBANK_SDF_FILE = DRUGBANK_SDF_FILE.replace(".zip", "")
DRUGBANK_XREF_FILE = DRUGBANK_XREF_FILE.replace(".zip", "")

In [48]:
DRUGBANK_SDF_FILE

'/Users/bilges/Desktop/abi_tuebingen/clinical_reporting/DockerStation/clinicalReporting_DB_RESTAPI/data/drugbank_molecules.sdf'

In [49]:
drug_smiles = {}
structures = Chem.SDMolSupplier(DRUGBANK_SDF_FILE)
for s in structures:
    if s is not None:
        db_id = s.GetPropsAsDict().get("DRUGBANK_ID")
        smi = Chem.MolToSmiles(s)
        drug_smiles[db_id] = smi
drugbank2smiles = pd.DataFrame(list(drug_smiles.items()), columns=["drugbank_id", "SMILES"])

In [50]:
drugbank2smiles

Unnamed: 0,drugbank_id,SMILES
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...
1,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
2,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
3,DB00035,N=C(N)NCCC[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1...
4,DB00050,CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1cc...
5,DB00080,CCCCCCCCCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N...
6,DB00091,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...
7,DB00093,NCCCC[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CSSC[...
8,DB00104,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@@H...
9,DB00106,CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1cc...


In [51]:
def extract_gene_info(target, drugbank_id):
    target_id = target.find('{http://www.drugbank.ca}id').text
    target_name = target.find('{http://www.drugbank.ca}name').text
    target_organism = target.find('{http://www.drugbank.ca}organism').text

    target_action = b''
    if target.find('{http://www.drugbank.ca}actions') is not None:
        target_action = b'|'.join([act.text.encode('utf-8') for act in
                                  target.find('{http://www.drugbank.ca}actions').
                                  findall('{http://www.drugbank.ca}action')])
    target_known_action = target.find('{http://www.drugbank.ca}known-action').text

    gene_symbol = ""
    hgnc_id = ""
    if target.find('{http://www.drugbank.ca}polypeptide') is not None:
        gene_symbol = target.find('{http://www.drugbank.ca}polypeptide').\
            find('{http://www.drugbank.ca}gene-name').text
        if target.find('{http://www.drugbank.ca}polypeptide').\
                find('{http://www.drugbank.ca}external-identifiers') is not None:
            for external in target.find('{http://www.drugbank.ca}polypeptide').\
                    find('{http://www.drugbank.ca}external-identifiers').\
                    findall('{http://www.drugbank.ca}external-identifier'):
                if external.find('{http://www.drugbank.ca}resource').text ==\
                        "HUGO Gene Nomenclature Committee (HGNC)":
                    hgnc_id = external.find('{http://www.drugbank.ca}identifier').text

    refs = []
    try:
        if target.find('{http://www.drugbank.ca}references') is not None:
            if target.find('{http://www.drugbank.ca}references').\
                    find('{http://www.drugbank.ca}articles') is not None:
                for article in target.find('{http://www.drugbank.ca}references').\
                        find('{http://www.drugbank.ca}articles').\
                        findall('{http://www.drugbank.ca}article'):
                    pmid = article.find('{http://www.drugbank.ca}pubmed-id').text
                    if pmid is not None:
                        refs.append(pmid)
        refs = "|".join(refs)
    except:
        print(drugbank_id)
        print( refs, len(refs))

    row = [drugbank_id, target_id, target_name, target_organism,
           target_action, target_known_action, gene_symbol, hgnc_id, refs]
    row = [r or "" for r in row]
    return row

In [52]:
DRUGBANK_FILE

'/Users/bilges/Desktop/abi_tuebingen/clinical_reporting/DockerStation/clinicalReporting_DB_RESTAPI/data/drugbank.xml'

In [53]:
tree = ET.parse(DRUGBANK_FILE)
root = tree.getroot()
version = root.attrib.get('version')

drug_synonyms = []
drug_atc = {}
cancer_drugs = {}
lists = []
for record in root.iterfind('{http://www.drugbank.ca}drug'):
    if len(record.find('{http://www.drugbank.ca}drugbank-id')) == 1:
        drugbank_id = record.find('{http://www.drugbank.ca}drugbank-id').text
    else:
        drugbank_id = [x.text for x in record.findall('{http://www.drugbank.ca}drugbank-id')
                       if x.attrib.get('primary', 'false') == 'true'][0]
    
    create_date = record.attrib.get('created')
    update_date = record.attrib.get('updated')
    drug_type = record.attrib.get('type')

    drug_name = record.find('{http://www.drugbank.ca}name').text.lower()
    
    drug_synonyms += [(drugbank_id, syn.text.encode('utf-8').lower()) for syn in record.\
                       find('{http://www.drugbank.ca}synonyms').\
                       findall('{http://www.drugbank.ca}synonym')]
    
    groups = b'|'.join([group.text.encode('utf-8') for group in record.\
                           find('{http://www.drugbank.ca}groups').\
                           findall('{http://www.drugbank.ca}group')])
    
    atc_code_list = []
    is_cancer_drug = 0
    if record.find('{http://www.drugbank.ca}atc-codes') is not None:
        atc_code_list = []
        for atc_code in record.find('{http://www.drugbank.ca}atc-codes').\
                findall('{http://www.drugbank.ca}atc-code'):
            atc_code_list.append(atc_code.attrib.get('code'))
            if atc_code.attrib.get('code').startswith('L01'):
                is_cancer_drug = 1
                cancer_drugs[drugbank_id] = True
            else:
                cancer_drugs[drugbank_id] = False
        drug_atc[drugbank_id] = "|".join(atc_code_list) 
    
    drugbank_data = [drug_name, create_date, update_date, drug_type, is_cancer_drug, groups]
    # extract targets
    if record.find('{http://www.drugbank.ca}targets') is not None:
        for target in record.find('{http://www.drugbank.ca}targets').\
                findall('{http://www.drugbank.ca}target'):
            target_info = extract_gene_info(target, drugbank_id)
            row = target_info + ['target', version]
            lists.append(row + drugbank_data)
    # extract enzymes
    if record.find('{http://www.drugbank.ca}enzymes') is not None:
        for enzyme in record.find('{http://www.drugbank.ca}enzymes').\
                findall('{http://www.drugbank.ca}enzyme'):
            row = extract_gene_info(enzyme, drugbank_id) + ['enzyme', version]
            lists.append(row + drugbank_data)
    # extract carriers
    if record.find('{http://www.drugbank.ca}carriers') is not None:
        for carrier in record.find('{http://www.drugbank.ca}carriers').\
                findall('{http://www.drugbank.ca}carrier'):
            row = extract_gene_info(carrier, drugbank_id) + ['carrier', version]
            lists.append(row + drugbank_data)
    # extract transporters
    if record.find('{http://www.drugbank.ca}transporters') is not None:
        for transporter in record.find('{http://www.drugbank.ca}transporters').\
                findall('{http://www.drugbank.ca}transporter'):
            row = extract_gene_info(transporter, drugbank_id) + ['transporter', version]
            lists.append(row + drugbank_data)

cols = ['drugbank_id', 'target_id', 'target_name', 'target_organism',
        'target_action', 'target_known_action', 'gene_symbol',
        'hgnc_id', 'pmid', 'interaction_type', 'version', 'drug_name',
        'create_date', 'update_date', 'drug_type', 'is_cancer_drug', 'approval_status']
drug2gene_drugbank = pd.DataFrame(lists, columns=cols)

In [56]:
drug2gene_drugbank.head(n=2)

Unnamed: 0,drugbank_id,target_id,target_name,target_organism,target_action,target_known_action,gene_symbol,hgnc_id,pmid,interaction_type,version,drug_name,create_date,update_date,drug_type,is_cancer_drug,approval_status
0,DB00001,BE0000048,Prothrombin,Human,inhibitor,yes,F2,HGNC:3535,10505536|10912644|11055889|11467439|11807012|1...,target,5.1,lepirudin,2005-06-13,2018-03-02,biotech,0,approved
1,DB00002,BE0000767,Epidermal growth factor receptor,Human,antagonist,yes,EGFR,HGNC:3236,10480573|10601294|10628369|11408594|11431346|1...,target,5.1,cetuximab,2005-06-13,2018-04-02,biotech,1,approved


In [55]:
def decode_binary(value):
    if isinstance(value,bytes):
        return value.decode()
    else:
        return value

drug2gene_drugbank['target_action'] = drug2gene_drugbank['target_action'].apply(decode_binary)
drug2gene_drugbank['approval_status'] = drug2gene_drugbank['approval_status'].apply(decode_binary)

In [57]:
drug2gene_drugbank = drug2gene_drugbank.loc[drug2gene_drugbank.target_organism == 'Human']

In [58]:
drug2gene_drugbank["hgnc_id"] = drug2gene_drugbank["hgnc_id"].str.replace("HGNC:", "")

In [59]:
db_name = "DrugBank"
drug2gene_drugbank = drug2gene_drugbank.assign(source_name=db_name, download_date=datetime.date.today().strftime("%Y-%m-%d"))

In [60]:
drug_data_containers[db_name] = drug2gene_drugbank

In [61]:
drugbank_xrefs = pd.read_csv(DRUGBANK_XREF_FILE)
drugbank_xrefs = drugbank_xrefs[["DrugBank ID", "Name", "PubChem Compound ID", "PharmGKB ID", "TTD ID"]]
drugbank_xrefs["name"] = drugbank_xrefs["Name"].str.lower()

In [62]:
drugbank_synonyms = pd.DataFrame(drug_synonyms, columns=["DrugBank ID", "name"])
drugbank_synonyms = pd.concat([drugbank_xrefs[["DrugBank ID", "name"]], drugbank_synonyms])

In [63]:
drugbank_synonyms.head(n=2)

Unnamed: 0,DrugBank ID,name
0,DB00001,lepirudin
1,DB00002,cetuximab


In [64]:
drug2gene_drugbank.loc[drug2gene_drugbank.drug_name.isnull()]

Unnamed: 0,drugbank_id,target_id,target_name,target_organism,target_action,target_known_action,gene_symbol,hgnc_id,pmid,interaction_type,version,drug_name,create_date,update_date,drug_type,is_cancer_drug,approval_status,source_name,download_date


## TTD

In [69]:
#TTD_TARGETS_DOWNLOAD = "http://database.idrb.cqu.edu.cn/TTD/download/TTD_download.txt"
#TTD_DRUGS_XREFS_DOWNLOAD = "http://database.idrb.cqu.edu.cn/TTD/download/TTD_crossmatching.txt"
#TTD_XREFS_DOWNLOAD = "http://bidd.nus.edu.sg/BIDD-Databases/TTD/download/TTD_uniprot_all.txt"
#TTD_DRUGS_DOWNLOAD = "http://database.idrb.cqu.edu.cn/TTD/download/All.sdf"
#TTD_SDF_FILE = os.path.join(data_path, "TTD_molecules.sdf")

TTD_TARGETS_DOWNLOAD = "https://db.idrblab.org/ttd/sites/default/files/ttd_database/P1-01-TTD_download.txt"
TTD_DRUGS_XREFS_DOWNLOAD = "https://db.idrblab.org/ttd/sites/default/files/ttd_database/P1-02-TTD_crossmatching.txt"
TTD_XREFS_DOWNLOAD = "https://db.idrblab.org/ttd/sites/default/files/ttd_database/P2-01-TTD_uniprot_all.txt"
TTD_DRUGS_DOWNLOAD = "https://db.idrblab.org/ttd/sites/default/files/ttd_database/P3-01-All.sdf"
TTD_SDF_FILE = os.path.join(data_path, "TTD_molecules.sdf")

Read TTD molecules

In [70]:
exit_code_ttd_sdf = 1
i = 0
while exit_code_ttd_sdf > 0 and i < 10:
    exit_code_ttd_sdf = subprocess.call("wget -O " + TTD_SDF_FILE + " " +  TTD_DRUGS_DOWNLOAD, shell=True)
    print(exit_code_ttd_sdf)
    i += 1

0


In [71]:
ttd2cid = {}
ttd_smiles = {}
structures = Chem.SDMolSupplier(TTD_SDF_FILE)
for s in structures:
    if s is not None:
        db_id  = s.GetProp("_Name")
        db_cid = s.GetPropsAsDict().get("PUBCHEM_COMPOUND_CID")
        ttd_smiles[db_id] = Chem.MolToSmiles(s)
        if db_cid is not None:
            ttd2cid[db_id] = db_cid
ttd2smiles = pd.DataFrame(list(ttd_smiles.items()), columns=["ttd_drug_id", 'SMILES'])

In [72]:
ttd2smiles.head(n=6)

Unnamed: 0,ttd_drug_id,SMILES
0,D07XVR,[C-]#N.[C-]#N.[C-]#N.[C-]#N.[C-]#N.[Fe+4].[N-]...
1,D03KPZ,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
2,D0C4RB,CC(=O)O[C@H]1C(=O)[C@]2(C)[C@@H](O)C[C@H]3OC[C...
3,D03HJK,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...
4,D0M3FJ,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=...
5,D0IX6I,C[C@]12CC(=O)[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43...


In [73]:
ttd_drugs = pd.read_csv(TTD_DRUGS_XREFS_DOWNLOAD, skiprows=12, sep="\t",
                        names=["ttd_drug_id", "ign", "interaction_type", "drug_name"])
ttd_drugs = ttd_drugs.loc[ttd_drugs.interaction_type == 'DrugName'][["ttd_drug_id", "drug_name"]]
ttd2uniprot = pd.read_table(TTD_XREFS_DOWNLOAD, skiprows=12)\
                .rename(columns={"TTD Target ID": "target_id", "Uniprot ID": "uniprot_id"})

ttd_targets = pd.read_csv(TTD_TARGETS_DOWNLOAD, skiprows=12, sep="\t",
                          names=["target_id", "edge_type", "target"])\
    .rename(columns={'ttd_id': 'target_id'})\
    .drop_duplicates()

ttd2drug = ttd_targets.loc[ttd_targets.edge_type == "Drug(s)"]
ttd2drug.rename(columns={"target": "drug_name"}, inplace=True)
ttd2drug = ttd2drug.merge(ttd_drugs)
ttd2drug["drug_name"] = ttd2drug["drug_name"].str.lower()


ttd2drug_subtype = ttd_targets.loc[ttd_targets.edge_type.isin(["Inhibitor", "Agonist", "Antagonist", "Modulator", "Binder"])]
ttd2drug_subtype.rename(columns={"edge_type": "target_action", "target": "drug_name"}, inplace=True)
ttd2drug_subtype["drug_name"] = ttd2drug_subtype["drug_name"].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [74]:
ttd2drug.head(n=3)

Unnamed: 0,target_id,edge_type,drug_name,ttd_drug_id
0,T85435,Drug(s),"4-((1h-indazol-6-ylamino)methyl)benzene-1,2-diol",D0W9KO
1,T48069,Drug(s),"4-((1h-indazol-6-ylamino)methyl)benzene-1,2-diol",D0W9KO
2,T85435,Drug(s),"4-((naphthalen-2-ylamino)methyl)benzene-1,2-diol",D0L7MM


In [75]:
ttd2drug_subtype.head(n=4)

Unnamed: 0,target_id,target_action,drug_name
57,T85435,Inhibitor,"4-((1h-indazol-6-ylamino)methyl)benzene-1,2-diol"
58,T85435,Inhibitor,"4-((naphthalen-2-ylamino)methyl)benzene-1,2-diol"
59,T85435,Inhibitor,"adenosine-5'-[beta, gamma-methylene]triphosphate"
60,T85435,Inhibitor,compound 15


In [76]:
drug2gene_ttd = ttd2drug.merge(ttd2drug_subtype, left_on=["target_id", "drug_name"], right_on=["target_id", "drug_name"], how="outer")\
        .merge(ttd2smiles, left_on="ttd_drug_id", right_on="ttd_drug_id")\
        .merge(drugbank_xrefs[["DrugBank ID", "TTD ID"]], left_on="ttd_drug_id", right_on="TTD ID", how="left")\
        .merge(drugbank_xrefs[["DrugBank ID", "Name"]], left_on="drug_name", right_on="Name", how="left")\
        .merge(drugbank2smiles, left_on="SMILES", right_on="SMILES", how="left")\
        .merge(ttd2uniprot, left_on="target_id", right_on="target_id")\
        .drop_duplicates()

In [77]:
drug2gene_ttd["drugbank_id"] = drug2gene_ttd.apply(combine_cols, args=(["DrugBank ID_x", "DrugBank ID_y", 'drugbank_id'],), axis=1)
drug2gene_ttd = drug2gene_ttd[['drugbank_id', "drug_name", "target_id", "uniprot_id", "target_action", "ttd_drug_id"]]
drug2gene_ttd = drug2gene_ttd.dropna(subset=['drugbank_id'])

In [78]:
uniprot_split = pd.DataFrame([(i, x.split(" (")[0].strip()) for i, row in drug2gene_ttd.iterrows()
                              for x in row['uniprot_id'].split(';')], columns=['df_index', 'uniprot_id'])
uniprot_split.set_index('df_index', inplace=True)
drug2gene_ttd = drug2gene_ttd.join(uniprot_split, lsuffix="s")\
                             .drop("uniprot_ids", axis=1)\
                             .merge(genes2uniprot, left_on="uniprot_id", right_on="uniprot_ids")\
                             .drop("uniprot_ids", axis=1)

In [79]:
drug2gene_ttd = drug2gene_ttd.merge(genes[["hgnc_id", "gene_symbol"]], left_on="hgnc_id", right_on="hgnc_id",
                                    suffixes=["", "_dup"])

In [80]:
drug2gene_ttd.head(n=3)

Unnamed: 0,drugbank_id,drug_name,target_id,target_action,ttd_drug_id,uniprot_id,hgnc_id,gene_symbol
0,DB03909,"adenosine-5'-[beta, gamma-methylene]triphosphate",T85435,Inhibitor,D03YIO,P06213,6091,INSR
1,DB11721,mitoglitazone,T85435,Modulator,D00SWQ,P06213,6091,INSR
2,DB06156,tesofensine,T85435,Inhibitor,D0ZK7B,P06213,6091,INSR


In [81]:
db_name = "TTD"
db_pmid = "PMC4702870"
interaction_type = 'target'

drug2gene_ttd = drug2gene_ttd.assign(source_name=db_name, pmid=db_pmid,
                                     interaction_type=interaction_type,
                                     download_date=datetime.date.today().strftime("%Y-%m-%d"))

In [82]:
drug2gene_ttd.sort_values(by='drugbank_id')

Unnamed: 0,drugbank_id,drug_name,target_id,target_action,ttd_drug_id,uniprot_id,hgnc_id,gene_symbol,source_name,pmid,interaction_type,download_date
520,DB00006,bivalirudin,T94033,Inhibitor,D09HVL,P00734,3535,F2,TTD,PMC4702870,target,2018-07-01
529,DB00006,hirulog,T94033,Inhibitor,D03ZVY,P00734,3535,F2,TTD,PMC4702870,target,2018-07-01
521,DB00006,bivalirudine,T94033,Inhibitor,D09FGS,P00734,3535,F2,TTD,PMC4702870,target,2018-07-01
2573,DB00106,abarelix,T12475,Modulator,D01AHO,P30968,4421,GNRHR,TTD,PMC4702870,target,2018-07-01
530,DB00114,lepirudine,T94033,Inhibitor,D0U5YK,P00734,3535,F2,TTD,PMC4702870,target,2018-07-01
2520,DB00120,l-phenylalanine,T62390,Binder,D0R1CR,P07101,11782,TH,TTD,PMC4702870,target,2018-07-01
2749,DB00123,l-lysine,T72032,Agonist,D07TMZ,Q5T6X5,18510,GPRC6A,TTD,PMC4702870,target,2018-07-01
3064,DB00126,[14c]ascorbic acid,T37382,Modulator,D0U4HT,Q9UHI7,10974,SLC23A1,TTD,PMC4702870,target,2018-07-01
1059,DB00127,spermine,T64567,Inhibitor,D0LD9S,Q16790,1383,CA9,TTD,PMC4702870,target,2018-07-01
1075,DB00127,spermine,T53378,Inhibitor,D0LD9S,P22748,1375,CA4,TTD,PMC4702870,target,2018-07-01


In [83]:
drug_data_containers[db_name] = drug2gene_ttd

## IUPHAR

In [84]:
IUPHAR_INTERACTIONS_DOWNLOAD="http://www.guidetopharmacology.org/DATA/interactions.csv"
IUPHAR_HGNC_MAPPING_DOWNLOAD="http://www.guidetopharmacology.org/DATA/GtP_to_HGNC_mapping.csv"
IUPHAR_LIGANDS_DOWNLOAD="http://www.guidetopharmacology.org/DATA/ligands.csv"

In [85]:
iuphar_ligands = pd.read_csv(IUPHAR_LIGANDS_DOWNLOAD)
drug_ids = iuphar_ligands["Ligand id"].tolist()

xrefs_drugs = []
url_db_xlink_base = "http://www.guidetopharmacology.org/services/ligands/{}/databaseLinks"
missed = []
iuphar2drugbank = []
print(len(drug_ids), "ligands in IUPHAR.")

for i, x in enumerate(drug_ids):
    r = requests.get(url_db_xlink_base.format(x))
    if r.status_code == 200:
        if i % 500 == 0:
            print(i)
        drugbank_ids = [d["accession"] for d in r.json() if d['database'] == 'DrugBank Ligand']
        if len(drugbank_ids) > 0:
            for d in drugbank_ids:
                iuphar2drugbank.append([x, d])
        else:
            missed.append(x)

iuphar2drugbank = pd.DataFrame(iuphar2drugbank, columns=["iuphar_id", "drugbank_id"])
print(len(set(missed)), " ligands without drugbank id.")

9336 ligands in IUPHAR.
0
1000
1500
2000
2500
3000
3500
5000
5500
6000
6500
7000
7500
8500
9000
7032  ligands without drugbank id.


In [86]:
iuphar2drugbank.to_csv(data_path + "\iuphar2drugbank.csv", sep="\t")

In [87]:
iuphar2hgnc = pd.read_csv(IUPHAR_HGNC_MAPPING_DOWNLOAD)

In [88]:
iuphar_interactions = pd.read_csv(IUPHAR_INTERACTIONS_DOWNLOAD)\
                        .rename(columns={'type': 'target_action',
                                         'action': 'target_action_detailed',
                                         'pubmed_id': 'pmid',
                                         'target': 'target_name',
                                         'target_species': 'target_organism',
                                         'target_gene_symbol': 'gene_symbol',
                                         'target_uniprot': 'uniprot_id',
                                         'ligand': 'drug_name',
                                         'target_id': 'iuphar_id'})

  interactivity=interactivity, compiler=compiler, result=result)


In [89]:
drug2gene_iuphar = iuphar_interactions.merge(iuphar2hgnc[['iuphar_id', 'hgnc_id']],
                                             left_on="iuphar_id", right_on="iuphar_id")\
                                      .merge(iuphar2drugbank,
                                             left_on="ligand_id", right_on="iuphar_id")

In [90]:
drug2gene_iuphar = drug2gene_iuphar.loc[drug2gene_iuphar.target_organism == 'Human']

In [91]:
drug2gene_iuphar = drug2gene_iuphar[[x for x in drug2gene_iuphar.columns
                                     if not x.startswith("target_ligand") and not x.endswith("_x")
                                     and not x.endswith("_y") and not "original" in x
                                     and not x in ['ligand_gene_symbol', 'ligand_species', 'ligand_pubchem_sid']]]

In [95]:
# drug2gene_iuphar.head(n=3)
list(drug2gene_iuphar)

['target_name',
 'gene_symbol',
 'uniprot_id',
 'target_organism',
 'drug_name',
 'ligand_id',
 'target_action',
 'target_action_detailed',
 'action_comment',
 'endogenous',
 'primary_target',
 'concentration_range',
 'affinity_units',
 'affinity_high',
 'affinity_median',
 'affinity_low',
 'assay_description',
 'receptor_site',
 'ligand_context',
 'pmid',
 'hgnc_id',
 'drugbank_id',
 'source_name',
 'interaction_type',
 'download_date']

In [92]:
db_name = "IUPHAR"
interaction_type = 'target'
drug2gene_iuphar = drug2gene_iuphar.assign(source_name=db_name, interaction_type=interaction_type,
                                           download_date=datetime.date.today().strftime("%Y-%m-%d"))


In [93]:
drug_data_containers[db_name] = drug2gene_iuphar

## Santos dataset
this is the supplement from the following paper. A frequently maintained version should be available on Pharos, but downloading the relevant data would require downloading and extracting the database dump first.


In [96]:
SANTOS_SUPPLEMENT_DOWNLOAD="http://www.nature.com/nrd/journal/v16/n1/extref/nrd.2016.230-s2.xlsx"

In [104]:
rename_cols = {"ACCESSION": "uniprot_id",
               "ORGANISM": "target_species",
               "PROTEIN_NAME": "target_name",
               "PARENT_PREF_NAME": "drug_name",
#                                       "PARENT_PREF_NAME": "drug_name",
               "MECHANISM_OF_ACTION": "action_comment"}
drug2gene_santos = pd.read_excel(SANTOS_SUPPLEMENT_DOWNLOAD)\
                     .rename(columns=rename_cols)

In [105]:
drug2gene_santos = drug2gene_santos.loc[drug2gene_santos["target_species"] == "Homo sapiens"]
drug2gene_santos = drug2gene_santos.dropna(subset=["uniprot_id"])
drug2gene_santos["drug_name"] = drug2gene_santos["drug_name"].str.lower()

In [106]:
drug2gene_santos.head(n=1)

Unnamed: 0,drug_name,action_comment,TARGET_CHEMBL_ID,TARGET_PREF_NAME,uniprot_id,target_name,target_species,PROTEIN_CLASS_DESC
2,abarelix,Gonadotropin-releasing hormone receptor antago...,CHEMBL1855,Gonadotropin-releasing hormone receptor,P30968,Gonadotropin-releasing hormone receptor,Homo sapiens,membrane receptor 7tm1 peptide short peptid...


In [107]:
drug2gene_santos = drug2gene_santos.merge(drugbank_synonyms, left_on="drug_name", right_on="name")\
                                   .rename(columns={"DrugBank ID": "drugbank_id"})

In [108]:
list(rename_cols.values())

['uniprot_id', 'target_species', 'target_name', 'drug_name', 'action_comment']

In [109]:
drug2gene_santos = drug2gene_santos[list(rename_cols.values()) + ["drugbank_id"]]\
    .merge(genes2uniprot, left_on="uniprot_id", right_on="uniprot_ids")\
    .drop("uniprot_ids", axis=1)\
    .merge(genes[["hgnc_id", "gene_symbol"]], left_on="hgnc_id", right_on="hgnc_id",
                  suffixes=["", "_dup"])


In [110]:
drug2gene_santos.head(n=1)

Unnamed: 0,uniprot_id,target_species,target_name,drug_name,action_comment,drugbank_id,hgnc_id,gene_symbol
0,P30968,Homo sapiens,Gonadotropin-releasing hormone receptor,abarelix,Gonadotropin-releasing hormone receptor antago...,DB00106,4421,GNRHR


In [111]:
db_name = "Santos"
pmid = "27910877"
interaction_type = 'target'
drug2gene_santos = drug2gene_santos.assign(source_name=db_name, pmid=pmid,
                                           download_date=datetime.date.today().strftime("%Y-%m-%d"),
                                           interaction_type=interaction_type)

In [112]:
drug_data_containers[db_name] = drug2gene_santos

# Let's wrap it up

In [113]:
include_cols = [u'hgnc_id', u'gene_symbol', u'drugbank_id', u'drug_name',
                u'target_action', 'target_id', u'interaction_type',
                #u'target_action_detailed',
                #u'primary_target', 
                u'target_known_action',
                u'pmid', u'source_name', u'download_date' ]
drug_df_final = reduce(lambda left,right: pd.concat([left,right], ignore_index=True,
                                                    join='outer'), drug_data_containers.values())[include_cols]
drug_df_final = drug_df_final.replace('', np.nan)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [114]:
drug_df_final = drug_df_final.merge(pd.DataFrame(list(drug_atc.items()), columns=["drugbank_id", "ATC_code"]))\
                             .merge(drug_data_containers["DrugBank"][["drugbank_id", "approval_status"]], how="outer")\
                             .merge(pd.DataFrame(list(cancer_drugs.items()), columns=["drugbank_id", "is_cancer_drug"]))\
                             .drop_duplicates()

In [115]:
drug_df_final = drug_df_final[drug_df_final['hgnc_id'].notnull()]#.loc[drug_df_final['hgnc_id']=='null']

In [116]:
drug_df_final.fillna('null',inplace=True)

In [117]:
drug_df_final['hgnc_id'] = drug_df_final['hgnc_id'].astype(int).astype(str)

In [118]:
drug_df_final

Unnamed: 0,hgnc_id,gene_symbol,drugbank_id,drug_name,target_action,target_id,interaction_type,target_known_action,pmid,source_name,download_date,ATC_code,approval_status,is_cancer_drug
0,3535,F2,DB00001,lepirudin,inhibitor,BE0000048,target,yes,10505536|10912644|11055889|11467439|11807012|1...,DrugBank,2018-07-01,B01AE02,approved,False
1,3535,F2,DB00001,lepirudin,,,target,,27910877,Santos,2018-07-01,B01AE02,approved,False
2,3236,EGFR,DB00002,cetuximab,antagonist,BE0000767,target,yes,10480573|10601294|10628369|11408594|11431346|1...,DrugBank,2018-07-01,L01XC06,approved,True
14,3620,FCGR3B,DB00002,cetuximab,,BE0000901,target,unknown,16336752,DrugBank,2018-07-01,L01XC06,approved,True
26,1246,C1R,DB00002,cetuximab,,BE0002093,target,unknown,17139284|17016423,DrugBank,2018-07-01,L01XC06,approved,True
38,1241,C1QA,DB00002,cetuximab,,BE0002094,target,unknown,17139284|17016423,DrugBank,2018-07-01,L01XC06,approved,True
50,1242,C1QB,DB00002,cetuximab,,BE0002095,target,unknown,17139284|17016423,DrugBank,2018-07-01,L01XC06,approved,True
62,1245,C1QC,DB00002,cetuximab,,BE0002096,target,unknown,17139284|17016423,DrugBank,2018-07-01,L01XC06,approved,True
74,3619,FCGR3A,DB00002,cetuximab,,BE0002097,target,unknown,17139284|17016423|17704420,DrugBank,2018-07-01,L01XC06,approved,True
86,1247,C1S,DB00002,cetuximab,,BE0001529,target,unknown,17139284|17016423,DrugBank,2018-07-01,L01XC06,approved,True


In [None]:
# drug_df_final['drug_name'] = drug_df_final['drug_name'].to_string()

In [119]:
group_cols = ["hgnc_id"]
exclude_cols = group_cols + ["gene_symbol"]
groups = drug_df_final.groupby(group_cols)
for n, g in groups:
    all_genes[n]['drugs'] = g[[c for c in g.columns if c not in exclude_cols]].to_dict('records')
    all_genes[n]['drug_evidence_score'] = g.groupby(['drug_name'])\
        .agg({'source_name': lambda x: len(x)})\
        .reset_index()\
        .rename(columns={'source_name': 'drug_score'})\
        .to_dict('records')

In [128]:
drug_df_final['source_name'].unique()

array(['DrugBank', 'Santos', 'TTD', 'IUPHAR'], dtype=object)

In [125]:
all_genes['3236']
#all_genes['24301']


{'hgnc_id': '3236',
 'gene_symbol': 'EGFR',
 'name': 'epidermal growth factor receptor',
 'status': 'Approved',
 'locus_group': 'protein-coding gene',
 'locus_type': 'gene with protein product',
 'gene_family': 'Erb-b2 receptor tyrosine kinases',
 'alias_symbol': 'ERBB1',
 'prev_symbol': 'ERBB',
 'location': '7p11.2',
 'uniprot_ids': 'P00533',
 'entrez_id': 1956.0,
 'ensembl_gene_id': 'ENSG00000146648',
 'date_approved_reserved': '1986-01-01',
 'date_modified': '2018-06-30',
 'date_symbol_changed': 'null',
 'cancer': [{'driver_type': 'Oncogene',
   'pmid': '23539594',
   'source_name': 'Vogelstein',
   'score': 4},
  {'driver_type': 'Oncogene',
   'pmid': '25759023',
   'source_name': 'Rubio-Perez',
   'score': 4},
  {'driver_type': 'Oncogene',
   'pmid': '14681372',
   'source_name': 'Uniprot',
   'score': 4},
  {'driver_type': 'Unknown',
   'pmid': '14993899',
   'source_name': 'Cosmic',
   'score': 4}],
 'drugs': [{'drugbank_id': 'DB00002',
   'drug_name': 'cetuximab',
   'target_ac

# Dump json to file - for fun

In [None]:
import json

In [None]:
all_genes_json = json.dumps(all_genes, indent=4, sort_keys=True)

In [None]:
%pwd

In [None]:
with open("/Users/Heisenberg/DockerStation/clinicalReporting_DB_RESTAPI/data/driver_db_dump_v2.json", 'w') as f:
    f.write(all_genes_json)

# MongoDB stuff

Install MongoDB: https://docs.mongodb.com/getting-started/shell/tutorial/install-mongodb-on-os-x/
You could just save everyting to a .json and import it at the command line: https://docs.mongodb.com/getting-started/shell/import-data/

Or,Use pymongo to place the documents into the database one by one. here is a link to a helpful tutorial: http://api.mongodb.com/python/current/tutorial.html

You can use ```conda install pymongo```

Although the already mentioned tutorial seems quicker, for completeness: https://docs.mongodb.com/getting-started/python/introduction/

### MongoDB Compass
It's a seperate install. Intall after you have mongoDB installed. You'll need a running instance of your db up and running. You can do this by calling ```mongod``` on the command line. Then open up Compass and you'll see all the databases that you currently have running. After running the code below, you won't see the changes immediatly in a running Compass session, you'll have to press the sync button somewhere on the top left hand side of the interface.

In [None]:
# need pymongo
#import pymongo
#from pymongo import MongoClient

In [None]:
#client = MongoClient()
# you can call the database what you want; I chose "drivers"
#db = client['drivers']
# the collection will be called posts; change accordingly
#posts = db.cancer_genes

# alright pop it into the database

In [None]:
#posts.insert_many(all_genes.values())

In [None]:
# for i in range(len(df_scored)):
#     posts.insert_one(df_scored.iloc[i].to_dict())

# Check that it happened with example query

In [None]:
#for post in posts.find({"gene_symbol": "BRAF"}):
#    print(post)

In [None]:
#post["cancer"]

In [None]:
#found_targets = []
#for post in posts.find({"drugs":  {"$elemMatch": {"drugbank_id": "DB00002"}}}):
    #found_targets.append((post["gene_symbol"])
    #print(post)

In [None]:
#post["gene_symbol"]

In [None]:
#found_targets

In [None]:
#len(post["drugs"]), post["drugs"]