In [47]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import xmlschema
import networkx as nx
from xml.etree import cElementTree as ElementTree
import matplotlib.pyplot as plt
import re
import csv


In [48]:
"""
DISEASE-gene from Orphanet: get only link for VWD for now? (VWD<-VWF)
"""

filepath = "../Data/Orphanet_disease-gene.xml"

# REFERENCE 1
class XmlListConfig(list):
    def __init__(self, aList):
        for element in aList:
            if element:
                # treat like dict
                if len(element) == 1 or element[0].tag != element[1].tag:
                    self.append(XmlDictConfig(element))
                # treat like list
                elif element[0].tag == element[1].tag:
                    self.append(XmlListConfig(element))
            elif element.text:
                text = element.text.strip()
                if text:
                    self.append(text)

class XmlDictConfig(dict):
    '''
    Example usage:

    >>> tree = ElementTree.parse('your_file.xml')
    >>> root = tree.getroot()
    >>> xmldict = XmlDictConfig(root)
    '''
    def __init__(self, parent_element):
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            if element:
                # treat like dict - we assume that if the first two tags
                # in a series are different, then they are all different.
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                # treat like list - we assume that if the first two tags
                # in a series are the same, then the rest are the same.
                else:
                    # here, we put the list in dictionary; the key is the
                    # tag name the list elements all share in common, and
                    # the value is the list itself 
                    aDict = {element[0].tag: XmlListConfig(element)}
                # if the tag has attributes, add those to the dict
                if element.items():
                    aDict.update(dict(element.items()))
                self.update({element.tag: aDict})
            # this assumes that if you've got an attribute in a tag,
            # you won't be having any text. This may or may not be a 
            # good idea -- time will tell. It works for the way we are
            # currently doing XML configuration files...
#             elif element.items():
#                 self.update({element.tag: dict(element.items())})
            # finally, if there are no child tags and no attributes, extract
            # the text
            else:
                self.update({element.tag: element.text})
                

etree = ET.parse(filepath) #create an ElementTree object 
root = etree.getroot()
xmldict = XmlDictConfig(root)

disease_list = xmldict['DisorderList']['Disorder']

## Make a dataframe with only relevant fields: 
    # OrphaNumber (Orphanet ID)
    # Name (disease name)
    # DisorderGeneAssociationList['Count'] (number of genes associated with the disease)
    ## May need loop for:
    # DisorderGeneAssociationList['DisorderGeneAssociation']['Gene']['Name'] (name of associated gene)
    # DisorderGeneAssociationList['DisorderGeneAssociation']['Gene']['ExternalReferenceList']['ExternalReference'] (Gene IDs and sources)
    # DisorderGeneAssociationList['DisorderGeneAssociation']['Gene']['DisorderGeneAssociationType']['Name']?
    # DisorderGeneAssociationList['DisorderGeneAssociation']['Gene']['DisorderGeneAssociationStatus']['Name'] (whether association is 'Assessed')



  # This is added back by InteractiveShellApp.init_path()


In [49]:
disease_list[0]

{'id': '17601',
 'OrphaNumber': '166024',
 'Name': 'Multiple epiphyseal dysplasia, Al-Gazali type',
 'DisorderGeneAssociationList': {'count': '1',
  'DisorderGeneAssociation': {'SourceOfValidation': '22587682[PMID]',
   'Gene': {'id': '20160',
    'OrphaNumber': '268061',
    'Name': 'kinesin family member 7',
    'Symbol': 'KIF7',
    'SynonymList': {'count': '1', 'Synonym': 'JBTS12'},
    'ExternalReferenceList': {'ExternalReference': [{'id': '57240',
       'Source': 'Ensembl',
       'Reference': 'ENSG00000166813'},
      {'id': '51758', 'Source': 'Genatlas', 'Reference': 'KIF7'},
      {'id': '51756', 'Source': 'HGNC', 'Reference': '30497'},
      {'id': '51757', 'Source': 'OMIM', 'Reference': '611254'},
      {'id': '97306', 'Source': 'Reactome', 'Reference': 'Q2M1P5'},
      {'id': '51759', 'Source': 'SwissProt', 'Reference': 'Q2M1P5'}],
     'count': '6'},
    'LocusList': {'count': '1', 'Locus': '\n              '}},
   'DisorderGeneAssociationType': {'id': '17949',
    'Name'

In [50]:
OrphaNumbers = []
Disease_names = []
GeneAssociationCounts = []
Associated_genes = []
Gene_IDs = []
Association_types = []
Association_statuses = []
counter = -1

for disease in disease_list:
    OrphaNumber = int(disease['OrphaNumber'])
    dnames = disease['Name']
    Count = int(disease['DisorderGeneAssociationList']['count'])
    
    associations = disease['DisorderGeneAssociationList']['DisorderGeneAssociation'] # dict if 1 association, list if >1
    genes = []
    gene_symbols = []
    geneIDs = []
    types = []
    statuses = []
    if Count > 1:
        for i, association in enumerate(associations):
            genes.append(associations[i]['Gene']['Name'])
            gene_symbols.append(associations[i]['Gene']['Symbol'])
            geneIDs.append(associations[i]['Gene']['ExternalReferenceList'])
            types.append(associations[i]['DisorderGeneAssociationType']['Name'])
            statuses.append(associations[i]['DisorderGeneAssociationStatus']['Name'])
    else:
        genes.append(associations['Gene']['Name'])
        gene_symbols.append(associations['Gene']['Symbol'])
        geneIDs.append(associations['Gene']['ExternalReferenceList'])
        types.append(associations['DisorderGeneAssociationType']['Name'])
        statuses.append(associations['DisorderGeneAssociationStatus']['Name'])        
    
    # Update column lists
    OrphaNumbers.append(OrphaNumber)
    Disease_names.append(dnames)
    GeneAssociationCounts.append(Count)
    Associated_genes.append(genes)
    Gene_IDs.append(geneIDs)
    Association_types.append(types)
    Association_statuses.append(statuses)
    
    counter += 1

print("Number of terms:" + str(counter))
# colnames = ['OrphaNumber', 'Disease_name', 'GeneAssociationCount', 
#             'Associated_genes', 'Gene_IDs', 'Association_types', 'Association_statuses'] 
coldict = {'OrphaNumber': OrphaNumbers,
          'Disease_name': Disease_names,
          'GeneAssociationCount': GeneAssociationCounts,
          'Associated_genes': Associated_genes,
          'Gene_IDs': Gene_IDs,
          'Association_types': Association_types,
          'Association_statuses': Association_statuses}

disease_df = pd.DataFrame(coldict)
disease_df.to_csv("../Data/Orphanet_disease-gene_df.csv")
disease_df.head(10)


Number of terms:3802


Unnamed: 0,OrphaNumber,Disease_name,GeneAssociationCount,Associated_genes,Gene_IDs,Association_types,Association_statuses
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",1,[kinesin family member 7],"[{'ExternalReference': [{'id': '57240', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
1,93,Aspartylglucosaminuria,1,[aspartylglucosaminidase],"[{'ExternalReference': [{'id': '126323', 'Sour...",[Disease-causing germline mutation(s) in],[Assessed]
2,166035,Brachydactyly-short stature-retinitis pigmento...,1,[CWC27 spliceosome associated protein homolog],[\n ],[Disease-causing germline mutation(s) in],[Assessed]
3,585,Multiple sulfatase deficiency,1,[sulfatase modifying factor 1],"[{'ExternalReference': [{'id': '56683', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
4,118,Beta-mannosidosis,1,[mannosidase beta],"[{'ExternalReference': [{'id': '100307', 'Sour...",[Disease-causing germline mutation(s) in],[Assessed]
5,166068,Pontocerebellar hypoplasia type 5,1,[tRNA splicing endonuclease subunit 54],"[{'ExternalReference': [{'id': '58222', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
6,166063,Pontocerebellar hypoplasia type 4,1,[tRNA splicing endonuclease subunit 54],"[{'ExternalReference': [{'id': '58222', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
7,166078,Von Willebrand disease type 1,1,[von Willebrand factor],"[{'ExternalReference': [{'id': '60135', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
8,166073,Pontocerebellar hypoplasia type 6,1,"[arginyl-tRNA synthetase 2, mitochondrial]","[{'ExternalReference': [{'id': '60133', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
9,166084,Von Willebrand disease type 2A,1,[von Willebrand factor],"[{'ExternalReference': [{'id': '60135', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]


In [51]:
# Disease associations to use: 'Disease-causing' substring
# Other potential associations: 'Part of fusion gene', 'Major susceptibility factor', 'Role in phenotype' substrings

# Keep only entries with causal disease associations:
for i, association in enumerate(disease_df['Association_types']):
    if 'disease-causing' not in str(association).lower():
        disease_df = disease_df.drop(i)
        
print(len(disease_df))
disease_df.head(10)


3415


Unnamed: 0,OrphaNumber,Disease_name,GeneAssociationCount,Associated_genes,Gene_IDs,Association_types,Association_statuses
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",1,[kinesin family member 7],"[{'ExternalReference': [{'id': '57240', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
1,93,Aspartylglucosaminuria,1,[aspartylglucosaminidase],"[{'ExternalReference': [{'id': '126323', 'Sour...",[Disease-causing germline mutation(s) in],[Assessed]
2,166035,Brachydactyly-short stature-retinitis pigmento...,1,[CWC27 spliceosome associated protein homolog],[\n ],[Disease-causing germline mutation(s) in],[Assessed]
3,585,Multiple sulfatase deficiency,1,[sulfatase modifying factor 1],"[{'ExternalReference': [{'id': '56683', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
4,118,Beta-mannosidosis,1,[mannosidase beta],"[{'ExternalReference': [{'id': '100307', 'Sour...",[Disease-causing germline mutation(s) in],[Assessed]
5,166068,Pontocerebellar hypoplasia type 5,1,[tRNA splicing endonuclease subunit 54],"[{'ExternalReference': [{'id': '58222', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
6,166063,Pontocerebellar hypoplasia type 4,1,[tRNA splicing endonuclease subunit 54],"[{'ExternalReference': [{'id': '58222', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
7,166078,Von Willebrand disease type 1,1,[von Willebrand factor],"[{'ExternalReference': [{'id': '60135', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
8,166073,Pontocerebellar hypoplasia type 6,1,"[arginyl-tRNA synthetase 2, mitochondrial]","[{'ExternalReference': [{'id': '60133', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]
9,166084,Von Willebrand disease type 2A,1,[von Willebrand factor],"[{'ExternalReference': [{'id': '60135', 'Sourc...",[Disease-causing germline mutation(s) in],[Assessed]


In [21]:
for ref in disease_df.iloc[0]['Gene_IDs'][0]['ExternalReference']:
    if ref['Source'] == 'HGNC':
        snode = ref['Reference']
# disease_df.iloc[0]['Gene_IDs'][0]['ExternalReference']

[{'id': '57240', 'Source': 'Ensembl', 'Reference': 'ENSG00000166813'},
 {'id': '51758', 'Source': 'Genatlas', 'Reference': 'KIF7'},
 {'id': '51756', 'Source': 'HGNC', 'Reference': '30497'},
 {'id': '51757', 'Source': 'OMIM', 'Reference': '611254'},
 {'id': '97306', 'Source': 'Reactome', 'Reference': 'Q2M1P5'},
 {'id': '51759', 'Source': 'SwissProt', 'Reference': 'Q2M1P5'}]

In [25]:
dict == type(disease_df.iloc[0]['Gene_IDs'][0])

True

In [52]:
# Build graph! Dataframe columns: source node, source node attributes, target node, target node attributes, edge attributes
NE_cols = ['source_node', 'snode_type', 'gene_name', 
           'target_node', 'tnode_type', 'disease_name', 
           'assoc_type', 'assoc_status']
NE_df = pd.DataFrame(columns = NE_cols)

for i, genelist in enumerate(disease_df['Associated_genes']):
    for j, gene in enumerate(genelist):
        if type(disease_df.iloc[i]['Gene_IDs'][0]) != dict:
            pass
        else:
            for ref in disease_df.iloc[i]['Gene_IDs'][0]['ExternalReference']:
                if ref['Source'] == 'HGNC': # Using HGNC ID since pretty much all databases reference it for genes
                    snode = ref['Reference']
                    snode_type = 'gene'
                    gene_name = gene

                    tnode = disease_df.iloc[i]['OrphaNumber']
                    tnode_type = 'disease'
                    disease_name = disease_df.iloc[i]['Disease_name']

                    assoc_type = disease_df.iloc[i]['Association_types'][j]
                    status = disease_df.iloc[i]['Association_statuses'][j]
                    newrow = [snode, snode_type, gene_name, tnode, tnode_type, disease_name, assoc_type, status]
                    NE_df.loc[len(NE_df)] = newrow
                else:
                    pass

NE_df.head(10)
                                                       

Unnamed: 0,source_node,snode_type,gene_name,target_node,tnode_type,disease_name,assoc_type,assoc_status
0,30497,gene,kinesin family member 7,166024,disease,"Multiple epiphyseal dysplasia, Al-Gazali type",Disease-causing germline mutation(s) in,Assessed
1,318,gene,aspartylglucosaminidase,93,disease,Aspartylglucosaminuria,Disease-causing germline mutation(s) in,Assessed
2,20376,gene,sulfatase modifying factor 1,585,disease,Multiple sulfatase deficiency,Disease-causing germline mutation(s) in,Assessed
3,6831,gene,mannosidase beta,118,disease,Beta-mannosidosis,Disease-causing germline mutation(s) in,Assessed
4,27561,gene,tRNA splicing endonuclease subunit 54,166068,disease,Pontocerebellar hypoplasia type 5,Disease-causing germline mutation(s) in,Assessed
5,27561,gene,tRNA splicing endonuclease subunit 54,166063,disease,Pontocerebellar hypoplasia type 4,Disease-causing germline mutation(s) in,Assessed
6,12726,gene,von Willebrand factor,166078,disease,Von Willebrand disease type 1,Disease-causing germline mutation(s) in,Assessed
7,21406,gene,"arginyl-tRNA synthetase 2, mitochondrial",166073,disease,Pontocerebellar hypoplasia type 6,Disease-causing germline mutation(s) in,Assessed
8,12726,gene,von Willebrand factor,166084,disease,Von Willebrand disease type 2A,Disease-causing germline mutation(s) in,Assessed
9,735,gene,N-acylsphingosine amidohydrolase 1,333,disease,Farber disease,Disease-causing germline mutation(s) in,Assessed


In [53]:
# Create diseases graph object
disease_gene_graph = nx.from_pandas_edgelist(NE_df, source = 'source_node', target = 'target_node', 
                                             edge_attr = ['assoc_type', 'assoc_status'])


In [54]:
# Node attributes: key is node, value is dict of attribute-value pairs
snode_attr = {node: {'node_type': NE_df.iloc[i]['snode_type'], 
                     'gene_name': NE_df.iloc[i]['gene_name']} for i, node in enumerate(NE_df['source_node'])}
tnode_attr = {node: {'node_type': NE_df.iloc[i]['tnode_type'], 
                     'disease_name': NE_df.iloc[i]['disease_name']} for i, node in enumerate(NE_df['target_node'])}

nx.set_node_attributes(disease_gene_graph, snode_attr)
nx.set_node_attributes(disease_gene_graph, tnode_attr)


In [55]:
# Create edgelist file? Test with BioNEV code
filepath = '../Graphs/OrphaNet_disease_gene.edgelist'
with open(filepath, 'wb') as file:
    nx.write_edgelist(disease_gene_graph, file)

In [56]:
# Visualize graph (not really helpful at this point)
# nx.draw_networkx(disease_gene_graph, with_labels = False, nodelist = NE_df.source_node.tolist(), node_color = 'r', alpha = 0.5)
# nx.draw_networkx(disease_gene_graph, with_labels = False, nodelist = NE_df.target_node.tolist(), node_color = 'b', alpha = 0.5)
# plt.show()


In [8]:
"""
DRUG-gene target from DrugBank*

*Combine with KEGG? Or just use DB or KEGG alone

"""
filepath = "../Data/DrugBank full database.xml"
xsdpath = "../Data/drugbank_schema.xsd"

# XML schema object
xs = xmlschema.XMLSchema(xsdpath)

# Decode to dictionary
### TAKES A WHILE (LIKE 30-ish minutes) TO RUN
db_dict = xs.to_dict(filepath)


In [57]:
db_dict.keys() # drug data is stored in 'drug' key in list of dictionary form


dict_keys(['@xmlns', '@xmlns:xsi', '@xsi:schemaLocation', '@version', '@exported-on', 'drug'])

In [160]:
# Examine db_dict
db_dict['drug'][24]['targets']['target'][0]

{'@position': 1,
 'id': 'BE0000565',
 'name': 'Interleukin-1 receptor type 1',
 'organism': 'Humans',
 'actions': {'action': ['antagonist']},
 'references': {'articles': {'article': [{'ref-id': 'A1826',
     'pubmed-id': '17498496',
     'citation': 'Tang YH, Zhang SP, Liang Y, Deng CQ: [Effects of Panax notoginseng saponins on mRNA expressions of interleukin-1 beta, its correlative factors and cysteinyl-aspartate specific protease after cerebral ischemia-reperfusion in rats]. Zhong Xi Yi Jie He Xue Bao. 2007 May;5(3):328-32.'},
    {'ref-id': 'A1828',
     'pubmed-id': '12817089',
     'citation': 'Dayer JM: The pivotal role of interleukin-1 in the clinical manifestations of rheumatoid arthritis. Rheumatology (Oxford). 2003 May;42 Suppl 2:ii3-10.'},
    {'ref-id': 'A1830',
     'pubmed-id': '12355453',
     'citation': 'Vamvakopoulos J, Green C, Metcalfe S: Genetic control of IL-1beta bioactivity through differential regulation of the IL-1 receptor antagonist. Eur J Immunol. 2002 Oct;

In [10]:
# Create list of dictionaries, with each dict containing info for a single drug
drugs_list = db_dict['drug']

drugs_w_targets = list()
counter_notpp = 0
for drug in drugs_list:
    if 'target' not in str(drug['targets']):
        pass
    elif 'polypeptide' not in str(drug['targets']['target'][0]):
        counter_notpp += 1
    elif 'HUGO Gene Nomenclature Committee (HGNC)' in str(drug['targets']['target'][0]['polypeptide'][0]['external-identifiers']['external-identifier']):
        drugs_w_targets.append(drug)
        
print("Number of drugs with polypeptide targets: " + str(len(drugs_w_targets)))
print("Number of drugs with non-polypeptide targets: " + str(counter_notpp))
print("Number of drugs in DrugBank total: " + str(len(drugs_list)))



Number of drugs with polypeptide targets: 5232
Number of drugs with non-polypeptide targets: 202
Number of drugs in DrugBank total: 13339


In [11]:
# Need source(drug) columns: drugs_list<insert indices>
    # drug_ID: id[i] if id[i]['@primary'] == True for id in ['drugbank-id']
    # drug_name: ['name'] 
    # drug_action: ['targets']['target'][i]['actions']['action']
    # drug_type: ['classification']['direct-parent']
# Need target(gene) columns: drugs_list['targets']['target']<insert indices>
    # target_ID: ['polypeptide'][i]['external-identifiers']['external-identifier'][i][]
    # target_name: ['name']

DB_cols = ['drug_ID', 'drug_name', 'drug_action', 'drug_type', 'snode_type',
          'target_ID', 'target_name', 'tnode_type']
DB_df = pd.DataFrame(columns = DB_cols)  
# idx = 0
for i, drug in enumerate(drugs_w_targets):
    for ID in drug['drugbank-id']:
        if ID['@primary'] == True:
            drug_ID = ID['$']
    drug_name = drug['name']
    if 'classification' not in drug.keys():
        drug_type = ''
    else:
        drug_type = drug['classification']['direct-parent']
    snode_type = 'drug'
    
    for target in drug['targets']['target']:
        if 'polypeptide' not in target.keys():
            pass
        elif 'HGNC' not in str(target['polypeptide'][0]['external-identifiers']['external-identifier'][0]['identifier']):
            pass
        else:
            HGNC = target['polypeptide'][0]['external-identifiers']['external-identifier'][0]['identifier'].split(':')[1]
            target_ID = HGNC
            target_name = target['name']
            tnode_type = 'target'
            if target['actions'] == None:
                drug_action = ''
            else:
                drug_action = target['actions']['action']
            newrow = [drug_ID, drug_name, drug_action, drug_type, snode_type, target_ID, target_name, tnode_type]
            DB_df.loc[len(DB_df)] = newrow
#     print(idx)
#     idx += 1
    

In [12]:
DB_df.tail(10)

Unnamed: 0,drug_ID,drug_name,drug_action,drug_type,snode_type,target_ID,target_name,tnode_type
13414,DB14738,Turoctocog alfa pegol,[activator],Peptides,drug,3551,Coagulation factor IX,target
13415,DB14738,Turoctocog alfa pegol,[activator],Peptides,drug,3528,Coagulation factor X,target
13416,DB14738,Turoctocog alfa pegol,[binder],Peptides,drug,3535,Prothrombin,target
13417,DB14751,Mecasermin rinfabate,[agonist],Peptides,drug,5465,Insulin-like growth factor 1 receptor,target
13418,DB14751,Mecasermin rinfabate,,Peptides,drug,6091,Insulin receptor,target
13419,DB14751,Mecasermin rinfabate,,Peptides,drug,5467,Cation-independent mannose-6-phosphate receptor,target
13420,DB14754,Solriamfetol,,,drug,11049,Sodium-dependent dopamine transporter,target
13421,DB14754,Solriamfetol,,,drug,11048,Sodium-dependent noradrenaline transporter,target
13422,DB14762,Risankizumab,[inhibitor],Peptides,drug,5970,Interleukin-23,target
13423,DB15250,Vercirnon,,,drug,1610,C-C chemokine receptor type 9,target


In [58]:
# Create DrugBank drug graph object
DBdrug_target_graph = nx.from_pandas_edgelist(DB_df, source = 'drug_ID', target = 'target_ID', 
                                             edge_attr = ['drug_action'])

In [59]:
# Node attributes: key is node, value is dict of attribute-value pairs
snode_attr = {node: {'node_type': DB_df.iloc[i]['snode_type'], 
                     'drug_name': DB_df.iloc[i]['drug_name'],
                     'drug_type': DB_df.iloc[i]['drug_type']} for i, node in enumerate(DB_df['drug_ID'])}
tnode_attr = {node: {'node_type': DB_df.iloc[i]['tnode_type'], 
                     'gene_name': DB_df.iloc[i]['target_name']} for i, node in enumerate(DB_df['target_ID'])}

nx.set_node_attributes(DBdrug_target_graph, snode_attr)
nx.set_node_attributes(DBdrug_target_graph, tnode_attr)

In [60]:
# Create edgelist file for DrugBank drug-targets
filepath = '../Graphs/DrugBank_drug_target.edgelist'
with open(filepath, 'wb') as file:
    nx.write_edgelist(DBdrug_target_graph, file)

In [61]:
# Combine DB drug-gene graph with OrphaNet disease-gene graph by gene nodes?
ORDO_DB_graph = nx.compose(disease_gene_graph, DBdrug_target_graph)

# Write to file
filepath = '../Graphs/ORDO_DB_combined.edgelist'
with open(filepath, 'wb') as file:
    nx.write_edgelist(ORDO_DB_graph, file)
    

In [27]:
"""
CHEMICAL-gene target from CTD
***Gene IDs are NCBI identifiers, which are not in OrphaNet data...have to map if want to connect the graphs

"""
### THIS ALSO TAKES A WHILE TO RUN (not as long as DrugBank though)
filepath = '../Data/CTD_chem_gene_ixns.csv'
with open(filepath, 'r') as file:
    reader = csv.reader(file, delimiter=',', quotechar='"')
    data_rows = [line for line in reader if line[0][0] != '#']
    
file.close()


In [45]:
# Create CTD data frame
CTD_cols = ['chem_name', 'chem_ID', 'CAS', 'gene', 'gene_ID', 'gene_forms', 
           'organism', 'organism_ID', 'ixn', 'ixn_actions', 'PMIDs']
CTD_df = pd.DataFrame(data_rows, columns = CTD_cols)

# Keep only interactions with human genes ('organism_ID' == '9606')
CTD_df = CTD_df[CTD_df['organism_ID'] == '9606']
print(len(CTD_df))
CTD_df.head(10)


835389


Unnamed: 0,chem_name,chem_ID,CAS,gene,gene_ID,gene_forms,organism,organism_ID,ixn,ixn_actions,PMIDs
2,10074-G5,C534883,,MYC,4609,protein,Homo sapiens,9606,10074-G5 analog results in decreased expressio...,decreases^expression,26036281
3,10074-G5,C534883,,MYC,4609,protein,Homo sapiens,9606,10074-G5 results in decreased activity of MYC ...,decreases^activity,25716159
4,10074-G5,C534883,,MYC,4609,protein,Homo sapiens,9606,10074-G5 results in decreased expression of MY...,decreases^expression,26036281
12,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,gene,Homo sapiens,9606,[EPHX1 gene SNP affects the metabolism of carb...,affects^chemical synthesis|affects^metabolic p...,15692831
13,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,protein,Homo sapiens,9606,[EPHX1 protein results in increased metabolism...,increases^chemical synthesis|increases^metabol...,15692831
14,"10,11-dihydro-10-hydroxycarbamazepine",C039775,,ABCB1,5243,protein,Homo sapiens,9606,ABCB1 protein results in increased transport o...,increases^transport,16190932
19,10-(2-pyrazolylethoxy)camptothecin,C534422,,CASP3,836,protein,Homo sapiens,9606,10-(2-pyrazolylethoxy)camptothecin results in ...,increases^activity,18708040
20,10-(2-pyrazolylethoxy)camptothecin,C534422,,CASP8,841,protein,Homo sapiens,9606,10-(2-pyrazolylethoxy)camptothecin results in ...,increases^activity,18708040
21,103D5R,C496879,,AKT1,207,protein,Homo sapiens,9606,103D5R results in decreased phosphorylation of...,decreases^phosphorylation,15695405
22,103D5R,C496879,,HIF1A,3091,protein,Homo sapiens,9606,103D5R inhibits the reaction [cobaltous chlori...,decreases^reaction|increases^expression,15695405


In [None]:
# Create graph from CTD dataframe
chem_gene_graph = nx.from_pandas_edgelist(CTD_df, source = 'chem_ID', target = 'gene_ID', 
                                             edge_attr = ['assoc_type', 'assoc_status'])


In [82]:
"""
DRUG-gene target from KEGG*
*Combine with DrugBank??
Honestly KEGG's format is a pain though
########################

filepath = "../Data/KEGG_drug"
with open(filepath, 'r') as file:
    # Separate entries (///), then create dictionary
    entries = file.read().split('\n///\n')
    entrylist = list()
    for i, entry in enumerate(entries):
        entry = entry.split('\n')
        entrydict = dict()
        for row in entry:
            pattern = '\s{3,}'
            row = re.split(pattern, row)
            if row[0] != '':
                key = row[0]
                entrydict[key] = row[1:]
            else:
                if key in entrydict.keys():
                    entrydict[key].append(row[1:])  
        entrylist.append(entrydict)

file.close()

# Only keep drug entries that contain target genes
entrylist = [entry for entry in entrylist if 'TARGET' in list(entry.keys())]
"""

In [80]:
# KEGG API for retrieving target gene references
# URL = "http://rest.kegg.jp/find/genes/{}"
# entryID = ''

{'ENTRY': ['D00045', 'Drug'],
 'NAME': ['Adenosine (JAN/USP);', ['Adenocard (TN);'], ['Adenoscan (TN)']],
 'FORMULA': ['C10H13N5O4'],
 'EXACT_MASS  267.0968': [],
 'MOL_WEIGHT  267.2413': [],
 'REMARK': ['Same as: C00212',
  ['Therapeutic category: 7990'],
  ['ATC code: C01EB10'],
  ['Chemical structure group: DG00243'],
  ['Product (DG00243): D00045<JP/US> D02300<JP>']],
 'EFFICACY': ['Antiarrhythmic, Cardiac depressant, Diagnostic aid (cardiac stress test), Adenosine receptor agonist'],
 'TARGET': ['ADORA [HSA:134 135 136 140] [KO:K04265 K04266 K04267 K04268]'],
 'INTERACTION  ': [],
 'DBLINKS': ['CAS: 58-61-7',
  ['PubChem: 7847113'],
  ['ChEBI: 16335'],
  ['ChEMBL: CHEMBL477'],
  ['DrugBank: DB00640'],
  ['PDB-CCD: ADN'],
  ['LigandBox: D00045'],
  ['NIKKAJI: J4.501B']],
 'ATOM': ['19',
  ['1', 'N4y N', '27.7735  -15.1762'],
  ['2', 'C8y C', '28.7550  -16.0205'],
  ['3', 'C1y C', '26.4808  -15.5916'],
  ['4', 'C8x C', '28.3947  -13.8350'],
  ['5', 'C8y C', '30.0785  -15.2503'],
  [

In [None]:
"""
Gene pathways from KEGG

"""



In [None]:
"""
Drug chemical/molecular structure data (ChEMBL?)

"""

In [None]:
"""
REFERENCES:

Orphanet data: http://www.orphadata.org/cgi-bin/index.php
DrugBank data (large, like 1.5 GB) + XSD file: https://www.drugbank.ca/releases/latest
CTD data: http://ctdbase.org/downloads/
KEGG DRUG data file: https://www.genome.jp/kegg/drug/
KEGG PATHWAYS: 

Reading from Orphanet disease-gene XML file
>https://stackoverflow.com/questions/2148119/how-to-convert-an-xml-string-to-a-dictionary
    >http://code.activestate.com/recipes/410469-xml-as-dictionary/
    
"""