In [2]:
import os,requests,zlib
import pandas as pd
import numpy as np
from py2neo import Graph, NodeMatcher
from pprint import pprint

def _download_file(response, filename):
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)

def _download_and_decompress_file(response, filename):
    decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
    filename = filename[:-3]
    with open(filename, 'w+') as f:
        while True:
            chunk = response.raw.read(1024)
            if not chunk:
                break
            string = decompressor.decompress(chunk).decode("latin-1") 
            f.write(string)

def download_datasets(selected_datasets, selected_downloads, decompress=False):
    for dataset, path in selected_datasets:
        if not os.path.exists(dataset):
            os.mkdir(dataset)
        for downloadable in selected_downloads:
            url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/%s/%s' %\
                  (path, downloadable)
            response = requests.get(url, stream=True)
            filename = '%s/%s' % (dataset, downloadable)
            # Not every dataset has all downloadables.
            if response.status_code != 200:
                continue
            if decompress and 'txt.gz' in filename:
                _download_and_decompress_file(response, filename)
            else:
                _download_file(response, filename)
        print('%s downloaded.' % dataset)

dwFiles = [
           'gene_attribute_edges.txt.gz',
           'gene_set_library_crisp.txt.gz',
           'gene_set_library_up_crisp.txt.gz',
           'gene_set_library_dn_crisp.txt.gz',
           'attribute_set_library_crisp.txt.gz',
           'attribute_set_library_up_crisp.txt.gz',
           'attribute_set_library_dn_crisp.txt.gz',
           'gene_list_terms.txt.gz',
           'attribute_list_entries.txt.gz']

In [11]:
# set up authenticated connection
#graph = GraphDatabase.driver("bolt://localhost:7687/PO2_Neo", auth=("neo4j", "ngs4"))
graph = Graph("bolt://localhost:7687/PublicOmics", auth=("neo4j", "ngs4"))

In [2]:
download_datasets([('Biocarta Pathways', 'biocarta')], dwFiles, decompress=True)
download_datasets([('HumanCyc Pathways', 'humancyc')], dwFiles, decompress=True)
download_datasets([('KEGG Pathways', 'kegg')], dwFiles, decompress=True)

Biocarta Pathways downloaded.


In [17]:
pathways = pd.read_csv('Biocarta Pathways/gene_attribute_edges.txt', sep='\t')
pathways = pathways.drop(["source_desc","target_desc"], axis=1)
pathways = pathways.iloc[1:]
pathways.head(3)
for row in pathways.itertuples():
    s2 = "MERGE (g:Gene { Symbol: '%s'}) SET g.EntrezId='%s' "%(row.source, row.source_id)
    s3 = "MERGE (p:Pathway { Name: '%s', Id: '%s', Origin: 'Biocarta'})"%(row.target,row.target_id)
    r1 = "MERGE (g)-[:BELONGS_TO{weight:%s}]->(p)"%(row.weight)
    subStmt = "{} {} {}".format(s2,s3,r1)
    graph.run(subStmt)
    
pathways = pd.read_csv('HumanCyc Pathways/gene_attribute_edges.txt', sep='\t')
pathways = pathways.drop(["target_id","target_desc"], axis=1)
pathways = pathways.iloc[1:]
pathways.head(3)
for row in pathways.itertuples():
    s2 = "MERGE (g:Gene { Symbol: '%s'}) SET g.EntrezId='%s', g.UniprotACC='%s' "%(row.source, row.source_id, row.source_desc)
    s3 = "MERGE (p:Pathway { Name: '%s', Origin: 'HumanCyc'})"%(row.target.replace('&','').replace(';','').replace("'",''))
    r1 = "MERGE (g)-[:BELONGS_TO{weight:%s}]->(p)"%(row.weight)
    subStmt = "{} {} {}".format(s2,s3,r1)
    graph.run(subStmt)
    
pathways = pd.read_csv('KEGG Pathways/gene_attribute_edges.txt', sep='\t')
pathways = pathways.drop(["source_desc","target_id"], axis=1)
pathways = pathways.iloc[1:]
pathways.head(3)
for row in pathways.itertuples():
    #pprint(row)  , : '%s'
    s2 = "MERGE (g:Gene {Symbol: '%s'}) SET g.EntrezId='%s'"%(row.source, row.source_id)
    s3 = "MERGE (p:Pathway {Name: '%s', Id: '%s', Origin:'Kegg'})"%(row.target,row.target_desc)
    r1 = "MERGE (g)-[:BELONGS_TO{weight:%s}]->(p)"%(row.weight)
    subStmt = "{} {} {}".format(s2,s3,r1)
    graph.run(subStmt)

Unnamed: 0,source,source_id,target,target_id,weight
1,ADAR,103,antisense pathway,100241,1.000000
2,NONO,4841,antisense pathway,100241,1.000000
3,SFPQ,6421,antisense pathway,100241,1.000000
4,MATR3,9782,antisense pathway,100241,1.000000
5,AHSP,51327,hemoglobins chaperone,100250,1.000000
...,...,...,...,...,...
4505,C9,735,alternative complement pathway,100243,1.000000
4506,CFP,5199,alternative complement pathway,100243,1.000000
4507,CFD,1675,alternative complement pathway,100243,1.000000
4508,AGO2,27161,dicer pathway,100188,1.000000


In [3]:
# ('OMIM Gene-Disease Associations', 'omim'),
download_datasets([('OMIM Gene-Disease Associations', 'omim')], dwFiles, decompress=True)
# ('GAD Gene-Disease Associations', 'gad'),
download_datasets([('GAD Gene-Disease Associations', 'gad')], dwFiles, decompress=True)
# ('DISEASES Curated Gene-Disease Assocation Evidence Scores', 'jensendiseasecurated'),
download_datasets([('DISEASES Curated Gene-Disease Assocation Evidence Scores', 'jensendiseasecurated')], dwFiles, decompress=True)


OMIM Gene-Disease Associations downloaded.
GAD Gene-Disease Associations downloaded.
DISEASES Curated Gene-Disease Assocation Evidence Scores downloaded.


In [15]:
disEntries = pd.read_csv('GAD Gene-Disease Associations/gene_attribute_edges.txt', sep='\t')
disEntries = disEntries.drop(["source_desc","target_id"], axis=1)
disEntries = disEntries.iloc[1:]
disEntries.head(3)
for row in disEntries.itertuples():
    for innerDis in row.target.split(';'):
        s1 = "MERGE (g:Gene {Symbol: '%s'}) SET g.EntrezId='%s'"%(row.source, row.source_id)
        s3 = "MERGE (p:Disease {Name: '%s', Origin:'GAD'})"%(innerDis.replace("'","\\'").strip())
        r1 = "MERGE (p)-[:ASSOCIATION_TO{weight:%s}]->(g)"%(row.weight)
        if row.target_desc != "unknown":
            s2 = "MERGE (c:DiseaseCategory {Name: '%s'})"%(row.target_desc)
            r2 = "MERGE (p)-[:CATEGORIZED_BY]->(c)"
            subStmt = "{} {} {} {} {}".format(s1,s2,s3,r1,r2)
        else:
            subStmt = "{} {} {}".format(s1,s3,r1)
        #print(subStmt+"\n")
        graph.run(subStmt)

In [20]:
# A question mark, "?", before the phenotype name indicates that the relationship between 
# the phenotype and gene is provisional. More details about this relationship are provided 
# in the comment field of the map and in the gene and phenotype OMIM entries.
disEntries = pd.read_csv('OMIM Gene-Disease Associations/gene_attribute_edges.txt', sep='\t')
#disEntries = disEntries.drop(["source_desc","target_id"], axis=1)
disEntries = disEntries.iloc[1:]
disEntries.head(3)
for row in disEntries.itertuples():
    cleanedDis = row.target.replace("'","\\'").replace("{","").replace("}","").replace("[","").replace("]","").strip()
    s1 = "MERGE (g:Gene {Symbol: '%s'}) SET g.EntrezId='%s' SET g.OmimId='%s'"%(row.source, row.source_id, row.source_desc.replace("OMIM:",""))
    if cleanedDis.startswith('?'):
        s3 = "MERGE (p:Disease {Name: '%s', Origin:'OMIM', PhenotypeClass: '%s'})"%(cleanedDis.replace('?',''), row.target_id)
        r1 = "MERGE (p)-[:PROVISIONAL{weight:%s}]->(g)"%(row.weight)
    else :
        s3 = "MERGE (p:Disease {Name: '%s', Origin:'OMIM', PhenotypeClass: '%s'})"%(cleanedDis, row.target_id)
        r1 = "MERGE (p)-[:ASSOCIATION_TO{weight:%s}]->(g)"%(row.weight)
    subStmt = "{} {} {}".format(s1,s3,r1)
    #print(subStmt+"\n")
    graph.run(subStmt)
    
    

MERGE (g:Gene {Symbol: 'CCR5'}) SET g.EntrezId='1234' SET g.OmimId='601373' MERGE (p:Disease {Name: 'hepatitis c virus, resistance to', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'CCR5'}) SET g.EntrezId='1234' SET g.OmimId='601373' MERGE (p:Disease {Name: 'west nile virus, susceptibility to', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'CCR5'}) SET g.EntrezId='1234' SET g.OmimId='601373' MERGE (p:Disease {Name: 'diabetes mellitus, insulin-dependent, 22', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'CCR2'}) SET g.EntrezId='729230' SET g.OmimId='601267' MERGE (p:Disease {Name: 'hiv infection, susceptibility/resistance to', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'CCR5'}) SET g.EntrezId='1234' SET g.OmimId='601373' MERGE

MERGE (g:Gene {Symbol: 'KCTD7'}) SET g.EntrezId='154881' SET g.OmimId='611725' MERGE (p:Disease {Name: 'epilepsy, progressive myoclonic 3, with or without intracellular inclusions', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'SCARB2'}) SET g.EntrezId='950' SET g.OmimId='602257' MERGE (p:Disease {Name: 'epilepsy, progressive myoclonic 4, with or without renal failure', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'PRICKLE2'}) SET g.EntrezId='166336' SET g.OmimId='608501' MERGE (p:Disease {Name: 'epilepsy, progressive myoclonic 5', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'GOSR2'}) SET g.EntrezId='9570' SET g.OmimId='604027' MERGE (p:Disease {Name: 'epilepsy, progressive myoclonic 6', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {S


MERGE (g:Gene {Symbol: 'C19orf12'}) SET g.EntrezId='83636' SET g.OmimId='614297' MERGE (p:Disease {Name: 'neurodegeneration with brain iron accumulation 4', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'COASY'}) SET g.EntrezId='80347' SET g.OmimId='609855' MERGE (p:Disease {Name: 'neurodegeneration with brain iron accumulation 6', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'EWSR1'}) SET g.EntrezId='2130' SET g.OmimId='133450' MERGE (p:Disease {Name: 'ewing sarcoma', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'EWSR1'}) SET g.EntrezId='2130' SET g.OmimId='133450' MERGE (p:Disease {Name: 'neuroepithelioma', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'HMNJ'}) SET g.EntrezId='80768' SET g.OmimId='605726' MERGE (p:Disease {Na

MERGE (g:Gene {Symbol: 'DHCR7'}) SET g.EntrezId='1717' SET g.OmimId='602858' MERGE (p:Disease {Name: 'smith-lemli-opitz syndrome', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'RAI1'}) SET g.EntrezId='10743' SET g.OmimId='607642' MERGE (p:Disease {Name: 'smith-magenis syndrome', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'DYM'}) SET g.EntrezId='54808' SET g.OmimId='607461' MERGE (p:Disease {Name: 'dyggve-melchior-clausen disease', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'DYM'}) SET g.EntrezId='54808' SET g.OmimId='607461' MERGE (p:Disease {Name: 'smith-mccort dysplasia', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'RAB33B'}) SET g.EntrezId='83452' SET g.OmimId='605950' MERGE (p:Disease {Name: 'smith-mccort dysplasia 2'


MERGE (g:Gene {Symbol: 'PCSK9'}) SET g.EntrezId='255738' SET g.OmimId='607786' MERGE (p:Disease {Name: 'low density lipoprotein cholesterol level qtl 1', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'CYP11B2'}) SET g.EntrezId='1585' SET g.OmimId='124080' MERGE (p:Disease {Name: 'aldosterone to renin ratio raised', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'CYP11B2'}) SET g.EntrezId='1585' SET g.OmimId='124080' MERGE (p:Disease {Name: 'hypoaldosteronism, congenital, due to cmo ii deficiency', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'CYP11B2'}) SET g.EntrezId='1585' SET g.OmimId='124080' MERGE (p:Disease {Name: 'low renin hypertension, susceptibility to', Origin:'OMIM', PhenotypeClass: '3'}) MERGE (p)-[:ASSOCIATION_TO{weight:1.000000}]->(g)

MERGE (g:Gene {Symbol: 'CYP11B2'}) SET g.En

In [21]:
disEntries = pd.read_csv('DISEASES Curated Gene-Disease Assocation Evidence Scores/gene_attribute_edges.txt', sep='\t')
#disEntries = disEntries.drop(["source_desc","target_id"], axis=1)
disEntries = disEntries.iloc[1:]
disEntries

for row in pathways.itertuples():
    #pprint(row)  , : '%s'
    s2 = "MERGE (g:Gene {Symbol: '%s'}) SET g.EntrezId='%s' SET g.EnsembleId='%s'"%(row.source, row.source_id, row.source_desc)
    s3 = "MERGE (p:Disease {Name: '%s', Id: '%s', Origin:'JensenDiseaseCurated'})"%(row.target,row.target_desc)
    r1 = "MERGE (g)-[:BELONGS_TO{weight:%s}]->(p)"%(row.weight)
    subStmt = "{} {} {}".format(s2,s3,r1)
    graph.run(subStmt)



Unnamed: 0,source,source_desc,source_id,target,target_desc,target_id,weight
1,VHL,ENSP00000256474,7428,hemangioblastoma,DOID:5241,-666.0,1.000000
2,VHL,ENSP00000256474,7428,von hippel-lindau disease,DOID:14175,-666.0,1.000000
3,VHL,ENSP00000256474,7428,hemangioma,DOID:255,-666.0,1.000000
4,VHL,ENSP00000256474,7428,cell type benign neoplasm,DOID:0060084,-666.0,1.000000
5,VHL,ENSP00000256474,7428,benign neoplasm,DOID:0060072,-666.0,1.000000
...,...,...,...,...,...,...,...
18140,ABCC2,ENSP00000359478,1244,dubin-johnson syndrome,DOID:12308,-666.0,1.000000
18141,UGT1A1,ENSP00000304845,54658,crigler-najjar syndrome,DOID:3803,-666.0,1.000000
18142,UGT1A1,ENSP00000304845,54658,gilbert syndrome,DOID:2739,-666.0,1.000000
18143,SERPINA1,ENSP00000348068,5265,alpha 1-antitrypsin deficiency,DOID:13372,-666.0,1.000000


In [None]:

        # ('CTD Gene-Disease Associations', 'ctddisease'),
        # ('GEO Signatures of Differentially Expressed Genes for Diseases', 'geodisease'),
        # ('HPO Gene-Disease Associations', 'hpo'),
        
        # ('GWASdb SNP-Disease Associations', 'gwasdbdisease'),
        # ('PhosphoSitePlus Phosphosite-Disease Associations', 'phosphositeplusdisease'),
        # ('DISEASES Text-mining Gene-Disease Assocation Evidence Scores', 'jensendiseasetextmining'),


In [None]:
https://civicdb.org/downloads/nightly/nightly-civic_accepted.vcf