In [None]:
import getpass
import os
import zlib

import pandas as pd
import requests
from py2neo import Graph, NodeMatcher


def _download_file(response, filename):
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)

def _download_and_decompress_file(response, filename):
    decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
    filename = filename[:-3]
    with open(filename, 'w+') as f:
        while True:
            chunk = response.raw.read(1024)
            if not chunk:
                break
            string = decompressor.decompress(chunk).decode("latin-1") 
            f.write(string)

def download_datasets(selected_datasets, selected_downloads, decompress=False,
                      timeout=60):
    """Download Harmonizome files for each dataset into a per-dataset directory.

    Parameters
    ----------
    selected_datasets : iterable of (str, str)
        ``(dataset, path)`` pairs. ``dataset`` names the local directory the
        files are saved in; ``path`` is the dataset's segment of the download
        URL.
    selected_downloads : iterable of str
        File names requested for every dataset.
    decompress : bool, optional
        When true, files ending in ``.txt.gz`` are gunzipped while they are
        downloaded and stored without the ``.gz`` suffix.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled server cannot block the
        notebook forever.

    Files the server does not answer with HTTP 200 are skipped silently.
    A ``'<dataset> downloaded.'`` line is printed after each dataset.
    """
    for dataset, path in selected_datasets:
        os.makedirs(dataset, exist_ok=True)
        for downloadable in selected_downloads:
            url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/%s/%s' %\
                  (path, downloadable)
            filename = '%s/%s' % (dataset, downloadable)
            # The context manager releases the streamed connection on every
            # path, including the skip below.
            with requests.get(url, stream=True, timeout=timeout) as response:
                # Not every dataset has all downloadables.
                if response.status_code != 200:
                    continue
                if decompress and filename.endswith('.txt.gz'):
                    _download_and_decompress_file(response, filename)
                else:
                    _download_file(response, filename)
        print('%s downloaded.' % dataset)

# Harmonizome file names requested for every dataset.
dwFiles = [
    'gene_attribute_edges.txt.gz',
    'gene_list_terms.txt.gz',
    'attribute_list_entries.txt.gz',
]

In [None]:
# Connection to the local Neo4j server.
# Assumes a Neo4j database with this name has already been created.
dbName = "PublicOmics"
# The password comes from the NEO4J_PASSWORD environment variable, with an
# interactive prompt as fallback, so no credential is stored in the notebook.
pw = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
# NOTE(review): newer py2neo releases select the database with
# Graph(..., name=dbName) — confirm the installed version honours the URI path.
graph = Graph("bolt://localhost:7687/"+dbName, auth=("neo4j", pw))

In [None]:
# Fetch and gunzip the three pathway datasets, in this order.
download_datasets(
    [
        ('Biocarta Pathways', 'biocarta'),
        ('HumanCyc Pathways', 'humancyc'),
        ('KEGG Pathways', 'kegg'),
    ],
    dwFiles,
    decompress=True,
)

In [None]:
def _cypher_number(value):
    """Convert a weight cell to the int or float a bare Cypher literal would give.

    Keeps the stored property type the same as when the value was written
    directly into the statement text (``1`` -> int, ``1.0`` -> float).
    """
    text = str(value)
    try:
        return int(text)
    except ValueError:
        return float(text)


def _load_pathway_edges(edge_file, unused_columns, statement, row_parameters):
    """Read one gene/pathway edge list and MERGE every row into ``graph``.

    Parameters
    ----------
    edge_file : str
        Tab-separated edge list to read.
    unused_columns : list of str
        Columns dropped from the frame before iterating.
    statement : str
        Cypher statement using ``$name`` parameters; run once per row.
    row_parameters : callable
        Maps one ``itertuples`` row to the parameter dict for ``statement``.

    Returns
    -------
    pandas.DataFrame
        The frame that was loaded (after the drops).

    Values are passed as query parameters instead of being formatted into the
    statement, so quotes or backslashes in a name cannot break or alter the
    query.
    """
    edges = pd.read_csv(edge_file, sep='\t').drop(unused_columns, axis=1)
    # The first parsed row is skipped — presumably a second header line in the
    # file; TODO confirm against the downloaded data.
    edges = edges.iloc[1:]
    for row in edges.itertuples():
        graph.run(statement, row_parameters(row))
    return edges


# Biocarta: pathways are keyed by name and target_id.
pathways = _load_pathway_edges(
    'Biocarta Pathways/gene_attribute_edges.txt',
    ["source_desc", "target_desc"],
    "MERGE (g:Gene {Symbol: $symbol}) SET g.EntrezId = $entrez_id "
    "MERGE (p:Pathway {Name: $name, Id: $pathway_id, Origin: 'Biocarta'}) "
    "MERGE (g)-[:BELONGS_TO {weight: $weight}]->(p)",
    lambda row: {
        "symbol": str(row.source),
        "entrez_id": str(row.source_id),
        "name": str(row.target),
        "pathway_id": str(row.target_id),
        "weight": _cypher_number(row.weight),
    },
)

# HumanCyc: pathways are keyed by name only; source_desc is stored on the gene
# as UniprotACC.
pathways = _load_pathway_edges(
    'HumanCyc Pathways/gene_attribute_edges.txt',
    ["target_id", "target_desc"],
    "MERGE (g:Gene {Symbol: $symbol}) "
    "SET g.EntrezId = $entrez_id, g.UniprotACC = $uniprot_acc "
    "MERGE (p:Pathway {Name: $name, Origin: 'HumanCyc'}) "
    "MERGE (g)-[:BELONGS_TO {weight: $weight}]->(p)",
    lambda row: {
        "symbol": str(row.source),
        "entrez_id": str(row.source_id),
        "uniprot_acc": str(row.source_desc),
        # '&', ';' and "'" are still removed so the stored names stay the
        # same as the ones this cell has always written.
        "name": row.target.replace('&', '').replace(';', '').replace("'", ''),
        "weight": _cypher_number(row.weight),
    },
)

# KEGG: pathways are keyed by name and target_desc (stored as Id).
pathways = _load_pathway_edges(
    'KEGG Pathways/gene_attribute_edges.txt',
    ["source_desc", "target_id"],
    "MERGE (g:Gene {Symbol: $symbol}) SET g.EntrezId = $entrez_id "
    "MERGE (p:Pathway {Name: $name, Id: $pathway_id, Origin: 'Kegg'}) "
    "MERGE (g)-[:BELONGS_TO {weight: $weight}]->(p)",
    lambda row: {
        "symbol": str(row.source),
        "entrez_id": str(row.source_id),
        "name": str(row.target),
        "pathway_id": str(row.target_desc),
        "weight": _cypher_number(row.weight),
    },
)