This notebook downloads gene sets and saves them as text files.

In [13]:
import ndex2
import requests
import json
from multiprocessing import Pool
import pandas as pd
import zipfile
import io

# Getting Wikipathways

In [2]:
# Fetch the NDEx network-set record and keep the entries listed under its
# 'networks' key; each entry is later passed to get_nodes() as a network UUID.
r = requests.get("http://public.ndexbio.org/v2/networkset/453c1c63-5c10-11e9-9f06-0ac135e8bacf")
# Fail here on an HTTP error instead of on a confusing JSON/KeyError below.
r.raise_for_status()
networkset_record = r.json()
wikipathway_networks = networkset_record['networks']

In [3]:
def get_nodes(uuid):
    """Download one NDEx network and extract its gene-product node names.

    Parameters
    ----------
    uuid : str
        Identifier of the network on http://public.ndexbio.org.

    Returns
    -------
    tuple
        ``(name, genes)``. ``genes`` is the list of ``name`` attributes of the
        nodes whose ``Type`` is ``'GeneProduct'`` and whose
        ``__gpml:XrefDatasource`` is ``'Ensembl'`` or ``'Entrez Gene'``.
        ``name`` is built from the network name, which is split on ``' - '``
        and rendered as ``"<second part> (<first part>)"``; when the network
        name has no ``' - '`` separator the stripped name is returned as is.
    """
    n = ndex2.create_nice_cx_from_server("http://public.ndexbio.org", uuid=uuid)
    dG = n.to_networkx()

    accepted_sources = ('Ensembl', 'Entrez Gene')
    # nodes(data=True) yields (node, attrs) pairs on both old and current
    # networkx; the `Graph.node` attribute no longer exists in networkx >= 2.4.
    # .get() skips nodes that lack the attribute instead of raising KeyError.
    genes = [
        attrs['name']
        for _, attrs in dG.nodes(data=True)
        if attrs.get('Type') == 'GeneProduct'
        and attrs.get('__gpml:XrefDatasource') in accepted_sources
    ]

    name_array = dG.name.split(' - ')
    if len(name_array) < 2:
        # No ' - ' separator: nothing to reorder, keep the name unchanged.
        name = dG.name.strip()
    else:
        name = name_array[1].strip() + f" ({name_array[0].strip()})"

    return (name, genes)

In [4]:
# n = ndex2.create_nice_cx_from_server("http://public.ndexbio.org", uuid="cbcce7eb-7785-11e9-848d-0ac135e8bacf")
# dG = n.to_networkx()

In [5]:
# Download every WikiPathways network using 16 worker processes. The `with`
# block shuts the pool down once map() has returned, so the workers do not
# linger for the rest of the kernel session.
with Pool(16) as pool:
    result = pool.map(get_nodes, wikipathway_networks)

# Keep only pathways with at least two genes, and clean each gene name of
# surrounding whitespace and embedded newlines (the output file is line-based).
wikipathway_nodes = {
    pathway: [gene.strip().replace('\n', '') for gene in genes]
    for pathway, genes in dict(result).items()
    if len(genes) >= 2
}

In [6]:
# Save the WikiPathways gene sets, one per line: "<name>;<gene>,<gene>,...".
with open("genesets/wikipathways.txt", "w") as out_file:
    out_file.writelines(
        f"{name};{','.join(genelist)}\n"
        for name, genelist in wikipathway_nodes.items()
    )

# Getting Cancer Hallmarks from msigdb

In [7]:
# Newline-separated gene set identifiers. Each line is passed unchanged to
# get_cancer_hallmarks(), which places it in the `geneSetName` query parameter.
# NOTE(review): 'HALLMARK_REACTIVE_OXIGEN_SPECIES_PATHWAY' is spelled 'OXIGEN'
# here; since the identifier is sent verbatim, confirm the spelling against the
# MSigDB release being queried before changing it.
hallmarks='''HALLMARK_ADIPOGENESIS
HALLMARK_ALLOGRAFT_REJECTION
HALLMARK_ANDROGEN_RESPONSE
HALLMARK_ANGIOGENESIS
HALLMARK_APICAL_JUNCTION
HALLMARK_APICAL_SURFACE
HALLMARK_APOPTOSIS
HALLMARK_BILE_ACID_METABOLISM
HALLMARK_CHOLESTEROL_HOMEOSTASIS
HALLMARK_COAGULATION
HALLMARK_COMPLEMENT
HALLMARK_DNA_REPAIR
HALLMARK_E2F_TARGETS
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION
HALLMARK_ESTROGEN_RESPONSE_EARLY
HALLMARK_ESTROGEN_RESPONSE_LATE
HALLMARK_FATTY_ACID_METABOLISM
HALLMARK_G2M_CHECKPOINT
HALLMARK_GLYCOLYSIS
HALLMARK_HEDGEHOG_SIGNALING
HALLMARK_HEME_METABOLISM
HALLMARK_HYPOXIA
HALLMARK_IL2_STAT5_SIGNALING
HALLMARK_IL6_JAK_STAT3_SIGNALING
HALLMARK_INFLAMMATORY_RESPONSE
HALLMARK_INTERFERON_ALPHA_RESPONSE
HALLMARK_INTERFERON_GAMMA_RESPONSE
HALLMARK_KRAS_SIGNALING_DN
HALLMARK_KRAS_SIGNALING_UP
HALLMARK_MITOTIC_SPINDLE
HALLMARK_MTORC1_SIGNALING
HALLMARK_MYC_TARGETS_V1
HALLMARK_MYC_TARGETS_V2
HALLMARK_MYOGENESIS
HALLMARK_NOTCH_SIGNALING
HALLMARK_OXIDATIVE_PHOSPHORYLATION
HALLMARK_P53_PATHWAY
HALLMARK_PANCREAS_BETA_CELLS
HALLMARK_PEROXISOME
HALLMARK_PI3K_AKT_MTOR_SIGNALING
HALLMARK_PROTEIN_SECRETION
HALLMARK_REACTIVE_OXIGEN_SPECIES_PATHWAY
HALLMARK_SPERMATOGENESIS
HALLMARK_TGF_BETA_SIGNALING
HALLMARK_TNFA_SIGNALING_VIA_NFKB
HALLMARK_UNFOLDED_PROTEIN_RESPONSE
HALLMARK_UV_RESPONSE_DN
HALLMARK_UV_RESPONSE_UP
HALLMARK_WNT_BETA_CATENIN_SIGNALING
HALLMARK_XENOBIOTIC_METABOLISM'''

In [8]:
def get_cancer_hallmarks(name):
    """Download one MSigDB gene set as text and return its title and genes.

    The response is read line by line: the first line is taken as the gene set
    name, the second line is skipped, and every remaining non-blank line is
    taken as one gene.

    Parameters
    ----------
    name : str
        Gene set identifier, inserted as the ``geneSetName`` query parameter.

    Returns
    -------
    tuple
        ``(title, geneset)`` where ``title`` is the first response line with
        underscores replaced by spaces and title-cased, and ``geneset`` is the
        list of gene lines with blank entries removed.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the response body is empty.
    """
    url = f"http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName={name}&fileType=txt"
    reply = requests.get(url)
    # Do not parse an HTTP error page as if it were a gene set.
    reply.raise_for_status()

    # splitlines() copes with both '\n' and '\r\n' and, unlike split('\n'),
    # does not produce a trailing empty string for the final newline.
    lines = reply.content.decode('utf-8').splitlines()
    if not lines:
        raise ValueError(f"Empty response for gene set {name!r}")

    title = lines[0].replace('_', ' ').title()
    # Blank lines would otherwise end up in the gene set as empty gene names.
    geneset = [gene.strip() for gene in lines[2:] if gene.strip()]

    return (title, geneset)

In [9]:
# Fetch all hallmark gene sets using 16 worker processes. The `with` block
# shuts the pool down once map() has returned, so the workers do not linger.
with Pool(16) as pool:
    out = pool.map(get_cancer_hallmarks, hallmarks.split('\n'))
msigdb_hallmarks = dict(out)

In [10]:
# Save the MSigDB hallmark gene sets, one per line: "<name>;<gene>,<gene>,...".
with open("genesets/msigdb_hallmarks.txt", "w") as out_file:
    out_file.writelines(
        f"{name};{','.join(geneset)}\n"
        for name, geneset in msigdb_hallmarks.items()
    )

# Getting CORUM complexes

In [35]:
# Download the CORUM archive and load allComplexes.txt into a DataFrame.
r = requests.get("https://mips.helmholtz-muenchen.de/corum/download/allComplexes.txt.zip")
# Fail on an HTTP error instead of on an opaque BadZipFile below.
r.raise_for_status()

# Context managers close both the archive and the member file after reading.
with zipfile.ZipFile(io.BytesIO(r.content)) as z, z.open("allComplexes.txt") as complexes_file:
    corum = pd.read_csv(complexes_file, sep='\t', index_col='ComplexID')

# Map each complex name to its list of subunit gene names (';'-separated in
# the source column). Rows without subunits are dropped: they cannot form a
# gene set and a missing value has no .split().
# NOTE(review): if ComplexName is not unique, later rows replace earlier ones
# in the dict -- confirm that is acceptable.
subunits_by_complex = corum.set_index('ComplexName')['subunits(Gene name)'].dropna()
corum_map = {k: v.split(';') for k, v in subunits_by_complex.to_dict().items()}

In [39]:
# Save the CORUM complexes, one per line: "<name>;<gene>,<gene>,...".
with open("genesets/corum.txt", "w") as out_file:
    out_file.writelines(
        f"{name};{','.join(geneset)}\n"
        for name, geneset in corum_map.items()
    )

# Combining genesets

In [40]:
def parse_genesets(file):
    """Read a gene set file written as ``<name>;<gene>,<gene>,...`` per line.

    Blank lines are ignored. The line is split on its *last* ``';'`` so that a
    gene set name which itself contains ``';'`` is kept whole, and empty gene
    tokens (for example from a trailing comma) are dropped.

    Parameters
    ----------
    file : str
        Path of the gene set file to read.

    Returns
    -------
    dict
        Mapping of gene set name to its list of genes. When a name occurs more
        than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a non-blank line contains no ``';'`` separator.
    """
    geneset = {}
    with open(file) as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            name, separator, gene_field = line.rpartition(';')
            if not separator:
                raise ValueError(
                    f"{file}, line {line_number}: expected '<name>;<genes>' but found no ';'"
                )

            geneset[name] = [gene for gene in gene_field.split(',') if gene]

    return geneset

In [41]:
# Reload the three gene set files written above, in this order.
msigdb, wikipathways, corum = map(parse_genesets, (
    'genesets/msigdb_hallmarks.txt',
    'genesets/wikipathways.txt',
    'genesets/corum.txt',
))

In [42]:
# Fold the other collections into `msigdb` in place. On a name collision the
# collection merged later wins, so wikipathways overrides corum overrides msigdb.
for extra_genesets in (corum, wikipathways):
    msigdb.update(extra_genesets)

In [43]:
# Save the merged gene sets (held in `msigdb` after the in-place updates),
# one per line: "<name>;<gene>,<gene>,...".
with open("genesets/combined_genesets.txt", "w") as out_file:
    out_file.writelines(
        f"{k};{','.join(v)}\n"
        for k, v in msigdb.items()
    )

# Parsing the Disease Ontology (DO)

In [44]:
# Parse the Disease Ontology .obo file into a mapping of term id -> term name.
# Every term is expected to follow this layout:
#   [Term]  on line n
#   id:     on line n + 1
#   name:   on line n + 2
# A ValueError is raised as soon as an id or name line is found elsewhere.

r = requests.get("https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/master/src/ontology/HumanDO.obo")
# Fail on an HTTP error instead of silently parsing an error page.
r.raise_for_status()

header = []    # first three lines of the file, kept for the output file header
results = []   # (id, name) pairs in file order

found = False       # True while inside a [Term] stanza whose name is not read yet
found_line = None   # 1-based line number of the current [Term] marker
doid = None
name = None

for line_count, line in enumerate(r.content.decode('utf-8').split('\n'), start=1): 
    line = line.strip()

    if line_count <= 3: 
        header.append(line)

    if line == '[Term]': 
        found = True
        found_line = line_count 

    elif found and line.startswith('id:'): 
        if line_count - found_line != 1: 
            raise ValueError(f"Expected line {found_line + 1}, but line {line_count} found.")
        # Strip only the leading tag so a value containing 'id:' stays intact.
        doid = line[len('id:'):].strip()

    elif found and line.startswith("name:"): 
        if line_count - found_line != 2: 
            raise ValueError(f"Expected line {found_line + 2}, but line {line_count} found.")
        # Strip only the leading tag so a name containing 'name:' stays intact.
        name = line[len("name:"):].strip()

        results.append((doid, name))
        # Reset so the remaining lines of this stanza are ignored.
        found = False
        found_line = None
        doid = None
        name = None

doid_name_mapping = dict(results)

In [45]:
# Save the mapping: the saved header lines first, each prefixed with '#',
# then one "<id>%<name>" line per term.
with open("doid/doid_name_mappings.txt", "w") as out_file:
    out_file.writelines(f"#{header_line}\n" for header_line in header)
    out_file.writelines(
        f"{term_id}%{term_name}\n"
        for term_id, term_name in doid_name_mapping.items()
    )