After we saw the following [tweet](https://twitter.com/cecilejanssens/status/1142864439638679554) from [@cecilejanssens](https://twitter.com/cecilejanssens):

<blockquote class="twitter-tweet" data-lang="en"><p lang="en" dir="ltr">Why I think (current) polygenic risk scores are just a phase? Because I believe that scientists will find better ways ways to model the role of SNPs in molecular pathways and their combined contribution to disease risk. This cannot be it. <a href="https://t.co/p9nP5hgQV0">pic.twitter.com/p9nP5hgQV0</a></p>&mdash; Cecile Janssens (@cecilejanssens) <a href="https://twitter.com/cecilejanssens/status/1142864439638679554?ref_src=twsrc%5Etfw">June 23, 2019</a></blockquote>
<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

Daniel and I asked how easy it would be to map single nucleotide polymorphisms (SNPs) from a [Bio2BEL](https://github.com/bio2bel) repository such as [``bio2bel_phewascatalog``](https://github.com/bio2bel/phewascatalog) to each of the major pathway databases: KEGG, Reactome, and WikiPathways.

In [None]:
import getpass
import os
import sys
import time
from collections import defaultdict, Counter

import bio2bel_kegg
import bio2bel_phewascatalog
import bio2bel_reactome
import bio2bel_wikipathways
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [None]:
%matplotlib inline

In [None]:
# Report the Python interpreter version used to run this notebook.
print(sys.version)

In [None]:
# Report the current local time, i.e. when this cell was executed.
print(time.asctime())

In [None]:
# Report the login name of the user running the notebook.
print(getpass.getuser())

In [None]:
# Print the version reported by each Bio2BEL package used in this notebook
# (one line per package: KEGG, Reactome, WikiPathways, PheWAS Catalog).
print(bio2bel_kegg.get_version())
print(bio2bel_reactome.get_version())
print(bio2bel_wikipathways.get_version())
print(bio2bel_phewascatalog.get_version())

## Map SNPs to Genes

Generate the gene-to-SNP mappings from the PheWAS Catalog. The PheWAS Catalog could be swapped for dbSNP, the GWAS Catalog, or any other source that annotates SNPs with genes.

In [None]:
# Load the PheWAS Catalog table through the Bio2BEL parser.
phewascatalog_df = bio2bel_phewascatalog.parser.get_df()
# Keep only the rows that have a gene name; rows without one cannot
# contribute to the gene-to-SNP mapping built in the next cell.
phewascatalog_df = phewascatalog_df[phewascatalog_df.gene_name.notna()]
# Preview the first rows of the filtered table.
phewascatalog_df.head()

In [None]:
# Group the SNPs by gene symbol: each (snp, gene_name) row adds the SNP
# to the set kept for that gene. The result is a plain dict of sets.
gene_to_snps = {}
for snp, gene_symbol in phewascatalog_df[['snp', 'gene_name']].values:
    gene_to_snps.setdefault(gene_symbol, set()).add(snp)

In [None]:
# Distribution of the number of SNPs mapped to each gene:
# key = number of SNPs, value = number of genes having that many SNPs.
Counter(len(v) for v in gene_to_snps.values())

## Map Genes to Pathways

In [None]:
def get_pathway_to_gene(manager):
    """Map each pathway to the set of HGNC symbols of its proteins.

    Pathways are taken from ``manager.get_all_pathways()`` and are used
    directly as the dictionary keys. A pathway that has no proteins gets
    no entry in the result.

    :param manager: An object exposing ``get_all_pathways()``, whose
        pathways have a ``proteins`` collection with ``hgnc_symbol``
        attributes
    :return: A dict from pathway to a set of HGNC symbols
    """
    symbols_by_pathway = {}

    for pathway in tqdm(manager.get_all_pathways(), desc='Getting pathways/genes'):
        for protein in pathway.proteins:
            # The entry is created lazily, on the first protein seen.
            symbols_by_pathway.setdefault(pathway, set()).add(protein.hgnc_symbol)

    return symbols_by_pathway

def get_pathway_to_snp(pathway_to_gene, snps_by_gene=None):
    """Combine the mappings to relate pathways to SNPs.

    This could be further extended to weight pathway-SNP associations
    by count, or to normalize by the frequency of each SNP being
    mapped to multiple genes.

    :param pathway_to_gene: A mapping from each pathway to an iterable
        of its gene symbols
    :param snps_by_gene: A mapping from gene symbol to a set of SNPs.
        When omitted, the notebook-level ``gene_to_snps`` mapping is
        used, so existing one-argument calls behave as before.
    :return: A dict from pathway to the union of the SNPs of its genes.
        A pathway whose genes have no known SNPs maps to an empty set;
        a pathway with no genes at all is omitted.
    """
    if snps_by_gene is None:
        # Fall back to the mapping built earlier in the notebook.
        snps_by_gene = gene_to_snps

    pathway_to_snp = defaultdict(set)
    # Shared empty default so a new set isn't allocated per missing gene.
    no_snps = frozenset()

    for pathway, gene_symbols in pathway_to_gene.items():
        for gene_symbol in gene_symbols:
            # Indexing creates the pathway's entry even when the gene
            # contributes no SNPs.
            pathway_to_snp[pathway].update(snps_by_gene.get(gene_symbol, no_snps))

    return dict(pathway_to_snp)

### WikiPathways

In [None]:
# Create the Bio2BEL WikiPathways manager.
wikipathways_manager = bio2bel_wikipathways.Manager()

# Populate only when the manager reports that it is not populated yet,
# so re-running this cell does not repeat the population step.
if not wikipathways_manager.is_populated():
    wikipathways_manager.populate()

# Show the manager's summary of its contents.
wikipathways_manager.summarize()

In [None]:
# First map each WikiPathways pathway to its gene symbols, then combine
# that with the gene-to-SNP mapping to relate pathways to SNPs.
wikipathways_pathway_to_gene = get_pathway_to_gene(wikipathways_manager)
wikipathways_pathway_to_snp = get_pathway_to_snp(wikipathways_pathway_to_gene)

In [None]:
# Flatten the pathway-to-SNP mapping into a long table with one row per
# (pathway, SNP) pair, identified by the pathway's WikiPathways id and name.
wikipathways_df = pd.DataFrame([
    (pathway.wikipathways_id, pathway.name, snp)
    for pathway, snps in wikipathways_pathway_to_snp.items()
    for snp in snps
], columns=['wikipathways_id', 'name', 'snp'])
# Preview the first rows.
wikipathways_df.head()

In [None]:
# Write the table as a tab-separated file (relative path, no index column).
wikipathways_df.to_csv('wikipathways_to_snp.tsv', index=False, sep='\t')

### KEGG

In [None]:
# Create the Bio2BEL KEGG manager.
kegg_manager = bio2bel_kegg.Manager()

# Populate only when the manager reports that it is not populated yet,
# so re-running this cell does not repeat the population step.
if not kegg_manager.is_populated():
    kegg_manager.populate()
    
# Show the manager's summary of its contents.
kegg_manager.summarize()

In [None]:
# First map each KEGG pathway to its gene symbols, then combine that
# with the gene-to-SNP mapping to relate pathways to SNPs.
kegg_pathway_to_gene = get_pathway_to_gene(kegg_manager)
kegg_pathway_to_snp = get_pathway_to_snp(kegg_pathway_to_gene)

In [None]:
# Flatten the pathway-to-SNP mapping into a long table with one row per
# (pathway, SNP) pair, identified by the pathway's KEGG id and name.
kegg_df = pd.DataFrame([
    (pathway.kegg_id, pathway.name, snp)
    for pathway, snps in kegg_pathway_to_snp.items()
    for snp in snps
], columns=['kegg_id', 'name', 'snp'])
# Preview the first rows.
kegg_df.head()

In [None]:
# Write the table as a tab-separated file (relative path, no index column).
kegg_df.to_csv('kegg_to_snp.tsv', index=False, sep='\t')

### Reactome

In [None]:
# Create the Bio2BEL Reactome manager.
reactome_manager = bio2bel_reactome.Manager()

# Populate only when the manager reports that it is not populated yet,
# so re-running this cell does not repeat the population step.
if not reactome_manager.is_populated():
    reactome_manager.populate()
    
# Show the manager's summary of its contents.
reactome_manager.summarize()

In [None]:
# First map each Reactome pathway to its gene symbols, then combine that
# with the gene-to-SNP mapping to relate pathways to SNPs.
reactome_pathway_to_gene = get_pathway_to_gene(reactome_manager)
reactome_pathway_to_snp = get_pathway_to_snp(reactome_pathway_to_gene)

In [None]:
# Flatten the pathway-to-SNP mapping into a long table with one row per
# (pathway, SNP) pair, identified by the pathway's Reactome id and name.
reactome_df = pd.DataFrame([
    (pathway.reactome_id, pathway.name, snp)
    for pathway, snps in reactome_pathway_to_snp.items()
    for snp in snps
], columns=['reactome_id', 'name', 'snp'])
# Preview the first rows.
reactome_df.head()

In [None]:
# Write the table as a tab-separated file (relative path, no index column).
reactome_df.to_csv('reactome_to_snp.tsv', index=False, sep='\t')