In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib, collections, itertools, os, re, textwrap, logging
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from functools import reduce

from logging.config import dictConfig
from logging import getLogger

# Logging setup: the root logger emits DEBUG and above through a single
# stream handler 'h', formatted with timestamp, logger name, level and message.
LOGGING_CONFIG = {
    'version': 1,
    'formatters': {
        'f': {'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'},
    },
    'handlers': {
        'h': {
            'class': 'logging.StreamHandler',
            'formatter': 'f',
            'level': logging.DEBUG,
        },
    },
    'root': {'handlers': ['h'], 'level': logging.DEBUG},
}
dictConfig(LOGGING_CONFIG)

# Default figure font: 16pt sans-serif, trying HelveticaNeue first, then Helvetica.
font_settings = {
    'size': 16,
    'family': 'sans-serif',
    'sans-serif': ['HelveticaNeue', 'Helvetica'],
}
matplotlib.rc('font', **font_settings)

# Logger used by this notebook.
logger = getLogger('notebook')

# repo_dir is two directory levels above the current working directory.
repo_dir = os.path.realpath(os.path.join('..', '..'))

# The private/public data sub-directories are named after the directory this
# notebook is run from.
notebook_dir_name = os.path.basename(os.path.realpath(os.getcwd()))
private_data_dir = os.path.realpath(
    os.path.join(repo_dir, 'private_data', notebook_dir_name)
)
public_data_dir = os.path.realpath(
    os.path.join(repo_dir, 'public_data', notebook_dir_name)
)


## read InBio dataset

In [2]:
# Path to the gzipped, tab-separated InBio Map core PSI-MITAB file.
inbio_path = os.path.join(
    '/oak/stanford/groups/mrivas/public_data',
    'InWeb_InBioMap_2016_09_12/downloads',
    'InBio_Map_core_2016_09_12/core.psimitab.gz'
)

# Column names are supplied explicitly as the integers 0..15.
inbio_column_names = np.arange(16)
inbio_df = pd.read_csv(
    inbio_path,
    sep='\t',
    compression='gzip',
    names=inbio_column_names,
)


In [3]:
def extract_ensembl_id(x):
    """Return the value stored under the 'ensembl' key of a '|'-separated field.

    The field is expected to look like 'key1:value1|key2:value2|...'.

    Parameters
    ----------
    x : str
        Field to parse. Any non-string input (e.g. a float NaN from a
        missing cell) yields NaN.

    Returns
    -------
    str or float
        The text after the first ':' of the 'ensembl' entry, or ``np.nan``
        when the field contains no such entry. If several 'ensembl' entries
        are present, the last one wins (as with building a dict).
    """
    if not isinstance(x, str):
        return np.nan

    ensembl_id = np.nan
    for key_val_pair in x.split('|'):
        # Split on the first ':' only, so values that contain ':' themselves
        # (and entries without any ':') no longer invalidate the whole field.
        key, sep, value = key_val_pair.partition(':')
        if sep and key == 'ensembl':
            ensembl_id = value
    return ensembl_id

In [4]:
# Pull the Ensembl protein ID out of columns 2 and 3 (one per interactor).
inbio_df['0_Ensembl_Protein'] = inbio_df[2].map(extract_ensembl_id)
inbio_df['1_Ensembl_Protein'] = inbio_df[3].map(extract_ensembl_id)


In [5]:
def _score_component(field, position):
    """Return the `position`-th '|'-separated piece of `field` as a float.

    A piece equal to '-' is returned as NaN.
    """
    piece = field.split('|')[position]
    if piece == '-':
        return np.nan
    return float(piece)


# Column 14 holds two '|'-separated scores; store each in its own column.
inbio_df['score0'] = inbio_df[14].map(lambda field: _score_component(field, 0))
inbio_df['score1'] = inbio_df[14].map(lambda field: _score_component(field, 1))


In [6]:
# Keep only the protein-pair and score columns, dropping rows with any missing value.
protein_score_columns = ['0_Ensembl_Protein', '1_Ensembl_Protein', 'score0', 'score1']
inbio_df_scores = inbio_df[protein_score_columns].dropna()

In [7]:
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(inbio_df_scores.shape)
inbio_df_scores.head()


(495330, 4)


Unnamed: 0,0_Ensembl_Protein,1_Ensembl_Protein,score0,score1
1,ENSP00000405724,ENSP00000368040,0.155,0.0761
2,ENSP00000418783,ENSP00000266970,0.156,0.0783
3,ENSP00000418783,ENSP00000410561,0.157,0.0821
4,ENSP00000371214,ENSP00000441543,0.144,0.0494
5,ENSP00000371214,ENSP00000388058,0.141,0.041


## read biomart mapping table

In [8]:
# Gzipped, tab-separated mapping table stored in this notebook's public data directory.
biomart_path = os.path.join(
    public_data_dir,
    'HumanEnsembl.HumanEnsemblProtein.biomart93.map.gz',
)

# Rows with any missing value are discarded right after loading.
biomart_raw = pd.read_csv(biomart_path, sep='\t', compression='gzip')
biomart_df = biomart_raw.dropna()


In [9]:
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(biomart_df.shape)
biomart_df.head()

(107844, 2)


Unnamed: 0,Gene stable ID,Protein stable ID
50,ENSG00000257215,ENSP00000447114
149,ENSG00000273875,ENSP00000479374
202,ENSG00000282035,ENSP00000487458
292,ENSG00000276256,ENSP00000484596
300,ENSG00000275746,ENSP00000484904


## read UK Biobank annotation files

In [10]:
# The only annotation columns this cell keeps. Passing them as `usecols`
# stops pandas from parsing every other column of the file only to have
# them thrown away by the selection below.
ukbb_columns = ['CHROM', 'POS', 'REF', 'ALT', 'Gene']

ukbb_df=pd.read_csv(
    os.path.join(
        '/oak/stanford/groups/mrivas/private_data',
        'ukbb/array_annotation_500k',
        'bims_combined.vep.cf.tsv.gz'
    ),
    sep='\t',
    compression='gzip',
    usecols=ukbb_columns,
# `usecols` keeps the file's column order, so re-select to fix the order;
# rows with any missing value are then dropped.
)[ukbb_columns].dropna()


In [11]:
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(ukbb_df.shape)
ukbb_df.head()

(551600, 5)


Unnamed: 0,CHROM,POS,REF,ALT,Gene
0,1,723307,C,G,ENSG00000237491
1,1,727841,G,A,ENSG00000237491
2,1,729632,C,T,ENSG00000237491
3,1,752721,A,G,ENSG00000240453
4,1,754105,C,T,ENSG00000177757


## Map InWeb_InBio to Ensembl Genes

In [12]:
# Join the biomart table once per interactor. Both merges are called without
# `how`, so pandas' default join type applies. Because 'Gene stable ID' comes
# in twice, pandas suffixes it with _x (first merge) and _y (second merge).
with_first_gene = inbio_df_scores.merge(
    biomart_df,
    left_on='0_Ensembl_Protein',
    right_on='Protein stable ID',
)
with_both_genes = with_first_gene.merge(
    biomart_df,
    left_on='1_Ensembl_Protein',
    right_on='Protein stable ID',
)

# Keep only the gene pair and its two scores; drop rows with missing values.
gene_pair_columns = ['Gene stable ID_x', 'Gene stable ID_y', 'score0', 'score1']
inbio_EnsemblGenes_df = with_both_genes[gene_pair_columns].dropna()

In [13]:
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(inbio_EnsemblGenes_df.shape)
inbio_EnsemblGenes_df.head()

(486902, 4)


Unnamed: 0,Gene stable ID_x,Gene stable ID_y,score0,score1
0,ENSG00000144445,ENSG00000123240,0.155,0.0761
1,ENSG00000104365,ENSG00000123240,0.216,0.205
2,ENSG00000164405,ENSG00000123240,0.149,0.0622
3,ENSG00000036672,ENSG00000123240,0.184,0.143
4,ENSG00000135636,ENSG00000123240,0.153,0.0704


In [15]:
# Write the gene-level interaction table as a gzipped TSV without the index column.
inbio_ensembl_path = os.path.join(public_data_dir, 'InWeb_InBio.Ensembl.tsv.gz')
inbio_EnsemblGenes_df.to_csv(
    inbio_ensembl_path,
    sep='\t',
    compression='gzip',
    index=False,
)

### Create a dict from Ensembl Gene ID to list of SNPs

In [49]:
# Split each comma-separated 'Gene' entry into a list of gene IDs.
ukbb_df['Genes'] = ukbb_df['Gene'].str.split(',')

In [56]:
# Build one SNP label per row in the form '<CHROM>:<POS><REF>/<ALT>'.
ukbb_df['SNP'] = [
    '{}:{}{}/{}'.format(chrom, pos, ref, alt)
    for chrom, pos, ref, alt in zip(
        ukbb_df['CHROM'], ukbb_df['POS'], ukbb_df['REF'], ukbb_df['ALT']
    )
]

In [57]:
# Gene ID -> list of SNP labels. With defaultdict(list), looking up a key that
# is not present yet creates it with an empty list.
EnsemblGene2SNPs = collections.defaultdict(list)

In [58]:
# Register every SNP under each gene ID listed for it.
for gene_ids, snp_label in zip(ukbb_df['Genes'], ukbb_df['SNP']):
    for gene_id in gene_ids:
        EnsemblGene2SNPs[gene_id].append(snp_label)

In [59]:
# (number of gene keys, total number of SNP entries summed over all genes)
len(EnsemblGene2SNPs), np.sum([len(x) for x in EnsemblGene2SNPs.values()])

(39055, 590940)

In [60]:
# Spot-check: show the SNP list stored for one gene ID.
EnsemblGene2SNPs['ENSG00000237491']

['1:723307C/G', '1:727841G/A', '1:729632C/T']

In [61]:
# Attach the SNP list of each gene in the pair.
# Use .get() rather than indexing: indexing the defaultdict would insert an
# empty list for every gene that has no SNPs, silently growing the lookup
# table (and changing its reported size if earlier cells are re-run).
# Genes without SNPs still get an empty list in the column.
inbio_EnsemblGenes_df['SNPs_x']=inbio_EnsemblGenes_df['Gene stable ID_x'].map(
    lambda gene_id: EnsemblGene2SNPs.get(gene_id, [])
)
inbio_EnsemblGenes_df['SNPs_y']=inbio_EnsemblGenes_df['Gene stable ID_y'].map(
    lambda gene_id: EnsemblGene2SNPs.get(gene_id, [])
)


In [63]:
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(inbio_EnsemblGenes_df.shape)
inbio_EnsemblGenes_df.head()

(486902, 6)


Unnamed: 0,Gene stable ID_x,Gene stable ID_y,score0,score1,SNPs_x,SNPs_y
0,ENSG00000144445,ENSG00000123240,0.155,0.0761,"[2:210887737G/A, 2:210894606T/A, 2:210897931C/...","[10:13141608G/A, 10:13142087G/T, 10:13142251G/..."
1,ENSG00000104365,ENSG00000123240,0.216,0.205,"[8:42140549G/T, 8:42143087A/G, 8:42145305A/G, ...","[10:13141608G/A, 10:13142087G/T, 10:13142251G/..."
2,ENSG00000164405,ENSG00000123240,0.149,0.0622,"[5:132202539C/T, 5:132203057G/T, 5:132203272T/G]","[10:13141608G/A, 10:13142087G/T, 10:13142251G/..."
3,ENSG00000036672,ENSG00000123240,0.184,0.143,"[11:119227996T/A, 11:119228834C/T, 11:11922937...","[10:13141608G/A, 10:13142087G/T, 10:13142251G/..."
4,ENSG00000135636,ENSG00000123240,0.153,0.0704,"[2:71677701C/T, 2:71681139T/G, 2:71686337A/G, ...","[10:13141608G/A, 10:13142087G/T, 10:13142251G/..."


In [64]:
# Write the table with SNP lists first, as a gzipped TSV without the index column.
# NOTE(review): the SNPs_x / SNPs_y cells hold Python lists, so they are
# written as their string form — confirm downstream readers expect that.
snp_export_columns = [
    'SNPs_x', 'SNPs_y', 'Gene stable ID_x', 'Gene stable ID_y', 'score0', 'score1'
]
snp_export_path = os.path.join(public_data_dir, 'InWeb_InBio.Ensembl.SNPs.tsv.gz')
inbio_EnsemblGenes_df[snp_export_columns].to_csv(
    snp_export_path,
    sep='\t',
    compression='gzip',
    index=False,
)