In [None]:
## 
http://grch37.ensembl.org/biomart



In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib, collections, itertools, os, re, textwrap, logging
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from functools import reduce

from logging.config import dictConfig
from logging import getLogger

dictConfig(dict(
    version = 1,
    formatters = {'f': {'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'}},
    handlers = {
        'h': {'class': 'logging.StreamHandler','formatter': 'f',
              'level': logging.DEBUG}},
    root = {'handlers': ['h'], 'level': logging.DEBUG,},
))

matplotlib.rc('font',**{'size':16, 'family':'sans-serif','sans-serif':['HelveticaNeue', 'Helvetica']})

logger = getLogger('notebook')

repo_dir=os.path.realpath(
    os.path.join('..', '..')
)
private_data_dir=os.path.realpath(
    os.path.join(repo_dir, 'private_data', os.path.basename(os.path.realpath(os.getcwd()))))
public_data_dir=os.path.realpath(
    os.path.join(repo_dir, 'public_data', os.path.basename(os.path.realpath(os.getcwd()))))


## read InBio dataset

In [57]:
inbio_df = pd.read_csv(
    os.path.join(
        '/oak/stanford/groups/mrivas/public_data',
        'InWeb_InBioMap_2016_09_12/downloads',
        'InBio_Map_core_2016_09_12/core.psimitab.gz'
    ),
    sep='\t',
    compression='gzip',
    names=np.arange(16)
)


In [58]:
inbio_df['0_Uniprot'] = inbio_df[0].map(lambda x: x[10:])
inbio_df['1_Uniprot'] = inbio_df[1].map(lambda x: x[10:])


In [61]:
np.nan

nan

In [62]:
inbio_df['score0'] = inbio_df[14].map(
    lambda x: np.nan if x.split('|')[0] == '-' else float(x.split('|')[0])
)
inbio_df['score1'] = inbio_df[14].map(
    lambda x: np.nan if x.split('|')[1] == '-' else float(x.split('|')[1])
)


In [63]:
inbio_df_scores = inbio_df[['0_Uniprot', '1_Uniprot', 'score0', 'score1']]

In [64]:
print(inbio_df_scores.shape)
inbio_df_scores.head()


(625641, 4)


Unnamed: 0,0_Uniprot,1_Uniprot,score0,score1
0,A0A5B9,P01892,0.417,0.458
1,A0AUZ9,Q96CV9,0.155,0.0761
2,A0AV02,P24941,0.156,0.0783
3,A0AV02,Q00526,0.157,0.0821
4,A0AV96,P0CG48,0.144,0.0494


## read biomart mapping table

In [65]:
biomart_df=pd.read_csv(
    os.path.join(
        private_data_dir,
        'HumanEnsemblID.HumanGeneSymbol.HumanUniProt.biomart.grch37.tsv.gz',
    ),
    sep='\t',
    compression='gzip',
)


In [66]:
biomart_df_noNAs = biomart_df[['Gene stable ID', 'UniProtKB/TrEMBL ID']].dropna()


In [67]:
print(biomart_df_noNAs.shape)
biomart_df_noNAs.head()

(60950, 2)


Unnamed: 0,Gene stable ID,UniProtKB/TrEMBL ID
0,ENSG00000261657,F8WAB8
4,ENSG00000261125,H7C295
5,ENSG00000261125,H7BZC9
7,ENSG00000261577,Q7KZ86
8,ENSG00000261577,K7ERP3


## read UK Biobank annotation files

In [68]:
ukbb_df=pd.read_csv(
    os.path.join(
        '/oak/stanford/groups/mrivas/private_data',
        'ukbb/array_annotation_500k',
        'bims_combined.vep.cf.tsv.gz'
    ),
    sep='\t',
    compression='gzip',
)


In [69]:
ukbb_df2 = ukbb_df[['CHROM', 'POS', 'REF', 'ALT', 'Gene']].dropna()

In [70]:
print(ukbb_df2.shape)
ukbb_df2.head()

(551600, 5)


Unnamed: 0,CHROM,POS,REF,ALT,Gene
0,1,723307,C,G,ENSG00000237491
1,1,727841,G,A,ENSG00000237491
2,1,729632,C,T,ENSG00000237491
3,1,752721,A,G,ENSG00000240453
4,1,754105,C,T,ENSG00000177757


## join tables

In [103]:
import mygene

mg = mygene.MyGeneInfo()

ModuleNotFoundError: No module named 'mygene'

In [99]:
inbio_df_scores.merge(
    biomart_df_noNAs,
    left_on='0_Uniprot',
    right_on='UniProtKB/TrEMBL ID'
)

Unnamed: 0,0_Uniprot,1_Uniprot,score0,score1,Gene stable ID,UniProtKB/TrEMBL ID
0,C9J7I0,P0CG48,0.145,0.0496,ENSG00000219545,C9J7I0
1,C9J7I0,Q53EZ4,0.142,0.0430,ENSG00000219545,C9J7I0
2,C9J7I0,Q92734,0.142,0.0419,ENSG00000219545,C9J7I0
3,C9J7I0,Q9H3S7,0.142,0.0429,ENSG00000219545,C9J7I0
4,C9J7I0,Q9UK41,0.142,0.0426,ENSG00000219545,C9J7I0
5,E9PAV3,O00444,0.145,0.0512,ENSG00000196531,E9PAV3
6,E9PAV3,O15205,0.155,0.0770,ENSG00000196531,E9PAV3
7,E9PAV3,O60216,0.152,0.0687,ENSG00000196531,E9PAV3
8,E9PAV3,O60260,0.151,0.0673,ENSG00000196531,E9PAV3
9,E9PAV3,O60356,0.166,0.1030,ENSG00000196531,E9PAV3


In [80]:
ukbb_UniProt_df = ukbb_df2.merge(
    biomart_df_noNAs,
    left_on='Gene',
    right_on='Gene stable ID',
)[
    list(ukbb_df2.columns) + ['UniProtKB/TrEMBL ID']
]

In [81]:
print(ukbb_UniProt_df.shape)
ukbb_UniProt_df.head()

(1531893, 6)


Unnamed: 0,CHROM,POS,REF,ALT,Gene,UniProtKB/TrEMBL ID
0,1,865625,G,A,ENSG00000187634,Q5SV95
1,1,865625,G,A,ENSG00000187634,I7FV93
2,1,865625,G,A,ENSG00000187634,A6PWC8
3,1,865628,G,A,ENSG00000187634,Q5SV95
4,1,865628,G,A,ENSG00000187634,I7FV93


In [88]:
inbio_df_mapped0 = inbio_df_scores.merge(
    ukbb_UniProt_df,
    left_on='0_Uniprot',
    right_on='UniProtKB/TrEMBL ID'
)

In [92]:
print(inbio_df_mapped0.shape)
inbio_df_mapped0.head()

(1792, 10)


Unnamed: 0,0_Uniprot,1_Uniprot,score0,score1,CHROM,POS,REF,ALT,Gene,UniProtKB/TrEMBL ID
0,C9J7I0,P0CG48,0.145,0.0496,7,7773926,C,T,ENSG00000219545,C9J7I0
1,C9J7I0,P0CG48,0.145,0.0496,7,7777191,C,T,ENSG00000219545,C9J7I0
2,C9J7I0,P0CG48,0.145,0.0496,7,7782919,G,A,ENSG00000219545,C9J7I0
3,C9J7I0,P0CG48,0.145,0.0496,7,7785040,C,T,ENSG00000219545,C9J7I0
4,C9J7I0,P0CG48,0.145,0.0496,7,7787006,G,A,ENSG00000219545,C9J7I0


In [93]:
inbio_df_mapped1 = inbio_df_mapped0.merge(
    ukbb_UniProt_df,
    left_on='1_Uniprot',
    right_on='UniProtKB/TrEMBL ID'
)

In [94]:
inbio_df_mapped1

Unnamed: 0,0_Uniprot,1_Uniprot,score0,score1,CHROM_x,POS_x,REF_x,ALT_x,Gene_x,UniProtKB/TrEMBL ID_x,CHROM_y,POS_y,REF_y,ALT_y,Gene_y,UniProtKB/TrEMBL ID_y


In [96]:
set(inbio_df_mapped0['0_Uniprot'])

{'C9J7I0', 'E9PAV3', 'E9PRG8'}

Unnamed: 0,0_Uniprot,1_Uniprot,score0,score1
0,A0A5B9,P01892,0.417,0.4580
1,A0AUZ9,Q96CV9,0.155,0.0761
2,A0AV02,P24941,0.156,0.0783
3,A0AV02,Q00526,0.157,0.0821
4,A0AV96,P0CG48,0.144,0.0494
5,A0AV96,Q13315,0.141,0.0410
6,A0AV96,Q15717,0.147,0.0572
7,A0AV96,Q6P1W5,0.143,0.0454
8,A0AV96,Q92731,0.155,0.0755
9,A0AV96,Q96LI6,0.143,0.0468
