## Ensembl Gene ID to Gene symbol mapping

#### 2019/4/9 Yosuke Tanigawa (ytanigaw@stanford.edu)

We use both Ensembl BioMart exports and the MyGene.info API (https://mygene.info/) to map Ensembl gene IDs to gene symbols.


In [2]:
import pandas as pd 
import numpy as np


In [3]:
import mygene 

In [4]:
import itertools as it

In [10]:
import collections

In [5]:
# Input variant table; it is read below as a gzip-compressed, tab-separated file.
# NOTE(review): absolute, cluster-specific path -- the notebook is not portable as written.
in_f = '/oak/stanford/groups/mrivas/private_data/ukbb/variant_filtering/variant_filter_table.old.tsv.gz'


#### read list of Ensembl IDs

In [6]:
# Load only the 'Gene' column of the variant table. The column is selected by
# name rather than by position so a change in column order cannot silently
# load the wrong field (the next cell already accesses it as 'Gene').
df_Ensembls = pd.read_csv(
    in_f, compression='gzip', usecols=['Gene'], sep='\t'
)

In [7]:
# Collect the distinct Ensembl gene IDs. A single 'Gene' entry may hold
# several comma-separated IDs, so each entry is split before de-duplication.
ensembl_ids = {
    gene_id
    for gene_field in set(df_Ensembls['Gene'].dropna())
    for gene_id in str(gene_field).split(',')
}

In [8]:
# Number of distinct Ensembl gene IDs found in the variant table.
len(ensembl_ids)

39055

In [9]:
# Peek at ten of the IDs; ensembl_ids is a set, so which ten appear is arbitrary.
list(ensembl_ids)[:10]

['ENSG00000146281',
 'ENSG00000265589',
 'ENSG00000225359',
 'ENSG00000250561',
 'ENSG00000252687',
 'ENSG00000234457',
 'ENSG00000198783',
 'ENSG00000221756',
 'ENSG00000169495',
 'ENSG00000259081']

#### Read mapping files (exported from Ensembl BioMart)

In [16]:
def mart_export_to_dict(file):
    """Read a two-column, tab-separated export into an ID -> symbol dict.

    The first line of the file is skipped (treated as a header) and the two
    columns are read as 'ensembl.gene' and 'symbol'.

    Parameters
    ----------
    file : str or file-like
        Path (or buffer) accepted by ``pandas.read_csv``.

    Returns
    -------
    dict
        Maps each value of the first column to the value of the second
        column. Rows with a missing ID or symbol are omitted. If an ID occurs
        more than once, the last row wins.
    """
    mart_df = pd.read_csv(
        file,
        sep='\t',
        skiprows=1,
        names=['ensembl.gene', 'symbol'],
    )
    # An empty field is parsed as NaN. Keeping such rows would put NaN values
    # in the mapping, which then overwrite valid symbols when the dict is
    # merged into another one and break ','.join() on the looked-up symbols.
    mart_df = mart_df.dropna(subset=['ensembl.gene', 'symbol'])
    return dict(zip(mart_df['ensembl.gene'], mart_df['symbol']))

In [17]:
# Load one ID -> symbol mapping per export file. Judging by the file names
# these are Ensembl 54 / NCBI36, Ensembl 97 / GRCh37 and Ensembl 97
# (presumably GRCh38 -- confirm against how the exports were generated).
biomart_hg18 = mart_export_to_dict('mart_export_ensembl54_NCBI36.txt')
biomart_hg19 = mart_export_to_dict('mart_export_ensembl97_GRCh37.txt')
biomart_hg38 = mart_export_to_dict('mart_export_ensembl97.txt')

In [18]:
# Number of entries in each of the three mappings, in hg18 / hg19 / hg38 order.
tuple(len(mapping) for mapping in (biomart_hg18, biomart_hg19, biomart_hg38))

(37435, 63677, 65065)

#### Apply MyGene 

In [19]:
# Client object used for the MyGene query in the next cell.
mg = mygene.MyGeneInfo()

In [20]:
# Query MyGene for every collected Ensembl ID, requesting only the
# 'ensembl.gene' and 'symbol' fields, and get the result as a DataFrame.
df = mg.getgenes(
    list(ensembl_ids),
    fields='ensembl.gene,symbol',
    species='human',
    as_dataframe=True,
    df_index=False,
)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [21]:
# Keep only the rows MyGene resolved and write the ID / symbol pairs to disk.
# The 'notfound' column is not guaranteed to exist (presumably it is only
# produced when at least one ID is unresolved -- confirm against the mygene
# docs), so fall back to keeping every row instead of raising KeyError.
if 'notfound' in df.columns:
    found_mask = df['notfound'].ne(True)
else:
    found_mask = pd.Series(True, index=df.index)

df.loc[found_mask, ['ensembl.gene', 'symbol']].to_csv(
    'mygene.human.ensembl_ID.gene_symbol.tsv', sep='\t', index=False
)

In [24]:
!gzip -9 mygene.human.ensembl_ID.gene_symbol.tsv

In [26]:
# Read the compressed MyGene ID / symbol table back from disk.
gene_id_mapping_df = pd.read_csv(
    'mygene.human.ensembl_ID.gene_symbol.tsv.gz',
    compression='gzip',
    sep='\t',
)

In [38]:
# Build the ID -> symbol lookup from the MyGene table (for a repeated ID the
# last row wins).
gene_id_map_dict = {
    ensembl_id: symbol
    for ensembl_id, symbol in zip(
        gene_id_mapping_df['ensembl.gene'],
        gene_id_mapping_df['symbol'],
    )
}

In [39]:
# Merge the BioMart mappings into the MyGene-based lookup, printing the size
# after each step. dict.update overwrites existing keys, so a later source
# takes precedence over an earlier one (hg38 over hg19 over hg18 over MyGene).
print(len(gene_id_map_dict))
for biomart_map in (biomart_hg18, biomart_hg19, biomart_hg38):
    gene_id_map_dict.update(biomart_map)
    print(len(gene_id_map_dict))

35632
53934
74686
85096


### process actual files

In [40]:
# Load the variant ID and its gene annotation. Columns are selected by name
# rather than by position so a change in column order cannot silently load
# the wrong fields (the following cells already access them as 'ID' / 'Gene').
df_new = pd.read_csv(
    in_f, compression='gzip', usecols=['ID', 'Gene'], sep='\t'
)

In [41]:
# Split each comma-separated 'Gene' entry into a list of Ensembl IDs.
# A missing entry is stringified first, so it becomes the one-element list ['nan'].
df_new['Genes'] = [
    str(gene_field).split(',') for gene_field in df_new['Gene']
]

In [42]:
# Preview the first rows, including the new 'Genes' list column.
df_new.head()

Unnamed: 0,ID,Gene,Genes
0,rs28659788,ENSG00000237491,[ENSG00000237491]
1,rs116587930,ENSG00000237491,[ENSG00000237491]
2,rs116720794,ENSG00000237491,[ENSG00000237491]
3,rs3131972,ENSG00000240453,[ENSG00000240453]
4,rs12184325,ENSG00000177757,[ENSG00000177757]


In [43]:
def _lookup_symbols(gene_ids):
    """Return the symbols for the given IDs, skipping IDs absent from gene_id_map_dict."""
    return [
        gene_id_map_dict[gene_id]
        for gene_id in gene_ids
        if gene_id in gene_id_map_dict
    ]


df_new['Gene_symbols'] = df_new['Genes'].map(_lookup_symbols)

In [44]:
# Collapse each list of symbols into one comma-separated string; a row with
# no mapped symbol gets an empty string.
df_new['Gene_symbol'] = df_new['Gene_symbols'].map(','.join)

In [45]:
# Preview the first rows with the mapped gene symbols.
df_new.head()

Unnamed: 0,ID,Gene,Genes,Gene_symbols,Gene_symbol
0,rs28659788,ENSG00000237491,[ENSG00000237491],[AL669831.5],AL669831.5
1,rs116587930,ENSG00000237491,[ENSG00000237491],[AL669831.5],AL669831.5
2,rs116720794,ENSG00000237491,[ENSG00000237491],[AL669831.5],AL669831.5
3,rs3131972,ENSG00000240453,[ENSG00000240453],[RP11-206L10.10],RP11-206L10.10
4,rs12184325,ENSG00000177757,[ENSG00000177757],[FAM87B],FAM87B


In [46]:
# (rows, columns) of the annotated table.
df_new.shape

(784256, 5)

In [47]:
# Number of distinct values in the 'ID' column; it equals the row count when every ID is unique.
len(set(df_new['ID']))

784256

In [52]:
# Sanity check: rows that have a 'Gene' annotation but ended up with an empty
# 'Gene_symbol'. An empty result means every annotated variant was mapped.
has_no_symbol = df_new['Gene_symbol'].map(len).eq(0)
df_new[has_no_symbol].dropna(subset=['Gene'])

Unnamed: 0,ID,Gene,Genes,Gene_symbols,Gene_symbol


In [54]:
# Write the variant ID, the original Ensembl annotation and the mapped
# symbols as a tab-separated file without the index column.
output_columns = ['ID', 'Gene', 'Gene_symbol']
df_new[output_columns].to_csv(
    'variant_id_to_gene_symbol.tsv',
    sep='\t',
    index=False,
)

In [56]:
!gzip -9 variant_id_to_gene_symbol.tsv