In [2]:
# This script use ensemble api to check overlapping genes in the in the MutSig output
# What really checked is the exact mutation that overlap with other genes
# Author: Yiyun
import json
from os.path import join, isdir, exists
from os import listdir
import requests, sys
import time
import pandas as pd
from multiprocessing import Pool
import pickle
import mmap
from tqdm import tqdm

server = "https://grch37.rest.ensembl.org"

In [3]:
nsyn_dir = '../mutsig_out/nsyn/out10212020'
syn_dir = '../mutsig_out/syn/out10212020'
org_dir = '../org_cov_maf'
outdir = '../proc_09152020/dovgene'

In [4]:
# From forum https://stackoverflow.com/questions/667508/whats-a-good-rate-limiting-algorithm
def RateLimited(maxPerSecond):
    minInterval = 1.0 / float(maxPerSecond)
    def decorate(func):
        lastTimeCalled = [0.0]
        def rateLimitedFunction(*args,**kargs):
            elapsed = time.clock() - lastTimeCalled[0]
            leftToWait = minInterval - elapsed
            if leftToWait>0:
                time.sleep(leftToWait)
            ret = func(*args,**kargs)
            lastTimeCalled[0] = time.clock()
            return ret
        return rateLimitedFunction
    return decorate

In [5]:
# send the query to ensembl by position and variant, currently only work for SNP
# return a dictionary of organ-gene-mutation-overlap gene-consequence
@RateLimited(12)
def get_overlapping_cons(muts):
    ext = '/vep/human/region/'+muts+'/?'
    r = requests.get(server+ext,headers={"Content-Type" : "application/json"})

    if not r.ok:
      r.raise_for_status()
      sys.exit()

    decoded = r.json()
    
    return decoded[0]['transcript_consequences']

In [6]:
def progress(n):
    if n%500==0:
        print "finish reading "+ str(n)+ ' mutations'

In [20]:
def get_targe_gene(org):
    organ_dir = join(syn_dir,org)
    for f in listdir(organ_dir):
        if f.endswith('sig_genes.txt'):
            f_dir = join(organ_dir,f)
            df_org = pd.read_csv(f_dir,sep = '\t', index_col=None)
            lg = df_org['gene'].tolist()
    
    # divide genelist into 3 list
    chunks = [lg[x] for x in xrange(0, len(lg), 200)]
    
#     tc = chunks[0]
#     bc = chunks[-1]
#     mc = chunks[int(len(chunks)/2)]
    
    return chunks

In [21]:
# This function get a temporaty maf file
def get_variant_maf(org,lgene):
    
    dmut = {}
    
    maf = join(org_dir,org,'merged_tumor_covmerged_'+org+'.txt_filtered.maf')

    with open(maf,'r+b') as f:
        next(f)
        mmaf = mmap.mmap(f.fileno(), 0)
        for lines in iter(mmaf.readline, ""): 
            name,chrom,startp,endp,strand,vc,vt, ref, tum1, tum2= lines.split('\t')[0:10]
        # get variant
            if any(name in l for l in lgene) and vt == 'SNP' and vc != 'RNA' and vc!= 'lincRNA' and name != 'Unknown':
                if tum1 == ref and tum2 != ref:
                    var = tum2
                elif tum2 == ref and tum1 != ref:
                    var = tum1
                elif tum2 != ref and tum1 != ref:
                    var = tum1
                else:
                    print "NO VARIANT" + name +'\t'+ starp
                    continue
                    
                mut = str(chrom)+':'+str(startp)+'-'+str(startp)+'/'+str(var)
                if name not in dmut:
                    dmut[name] = {}
                if mut not in dmut[name]:
                    dmut[name][mut] = {}
                if vc not in dmut[name][mut]:
                    dmut[name][mut] = {}
                    dmut[name][mut]['vc'] = vc
                
        mmaf.close()
        
    pickle.dump(dmut, open(org+'_tmp.pkl','wb'))
    
    return dmut

In [22]:
# this function go through maf file and get potential overlapping genes for mutation
def get_variant(org):
    print 'Start: '+org
    global org_dir, outdir
    # a numerator for mutations
    n = 0
    
    listgene = get_targe_gene(org)
    dict_mut = get_variant_maf(org,listgene)
    
    # open maf file and get variant
    outf = join(outdir,org+'_102120.tsv')
    lf = []
    if exists(outf):
        with open(outf,'r') as fo:
            for lines in fo:
                gpos = lines.split('\t')[1].strip('\n')
                lf.append(gpos)
    else:
        fo = open(outf,'wb')
        fo.close()
            
    for gene in dict_mut: 
        for mut in dict_mut[gene]:
            n+=1
            progress(n)
            
            if mut in lf:
                continue

            # ------send to ensembl api ------

            res = get_overlapping_cons(mut)
            for cons in res:
                # if it's not the target gene, append to dictionary
                if cons['gene_symbol'] != gene: 
                    ovpgene = cons['gene_symbol']

                    with open(outf, 'a') as of:
                         of.write(gene+'\t'+mut+'\t'+str(dict_mut[gene][mut]['vc'])+'\t'+ovpgene+'\t'+str(cons['consequence_terms'])+'\n')
                else:
                    with open(outf, 'a') as of:
                         of.write(gene+'\t'+mut+'\n')

    
    print 'Finish: '+ org

In [24]:
lorg = listdir(org_dir)
lorg = [l for l in lorg if not l.startswith('.')]
print lorg
p = Pool(4)
p.map(get_variant, lorg)
p.join

['Ovary', 'Thy', 'Biliary', 'Breast', 'Panc', 'Lung', 'Liver', 'CNS', 'Lymph', 'Bone', 'Eso', 'ColoRect', 'Kidney', 'Myeloid', 'Stomach', 'Prost', 'Skin', 'Head', 'Uterus', 'SoftTissue', 'Cervix', 'Bladder']
Start: Ovary
Start: Liver
Start: Panc
Start: Biliary
finish reading 500 mutations
finish reading 1000 mutations
finish reading 1500 mutations
Finish: Biliary
Start: Breast
finish reading 500 mutations
finish reading 1000 mutations
finish reading 1500 mutations
Finish: Ovary
Start: Thy
Finish: Thy
Start: Lymph
finish reading 500 mutations
finish reading 1000 mutations
finish reading 1500 mutations
finish reading 2000 mutations
finish reading 2500 mutations
finish reading 3000 mutations
finish reading 3500 mutations
Finish: Panc
Start: Lung
finish reading 500 mutations
finish reading 1000 mutations
finish reading 1500 mutations
finish reading 2000 mutations
finish reading 2500 mutations
finish reading 3000 mutations
finish reading 3500 mutations
finish reading 4000 mutations
finish r

OSError: [Errno 2] No such file or directory: '../mutsig_out/syn/out10212020/SoftTissue'

finish reading 500 mutations
finish reading 1000 mutations
finish reading 1500 mutations
Finish: Cervix
Start: Bladder
finish reading 500 mutations
finish reading 1000 mutations
finish reading 1500 mutations
Finish: Bladder
finish reading 500 mutations
finish reading 1000 mutations
finish reading 1500 mutations
finish reading 2000 mutations
finish reading 2500 mutations
finish reading 3000 mutations
finish reading 3500 mutations
finish reading 500 mutations
finish reading 1000 mutations
finish reading 1500 mutations
finish reading 2000 mutations
finish reading 2500 mutations
finish reading 3000 mutations
finish reading 3500 mutations
finish reading 4000 mutations
finish reading 4500 mutations
finish reading 5000 mutations
finish reading 5500 mutations
finish reading 6000 mutations
finish reading 4000 mutations
finish reading 6500 mutations
Finish: Prost
finish reading 7000 mutations
finish reading 4500 mutations
finish reading 7500 mutations
