In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# wrap it into a function
def addTssDistance2eQTMwithZscoreFile(eQTM_path,tss_path,save_path = None):
    """Annotate an eQTM table with per-gene TSS position and distance to it.

    The GTF at ``tss_path`` is collapsed to one row per gene. The gene key is
    the value of the *first* attribute in the GTF attribute column (the
    ``gene_id`` value when the attribute string starts with ``gene_id "..."``).
    The TSS of a gene is its largest ``endSite`` on the ``-`` strand and its
    smallest ``startSite`` otherwise. Four columns are then added to the eQTM
    table, in this order: ``TssSite``, ``chr``, ``TssDistance``
    (``abs(SNPChrPos - TssSite)``) and ``checkChr`` (whether the gene's
    chromosome equals ``SNPChr`` when both are compared as strings).

    Parameters
    ----------
    eQTM_path : str
        Tab-separated file with a header row containing at least the columns
        ``ProbeName`` (gene key), ``SNPChr`` and ``SNPChrPos``.
    tss_path : str
        Header-less, tab-separated, nine-column GTF file.
    save_path : str, optional
        If given (and truthy), the annotated table is written there as CSV
        without the index and a confirmation line is printed.

    Returns
    -------
    pandas.DataFrame
        The annotated eQTM table. (The original implementation returned
        ``None``; in a notebook end the call with ``;`` to suppress display.)

    Raises
    ------
    KeyError
        If a ``ProbeName`` value has no matching gene in the GTF.
    ValueError
        If a gene referenced by the eQTM table is annotated on more than one
        chromosome or strand, so that its TSS would be ambiguous.
    AssertionError
        If any row's gene chromosome differs from its ``SNPChr``.
    """
    # read tss file from tss_path
    colnames = ['chr','regionFunction','regionType','startSite',
                'endSite','score','strand','sthunknown','geneInfo']
    dtype = {'chr':object,'regionFunction':object,'regionType':object,
             'startSite':int,'endSite':int,'score':object,
             'strand':object,'sthunknown':object,'geneInfo':object}
    tss_raw = pd.read_csv(tss_path,sep='\t',header=None,names=colnames,dtype=dtype)

    # reading the eQTMs
    eQTMs = pd.read_csv(eQTM_path,sep='\t')

    # extract the gene key (value of the first attribute) from geneInfo
    def findGeneName(item):
        first_attribute = list(filter(None,item.strip().split(";")))[0]
        return first_attribute.replace('"','').strip().split(' ')[1]
    tss_raw['geneName'] = tss_raw['geneInfo'].apply(findGeneName)

    # One row per gene. String aggregators are used instead of
    # `lambda x: x.unique()` / `np.min`: the lambdas can leave array objects
    # in the cells (which breaks the string comparison of chromosomes below)
    # and passing numpy callables to `agg` is deprecated.
    genes = tss_raw.groupby('geneName')
    groupbyTss = genes.agg({
        'chr':'first',
        'startSite':'min',
        'endSite':'max',
        'strand':'first'
    })
    # '-' strand genes start at their largest coordinate
    groupbyTss['TssSite'] = np.where(groupbyTss['strand'] == '-',
                                     groupbyTss['endSite'],
                                     groupbyTss['startSite'])

    # 'first' is only meaningful when a gene has a single chromosome/strand;
    # remember the genes for which that does not hold.
    variants = genes[['chr','strand']].nunique()
    ambiguous = variants.index[(variants > 1).any(axis=1)]

    probes = eQTMs['ProbeName']
    unknown = probes[~probes.isin(groupbyTss.index)].unique()
    if len(unknown):
        raise KeyError('%d ProbeName value(s) not found in the TSS file, e.g. %s'
                       % (len(unknown), list(unknown[:5])))
    conflicted = probes[probes.isin(ambiguous)].unique()
    if len(conflicted):
        raise ValueError('%d gene(s) are annotated on several chromosomes or '
                         'strands, e.g. %s' % (len(conflicted), list(conflicted[:5])))

    # add tss sites and tss distance to the eqtm file (vectorised lookups
    # instead of one `.loc` call per row)
    eQTMs['TssSite'] = probes.map(groupbyTss['TssSite'])
    eQTMs['chr'] = probes.map(groupbyTss['chr'])
    eQTMs['TssDistance'] = (eQTMs['SNPChrPos'] - eQTMs['TssSite']).abs()
    eQTMs['checkChr'] = eQTMs['chr'].astype(str) == eQTMs['SNPChr'].astype(str)

    # check whether they are from the same chromosome; testing
    # `len(unique()) == 1` would also pass when *every* row mismatches.
    if not eQTMs['checkChr'].all():
        n_bad = int((~eQTMs['checkChr']).sum())
        raise AssertionError('%d eQTM row(s) have a gene chromosome that '
                             'differs from SNPChr' % n_bad)

    if save_path:
        # save the eQTM file
        eQTMs.to_csv(save_path,index=False)
        print('Saved eQTM file to: ',save_path)

    return eQTMs

In [3]:
# Load the gene annotation (GTF) that the TSS positions are derived from
PROJECT_DIR = '/home/shuang/projects/eqtm'
tss_raw_path = os.path.join(
    PROJECT_DIR, 'data', 'TSSdistance', 'Homo_sapiens.GRCh37.71.gtf'
)

# The file is read without a header row, so the nine columns are named here
colnames = [
    'chr', 'regionFunction', 'regionType',
    'startSite', 'endSite', 'score',
    'strand', 'sthunknown', 'geneInfo',
]
# Everything is kept as text except the two coordinate columns
dtype = {column: object for column in colnames}
dtype['startSite'] = int
dtype['endSite'] = int

tss_raw = pd.read_csv(
    tss_raw_path,
    sep='\t',
    header=None,
    names=colnames,
    dtype=dtype,
)

In [4]:
# Annotate the random20k_gt0.5 eQTM file (stored under the boxy_eqtm project)
eQTM_path = '/home/shuang/projects/boxy_eqtm/data/eqtmZscores/fdr_gt0.05/random20k_gt0.5.txt'
save_path = '/home/shuang/projects/boxy_eqtm/data/eqtmZscores/fdr_gt0.05/random20k_gt0.5_withTss.txt'
addTssDistance2eQTMwithZscoreFile(
    eQTM_path=eQTM_path,
    tss_path=tss_raw_path,
    save_path=save_path,
)

Saved eQTM file to:  /home/shuang/projects/boxy_eqtm/data/eqtmZscores/fdr_gt0.05/random20k_gt0.5_withTss.txt


In [4]:
# Annotate the et0.0 eQTM file and write the result to data/output
tss_raw_path = os.path.join(
    PROJECT_DIR, 'data', 'TSSdistance', 'Homo_sapiens.GRCh37.71.gtf'
)
eQTM_path = os.path.join(
    PROJECT_DIR, 'data', 'eqtmZscores', '2017-12-09-eQTLsFDR-et0.0-flipped.txt'
)
save_path = os.path.join(
    PROJECT_DIR, 'data', 'output', 'et0.0-eQTMwithZscoreTssDistance.csv'
)
addTssDistance2eQTMwithZscoreFile(
    eQTM_path=eQTM_path,
    tss_path=tss_raw_path,
    save_path=save_path,
)

Saved eQTM file to:  /home/shuang/projects/eqtm/data/output/et0.0-eQTMwithZscoreTssDistance.csv


In [190]:
# Annotate the gt0.0 eQTM file and write the result to data/output
tss_raw_path = os.path.join(
    PROJECT_DIR, 'data', 'TSSdistance', 'Homo_sapiens.GRCh37.71.gtf'
)
eQTM_path = os.path.join(
    PROJECT_DIR, 'data', 'eqtmZscores', '2017-12-09-eQTLsFDR-gt0.0-flipped.txt'
)
save_path = os.path.join(
    PROJECT_DIR, 'data', 'output', 'gt0.0-eQTMwithZscoreTssDistance.csv'
)
addTssDistance2eQTMwithZscoreFile(
    eQTM_path=eQTM_path,
    tss_path=tss_raw_path,
    save_path=save_path,
)

Saved eQTM file to:  /home/shuang/projects/eqtm/data/output/gt0.0-eQTMwithZscoreTssDistance.csv


In [6]:
# Annotate the random20000 gt0.05 eQTM file and write the result to data/output
tss_raw_path = os.path.join(
    PROJECT_DIR, 'data', 'TSSdistance', 'Homo_sapiens.GRCh37.71.gtf'
)
eQTM_path = os.path.join(
    PROJECT_DIR, 'data', 'eqtmZscores', 'random20000-eQTLsFDR-gt0.05-flipped.txt'
)
save_path = os.path.join(
    PROJECT_DIR, 'data', 'output', 'gt0.05-eQTMwithZscoreTssDistance.csv'
)
addTssDistance2eQTMwithZscoreFile(
    eQTM_path=eQTM_path,
    tss_path=tss_raw_path,
    save_path=save_path,
)

Saved eQTM file to:  /home/shuang/projects/eqtm/data/output/gt0.05-eQTMwithZscoreTssDistance.csv


# Test phase: step-by-step prototype of the logic wrapped in `addTssDistance2eQTMwithZscoreFile`

In [61]:
# Load the et0.0 eQTM table (tab-separated, with a header row) and preview it
eQTM_path = os.path.join(
    PROJECT_DIR, 'data', 'eqtmZscores',
    '2017-12-09-eQTLsFDR-et0.0-flipped.txt',
)
eQTMs = pd.read_csv(filepath_or_buffer=eQTM_path, sep='\t')
eQTMs.head(n=5)

Unnamed: 0,PValue,SNPName,SNPChr,SNPChrPos,ProbeName,ProbeChr,ProbeCenterChrPos,CisTrans,SNPType,AlleleAssessed,...,DatasetsZScores,DatasetsNrSamples,IncludedDatasetsMeanProbeExpression,IncludedDatasetsProbeExpressionVariance,HGNCName,IncludedDatasetsCorrelationCoefficient,Meta-Beta (SE),Beta (SE),FoldChange,FDR
0,3.27167e-310,cg07772999,2,113993052,ENSG00000189223,2,113996843,cis,C/T,C,...,13.3954353;26.201855;19.4459641;10.3181315;7.7...,554;741;732;429;263;186,0.0;0.0;0.0;0.0;0.0;0.0,25622.5;45818.5;44713.0;15372.5;5786.0;2898.5,AC016683.6,0.5270649;0.7784176;0.6356204;0.4698918;0.4564...,0E0 (0E0),0.5270649 (0.036171);0.7784176 (0.023092);0.63...,0;0;0;0;0;0,0.0
1,3.27167e-310,cg03085549,16,75150819,ENSG00000166816,16,75148213,cis,C/T,C,...,15.8873241;22.8059151;20.1369623;13.1688411;8....,554;741;732;429;263;186,0.0;0.0;0.0;0.0;0.0;0.0,25622.5;45818.5;44713.0;15372.5;5786.0;2898.5,LDHD,0.6057864;0.7109548;0.6530147;0.5783938;0.4695...,0E0 (0E0),0.6057864 (0.0338641);0.7109548 (0.025869);0.6...,0;0;0;0;0;0,0.0
2,3.27167e-310,cg24429836,16,75150744,ENSG00000166816,16,75148213,cis,C/T,C,...,15.6887132;23.7103174;21.6281806;12.4772442;8....,554;741;732;429;263;186,0.0;0.0;0.0;0.0;0.0;0.0,25622.5;45818.5;44713.0;15372.5;5786.0;2898.5,LDHD,0.5995889;0.7296311;0.6878608;0.5530699;0.4897...,0E0 (0E0),0.5995889 (0.0340634);0.7296311 (0.0251555);0....,0;0;0;0;0;0,0.0
3,3.27167e-310,cg16540789,16,75150736,ENSG00000166816,16,75148213,cis,C/T,C,...,16.493962;23.2769005;20.8489664;13.6811314;8.2...,554;741;732;429;263;186,0.0;0.0;0.0;0.0;0.0;0.0,25622.5;45818.5;44713.0;15372.5;5786.0;2898.5,LDHD,0.6237894;0.7210936;0.669591;0.5960725;0.47972...,0E0 (0E0),0.6237894 (0.0332668);0.7210936 (0.0254864);0....,0;0;0;0;0;0,0.0
4,3.27167e-310,cg07320140,16,75150563,ENSG00000166816,16,75148213,cis,C/T,C,...,16.0541779;23.1818341;21.6722909;13.8547848;8....,554;741;732;429;263;186,0.0;0.0;0.0;0.0;0.0;0.0,25622.5;45818.5;44713.0;15372.5;5786.0;2898.5,LDHD,0.6113259;0.7185615;0.6891731;0.6017535;0.4778...,0E0 (0E0),0.6113259 (0.0336833);0.7185615 (0.025583);0.6...,0;0;0;0;0;0,0.0


In [76]:
# Take the attribute string of the GTF row labelled 1 and split it in two steps:
# first into 'key "value"' pieces, then into a key -> value dictionary.
first_layer = [
    piece
    for piece in tss_raw['geneInfo'].loc[1,].strip().split(";")
    if piece  # the trailing ';' leaves an empty piece behind
]
second_layer = {}
for piece in first_layer:
    tokens = piece.replace('"','').replace(';','').strip().split(' ')
    second_layer[tokens[0]] = tokens[1]
print(first_layer)
print(tss_raw['geneInfo'].loc[1,])

['gene_id "ENSG00000261516"', ' transcript_id "ENST00000561901"', ' exon_number "2"', ' gene_name "ZNF707"', ' gene_biotype "protein_coding"', ' transcript_name "ZNF707-012"', ' exon_id "ENSE00003242903"']
 gene_id "ENSG00000261516"; transcript_id "ENST00000561901"; exon_number "2"; gene_name "ZNF707"; gene_biotype "protein_coding"; transcript_name "ZNF707-012"; exon_id "ENSE00003242903";


In [80]:
# Sample GTF attribute string used to try out findGeneName below
testitem = 'gene_id "ENSG00000261516"; transcript_id "ENST00000561901"; exon_number "2"; gene_name "ZNF707"; gene_biotype "protein_coding"; transcript_name "ZNF707-012"; exon_id "ENSE00003242903";'
def findGeneName(item):
    """Return the value of the first attribute in a GTF attribute string.

    For a string beginning with ``gene_id "ENSG..."; ...`` the result is the
    gene id. Note that, despite the function's name, the ``gene_name``
    attribute is not what is returned.
    """
    # Split on ';' and drop the empty pieces (e.g. the one after the last ';')
    attributes = [piece for piece in item.strip().split(";") if piece]
    leading = attributes[0]
    # 'gene_id "X"' -> ['gene_id', 'X']; the second token is the value
    tokens = leading.replace('"', '').strip().split(' ')
    return tokens[1]
# Try the parser on the sample string, then derive the gene key for every GTF row
print(findGeneName(testitem))
tss_raw['geneName'] = tss_raw['geneInfo'].map(findGeneName)

ENSG00000261516


In [104]:
# Collapse the annotation to one row per gene: its chromosome, the smallest
# start, the largest end and its strand.
# String aggregators replace `lambda x: x.unique()` and `np.min`/`np.max`:
# the lambdas can leave array objects in the cells on current pandas, and
# passing numpy callables to `agg` is deprecated.
# NOTE: 'first' assumes every gene sits on a single chromosome and strand.
groupbyTss = tss_raw.groupby('geneName').agg({
    'chr':'first',
    'startSite':'min',
    'endSite':'max',
    'strand':'first'
})
def findTssSite(series):
    """Return the TSS coordinate for one per-gene row.

    Parameters
    ----------
    series : pandas.Series
        Row with the labels ``strand``, ``startSite`` and ``endSite``.

    Returns
    -------
    ``endSite`` when ``strand`` is ``'-'``, otherwise ``startSite``.
    """
    # Look the fields up by label: integer keys on a label-indexed Series
    # (series[3]) rely on a deprecated positional fallback and also break
    # silently if the column order changes.
    if series['strand'] == '-':
        return series['endSite']
    return series['startSite']
# Row-wise: pick the strand-dependent TSS coordinate for every gene
groupbyTss['TssSite'] = groupbyTss.apply(findTssSite, axis='columns')

In [160]:
# Preview the first five genes of the per-gene table
groupbyTss.head(n=5)

Unnamed: 0_level_0,chr,startSite,endSite,strand,TssSite
geneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003,X,99883667,99894988,-,99894988
ENSG00000000005,X,99839799,99854882,+,99839799
ENSG00000000419,20,49551404,49575092,-,49575092
ENSG00000000457,1,169821804,169863408,-,169863408
ENSG00000000460,1,169631245,169823221,+,169631245


In [169]:
def mapSite(row):
    """Look up the ``TssSite`` of the gene labelled ``row`` in ``groupbyTss``.

    Despite its name, ``row`` is the index label (the gene key), not a row.
    """
    gene_record = groupbyTss.loc[row]
    return gene_record['TssSite']
def calculateDis(row):
    """Return the absolute difference between the first two values of ``row``.

    ``row`` may be a pandas Series (e.g. one row of
    ``eQTMs[['SNPChrPos','TssSite']]``) or any other sequence.
    """
    # Take the values by position explicitly: integer keys on a
    # label-indexed Series (row[0]) rely on a deprecated positional fallback.
    first, second = list(row)[:2]
    return abs(first - second)
def findChr(row):
    """Look up the ``chr`` value of the gene labelled ``row`` in ``groupbyTss``.

    Despite its name, ``row`` is the index label (the gene key), not a row.
    """
    gene_record = groupbyTss.loc[row]
    return gene_record['chr']
def checkChr(row):
    """Return True when the first two values of ``row`` are equal as strings.

    Comparing the string forms lets a text chromosome such as ``'16'`` match
    an integer ``16``. ``row`` may be a pandas Series (e.g. one row of
    ``eQTMs[['chr','SNPChr']]``) or any other sequence.
    """
    # Take the values by position explicitly: integer keys on a
    # label-indexed Series (row[0]) rely on a deprecated positional fallback.
    first, second = list(row)[:2]
    return str(first) == str(second)
# Per probe: look up the gene's TSS position and chromosome
eQTMs['TssSite'] = eQTMs['ProbeName'].map(mapSite)
eQTMs['chr'] = eQTMs['ProbeName'].map(findChr)
# Per row: distance to the TSS, and whether both chromosomes agree
eQTMs['TssDistance'] = eQTMs[['SNPChrPos','TssSite']].apply(calculateDis, axis='columns')
eQTMs['checkChr'] = eQTMs[['chr','SNPChr']].apply(checkChr, axis='columns')
# Show the distinct outcomes of the chromosome check
eQTMs['checkChr'].unique()

array([ True])

In [177]:
# Write the annotated table as CSV, leaving out the index column
eQTMs.to_csv(
    os.path.join(PROJECT_DIR, 'data', 'output', 'et0.0-eQTMwithZscoreTssDistance.csv'),
    index=False,
)

Saved eQTM file to:  /home/shuang/projects/eqtm/data/output/et0.0-eQTMwithZscoreTssDistance.csv


Saved eQTM file to:  /home/shuang/projects/eqtm/data/output/gt0.0-eQTMwithZscoreTssDistance.csv


In [187]:
# Annotate the gt0.05 eQTM file and write the result to data/output
tss_raw_path = os.path.join(
    PROJECT_DIR, 'data', 'TSSdistance', 'Homo_sapiens.GRCh37.71.gtf'
)
eQTM_path = os.path.join(
    PROJECT_DIR, 'data', 'eqtmZscores', '2017-12-09-eQTLsFDR-gt0.05-flipped.txt'
)
save_path = os.path.join(
    PROJECT_DIR, 'data', 'output', 'gt0.05-eQTMwithZscoreTssDistance.csv'
)
addTssDistance2eQTMwithZscoreFile(
    eQTM_path=eQTM_path,
    tss_path=tss_raw_path,
    save_path=save_path,
)

KeyboardInterrupt: 