In [6]:
import os
import pandas as pd
import numpy as np
PROJECT_DIR = '/home/shuang/projects/development_eqtm'
tss_filepath = '/home/shuang/projects/development_eqtm/data/features/TSSDistance/Homo_sapiens.GRCh37.71.gtf'
eqtm_filepath = '/home/shuang/projects/development_eqtm/data/eqtmZscores/withExpressionTSSMethy/2017-12-09-eQTLsFDR-et0_withExpressionTssMethy.txt'

In [5]:
def add_TSS_basedOnProbeName_toEQTMFile(eqtm_filepath,tss_raw,
                                      eqtm_savepath=None):
    '''
    add TssDistance to eqtm file
    INPUT:
        eQTM_path, string, path to eqtmZscore file
        tss_filepath, string, path to TssSite file
        save_path, string, path to save the new eqtmZscore_withTSSDistance file
    OUTPUT:
        eQTMs, pandas dataframe
    '''
    # tss_raw = read_tss_data(tss_filepath)
    # reading the eQTMs
    eQTMs = pd.read_csv(eqtm_filepath,sep=',')

    # extract gene name from geneInfo for tss file
    def findGeneName(item):
        item = [thing for thing in list(filter(None,item.strip().split(";")))][0]
        name = item.replace('"','').replace(';','').strip().split(' ')[1]
        return name
    tss_raw['geneName'] = tss_raw['geneInfo'].apply(findGeneName)

    # find the tss sites for each gene in the tss file
    groupbyTss = tss_raw.groupby('geneName').agg({
        'chr':lambda x: x.unique(),
        'startSite':np.min,
        'endSite':np.max,
        'strand':lambda x: x.unique()
    })
    def findTssSite(series):
        if series[3] == '-':
            return series[2]
        else:
            return series[1]
    groupbyTss['TssSite'] = groupbyTss.apply(findTssSite,axis=1)

    # add tss sites and tss distance to the eqtm file
    def mapSite(row):
        return groupbyTss.loc[row]['TssSite']
    def calculateDis(row):
        return abs(row[0]-row[1])
    def findChr(row):
        return groupbyTss.loc[row]['chr']
    def checkChr(row):
        if str(row[0])==str(row[1]):
            return True
        else:
            return False
    eQTMs['TssSite'] = eQTMs['ProbeName'].apply(mapSite)
    eQTMs['chr'] = eQTMs['ProbeName'].apply(findChr)
    eQTMs['TssDistance'] = eQTMs[['SNPChrPos','TssSite']].apply(calculateDis,axis=1)
    eQTMs['checkChr'] = eQTMs[['chr','SNPChr']].apply(checkChr,axis=1)
    # check whether they are from the same chromosome
    assert len(eQTMs['checkChr'].unique()) == 1

    if eqtm_savepath:
        # save the eQTM file
        eQTMs.to_csv(eqtm_savepath,index=False)
        print('Saved eQTM file to: ',eqtm_savepath)

    return eQTMs

def read_tss_data(tss_filepath):
    '''
    read tss file from tss_filepath
    '''
    colnames = ['chr','regionFunction','regionType','startSite',
                'endSite','score','strand','sthunknown','geneInfo']
    dtype = {'chr':object,'regionFunction':object,'regionType':object,
             'startSite':int,'endSite':int,'score':object,
             'strand':object,'sthunknown':object,'geneInfo':object}
    tss_raw = pd.read_csv(tss_filepath,sep='\t',header=None,
                          names=colnames,dtype=dtype)
    return tss_raw

In [None]:
    tss_raw = read_tss_data(tss_filepath) # slow...

In [7]:
    eQTMs = pd.read_csv(eqtm_filepath,sep=',')

    # extract gene name from geneInfo for tss file
    def findGeneName(item):
        item = [thing for thing in list(filter(None,item.strip().split(";")))][0]
        name = item.replace('"','').replace(';','').strip().split(' ')[1]
        return name
    
    tss_raw['geneName'] = tss_raw['geneInfo'].apply(findGeneName)

    # find the tss sites for each gene in the tss file
    groupbyTss = tss_raw.groupby('geneName').agg({
        'chr':lambda x: x.unique(),
        'startSite':np.min,
        'endSite':np.max,
        'strand':lambda x: x.unique()
    })
    def findTssSite(series):
        if series[3] == '-':
            return series[2]
        else:
            return series[1]
    groupbyTss['TssSite'] = groupbyTss.apply(findTssSite,axis=1)

#     # add tss sites and tss distance to the eqtm file
#     def mapSite(row):
#         return groupbyTss.loc[row]['TssSite']
#     def calculateDis(row):
#         return abs(row[0]-row[1])
#     def findChr(row):
#         return groupbyTss.loc[row]['chr']
#     def checkChr(row):
#         if str(row[0])==str(row[1]):
#             return True
#         else:
#             return False
#     eQTMs['TssSite'] = eQTMs['ProbeName'].apply(mapSite)
#     eQTMs['chr'] = eQTMs['ProbeName'].apply(findChr)
#     eQTMs['TssDistance'] = eQTMs[['SNPChrPos','TssSite']].apply(calculateDis,axis=1)
#     eQTMs['checkChr'] = eQTMs[['chr','SNPChr']].apply(checkChr,axis=1)
#     # check whether they are from the same chromosome
#     assert len(eQTMs['checkChr'].unique()) == 1

#     if eqtm_savepath:
#         # save the eQTM file
#         eQTMs.to_csv(eqtm_savepath,index=False)
#         print('Saved eQTM file to: ',eqtm_savepath)

In [8]:
groupbyTss.head()

Unnamed: 0_level_0,chr,startSite,endSite,strand,TssSite
geneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003,X,99883667,99894988,-,99894988
ENSG00000000005,X,99839799,99854882,+,99839799
ENSG00000000419,20,49551404,49575092,-,49575092
ENSG00000000457,1,169821804,169863408,-,169863408
ENSG00000000460,1,169631245,169823221,+,169631245


In [13]:
for row in groupbyTss.itertuples():
    if row[2]>row[3]:
        print(row)

In [1]:
groupbyTss.to_csv('/home/shuang/projects/development_eqtm/data/geneSites/all_geneSites.txt')

NameError: name 'groupbyTss' is not defined