In [1]:
import os
import pandas as pd
import numpy as np

In [139]:
PROJECT_DIR = '/home/shuang/projects/eqtm'
mendelian_folder = os.path.join(PROJECT_DIR,'paper_data')
eqtm_folder = os.path.join(PROJECT_DIR,'data','eqtm')
bed_folder = os.path.join(PROJECT_DIR,'bedFiles')

mendelian = os.path.join(mendelian_folder,'mmc2.xlsx')

stringent = os.path.join(mendelian_folder,'41588_2018_62_MOESM4_ESM.txt')

eqtm_et = os.path.join(eqtm_folder,'2017-12-09-eQTLsFDR-et0.0-flipped.txt')
eqtm_gt = os.path.join(eqtm_folder,'2017-12-09-eQTLsFDR-gt0.0-flipped.txt')

In [230]:
# read mendelian disease causing variants
xls = pd.ExcelFile(mendelian)
mendelian_variantsDict = {}

sheets_needsJoin = [name for name in xls.sheet_names if name not in ['Table S6 legend']]
fields = ['Chr','Position','Ref','Alt','OMIM','Gene','PMID']

mendelian_variants = xls.parse(sheets_needsJoin[0])
mendelian_variants['function_type'] = sheets_needsJoin[0]
for sheet_name in sheets_needsJoin[1:]:
#     mendelian_variantsDict[sheet_name] = xls.parse(sheet_name)
    # try to concat here
    mendelian_variants2 = xls.parse(sheet_name)
    mendelian_variants2['function_type'] = sheet_name
    mendelian_variants = pd.concat([mendelian_variants,mendelian_variants2])

# add end position for mendelian variants
def findEnd(row):
    return row['Position']+max(len(row['Ref']),len(row['Alt']))
mendelian_variantsLimitedCols = mendelian_variants[fields].copy()
mendelian_variantsLimitedCols['posEnd'] = mendelian_variantsLimitedCols['Position']+1

# add a column for the mendelian variants indicating the chr number
def extract_nr(row):
    try:
        return int(row[3:])
    except:
        return row[3:]
mendelian_variantsLimitedCols['chr_nr'] = \
mendelian_variantsLimitedCols['Chr'].apply(extract_nr)
sorted_mendelian = mendelian_variantsLimitedCols.sort_values(by=['chr_nr','Position'])
sorted_mendelian[['Chr','Position','posEnd']].to_csv(os.path.join(bed_folder,'mendelian.bed'),
                           sep='\t',
                           header=False,
                           index=False)

In [162]:
mendelian_variantsLimitedCols.head()

Unnamed: 0,Chr,Position,Ref,Alt,OMIM,Gene,PMID,posEnd,chr_nr
0,chr1,21890663,G,A,MIM 241500,ALPL,10679946,21891116,1
1,chr1,209989478,C,CA,MIM 119300,IRF6,24442519,209989931,1
2,chr2,219524871,A,G,MIM 124000,BCS1L,19389488,219525324,2
3,chr7,156061506,C,T,MIM 142945,SHH,18836447,156061959,7
4,chr7,156583831,T,C,MIM 174500,SHH,17152067,156584284,7


In [167]:
# read eqtms
eqtms_et = pd.read_csv(eqtm_et,sep='\t')
eqtms_gt = pd.read_csv(eqtm_gt,sep='\t')
# add/minus 25 for start and end position of eqtms
def addString(row):
    return 'chr{}'.format(row)

for eqtms in [eqtms_et,eqtms_gt]:
    eqtms['chromosome'] = eqtms['SNPChr'].apply(addString)
    eqtms['posStart'] = eqtms['SNPChrPos'] -25
    eqtms['posEnd'] = eqtms['SNPChrPos'] +25

sorted_eqtmet = eqtms_et.sort_values(by=['SNPChr','posStart'])
sorted_eqtmgt = eqtms_gt.sort_values(by=['SNPChr','posStart'])

    
sorted_eqtmet[['chromosome','posStart','posEnd']].to_csv(os.path.join(bed_folder,
                                                             'eqtms_et.bed'),
                                                sep='\t',
                                                header=False,
                                                index=False)
sorted_eqtmgt[['chromosome','posStart','posEnd']].to_csv(os.path.join(bed_folder,
                                                             'eqtms_gt.bed'),
                                                sep='\t',
                                                header=False,
                                                index=False)
# eqtms_et[['chromosome','posStart','posEnd']].head()

In [168]:
# read the stringent variants
stringent_variants = pd.read_csv(stringent,sep='\t')
stringent_variants['chr_nr'] = stringent_variants['chromosome'].apply(extract_nr)
print(stringent_variants.shape)
stringent_variants['posEnd'] = stringent_variants['position (leftmost coordinate)']+\
max(len(stringent_variants['ref']),len(stringent_variants['alt']))

sorted_stringent = stringent_variants.sort_values(by=['chr_nr','position (leftmost coordinate)'])
sorted_stringent[['chromosome','position (leftmost coordinate)','posEnd']].to_csv(os.path.join(bed_folder,
                                                                           'stringent.bed'),
                                                              sep='\t',
                                                              header=False,
                                                              index=False)

(15741, 7)


In [144]:
stringent_variants.head()

Unnamed: 0,chromosome,position (leftmost coordinate),ref,alt,All,Stringent,chr_nr
0,chr1,2406719,C,G,1,0,1
1,chr1,2408451,C,T,1,0,1
2,chr1,3816372,T,C,1,0,1
3,chr1,3839606,A,G,1,0,1
4,chr1,5866372,C,A,1,0,1


# Use Bedtools for finding intersect

In [210]:
# examine bedtools results
res_folder = os.path.join(PROJECT_DIR,'bedFiles')
overlapping_files = ['et_mend.txt','et_stringent.txt','gt_mend.txt','gt_stringent.txt']
# et means fdr>0, gt means fdr==0

# all mendelian disease variants that has overlap area with significant CpG sites
# None

In [231]:
# examine fdr==0 and mendelian disease
examine_file = os.path.join(res_folder,'gt_mend.txt')
overlapping_variants_etMend = pd.read_csv(examine_file,sep='\t',header=None)
alist = []
for row in overlapping_variants_etMend.iterrows():
    chr_nr,pos = row[1][[0,1]]
    alist.append(mendelian_variants[(mendelian_variants['Chr']==chr_nr) & 
                       (mendelian_variants['Position']==pos)])
for element in alist:
    print(element[['Ref','Alt','function_type','OMIM']].values)

EmptyDataError: No columns to parse from file

# all mendelian disease variants that has overlap area with insignificant CpG sites

# MIM 102200 - PITUITARY ADENOMA PREDISPOSITION, INCLUDED; PAP, INCLUDED
# MIM 176000 - PORPHYRIA, ACUTE INTERMITTENT; AIP


In [233]:
# examine fdr==0 and mendelian disease
examine_file = os.path.join(res_folder,'et_mend.txt')
overlapping_variants_etMend = pd.read_csv(examine_file,sep='\t',header=None)
alist = []
for row in overlapping_variants_etMend.iterrows():
    chr_nr,pos = row[1][[0,1]]
    alist.append(mendelian_variants[(mendelian_variants['Chr']==chr_nr) & 
                       (mendelian_variants['Position']==pos)])
for element in alist:
    print(element[['function_type','OMIM']].values)

[['Enhancer Mutations' 'MIM 102200']]
[['5’ UTR Mutations' 'MIM 176000']]


# all stringent variants that has overlap area with significant CpG sites
# 1066

In [234]:
# examine fdr==0 and mendelian disease
examine_file = os.path.join(res_folder,'gt_stringent.txt')
overlapping_variants_etMend = pd.read_csv(examine_file,sep='\t',header=None)
overlapping_variants_etMend
alist = []
for row in overlapping_variants_etMend.iterrows():
    chr_nr,pos = row[1][[0,1]]
#     print(chr_nr,pos)
    alist.append(stringent_variants[(stringent_variants['chromosome']==chr_nr) & 
                       (stringent_variants['position (leftmost coordinate)']==pos)])
print(len(alist))
print(stringent_variants.columns)
for element in alist:
    print(element.values)

1066
Index(['chromosome', 'position (leftmost coordinate)', 'ref', 'alt', 'All',
       'Stringent', 'chr_nr', 'posEnd'],
      dtype='object')
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2408451 'C' 'T' 1 0 '1' 2424192]]
[['chr1' 2408451 'C' 'T' 1 0 '1' 2424192]]
[['chr1' 150508280 'G' 'A' 1 0 '1' 150524021]
 ['chr1' 150508280 'G' 'C' 1 0 '1' 150524021]]
[['chr1' 150508280 'G' 'A' 1 0 '1' 150524021]
 ['chr1' 150508280 'G' 'C' 1 0 '1' 150524021]]
[['chr1' 150508280 'G' 'A' 1 0 '1' 150524021]
 ['chr1' 150508280 'G' 'C' 1 0 '1' 150524021]]
[['chr1' 150508280 'G' 'A' 1 0 '1' 150524021]
 ['chr1' 150508280 'G' 'C' 1 0 '1' 150524021]]
[['chr1' 150509582 'G' 'C' 1 0 '1' 150525323]]
[['chr1' 150509582 'G' 'C' 1 0 '1' 150525323]]
[['chr1' 150511832 'G' 'A' 1 0 '1' 150527573]]
[['chr1' 150511832 'G' 'A' 1 0 '1' 150527573]]
[['chr1' 150512573 'G' 'A' 1 0 '1' 150528314]]
[['chr1' 150512573 'G' 'A' 1 0 '1' 150528314]]
[['chr1' 150513236 'GA' 'G' 1 

[['chr8' 11703860 'G' 'T' 1 1 '8' 11719601]]
[['chr8' 11703860 'G' 'T' 1 1 '8' 11719601]]
[['chr8' 11708307 'G' 'C' 1 1 '8' 11724048]]
[['chr8' 11708307 'G' 'C' 1 1 '8' 11724048]]
[['chr9' 32984630 'C' 'T' 1 0 '9' 33000371]]
[['chr9' 35657748 'A' 'G' 1 1 '9' 35673489]]
[['chr9' 36236828 'T' 'C' 1 0 '9' 36252569]]
[['chr9' 36236988 'T' 'C' 1 0 '9' 36252729]]
[['chr9' 36246029 'AC' 'A' 1 0 '9' 36261770]]
[['chr9' 36249191 'C' 'T' 1 0 '9' 36264932]]


# all stringent variants that has overlap area with significant CpG sites
# 4761

In [235]:
# examine fdr==0 and mendelian disease
examine_file = os.path.join(res_folder,'et_stringent.txt')
overlapping_variants_etMend = pd.read_csv(examine_file,sep='\t',header=None)
overlapping_variants_etMend
alist = []
for row in overlapping_variants_etMend.iterrows():
    chr_nr,pos = row[1][[0,1]]
#     print(chr_nr,pos)
    alist.append(stringent_variants[(stringent_variants['chromosome']==chr_nr) & 
                       (stringent_variants['position (leftmost coordinate)']==pos)])
print(len(alist))
for element in alist:
    print(element.values)

4761
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2406719 'C' 'G' 1 0 '1' 2422460]]
[['chr1' 2408451 'C' 'T' 1 0 '1' 2424192]]
[['chr1' 2408451 'C' 'T' 1 0 '1' 2424192]]
[['chr1' 2408451 'C' 'T' 1 0 '1' 2424192]]
[['chr1' 2408451 'C' 'T' 1 0 '1' 2424192]]
[['chr1' 2408451 'C' 'T' 1 0 '1' 2424192]]
[['chr1' 2408451 'C' 'T' 1 0 '1' 2424192]]
[['chr1' 3816372 'T' 'C' 1 0 '1' 3832113]]
[['chr1' 11790916 'C' 'T' 1 1 '1' 11806657]]
[['chr1' 11790916 'C' 'T' 1 1 '1' 11806657]]
[['chr1' 11790916 'C' 'T' 1 1 '1' 11806657]]
[['chr1' 11790916 'C' 'T' 1 1 '1' 11806657]]
[['chr1' 11790916 'C' 'T' 1 1 '1' 11806657]]
[['chr1' 11790916 'C' 'T' 1 1 '1' 11806

[['chr16' 681881 'G' 'C' 1 0 '16' 697622]]
[['chr16' 723856 'C' 'G' 1 0 '16' 739597]]
[['chr16' 723856 'C' 'G' 1 0 '16' 739597]]
[['chr16' 1361871 'G' 'C' 1 0 '16' 1377612]]
[['chr16' 1361871 'G' 'C' 1 0 '16' 1377612]]
[['chr16' 1361871 'G' 'C' 1 0 '16' 1377612]]
[['chr16' 1361871 'G' 'C' 1 0 '16' 1377612]]
[['chr16' 1362037 'G' 'C' 1 0 '16' 1377778]]
[['chr16' 1362037 'G' 'C' 1 0 '16' 1377778]]
[['chr16' 1362037 'G' 'C' 1 0 '16' 1377778]]
[['chr16' 1362037 'G' 'C' 1 0 '16' 1377778]]
[['chr16' 1362442 'G' 'A' 1 0 '16' 1378183]]
[['chr16' 1362442 'G' 'A' 1 0 '16' 1378183]]
[['chr16' 1362442 'G' 'A' 1 0 '16' 1378183]]
[['chr16' 1362442 'G' 'A' 1 0 '16' 1378183]]
[['chr16' 1362535 'G' 'C' 1 0 '16' 1378276]]
[['chr16' 1362535 'G' 'C' 1 0 '16' 1378276]]
[['chr16' 1362535 'G' 'C' 1 0 '16' 1378276]]
[['chr16' 1362535 'G' 'C' 1 0 '16' 1378276]]
[['chr16' 1362609 'AG' 'A' 1 0 '16' 1378350]
 ['chr16' 1362609 'A' 'G' 1 0 '16' 1378350]]
[['chr16' 1362609 'AG' 'A' 1 0 '16' 1378350]
 ['chr16' 136260

[['chr16' 28936152 'G' 'T' 1 0 '16' 28951893]]
[['chr16' 28936152 'G' 'T' 1 0 '16' 28951893]]
[['chr16' 28936152 'G' 'T' 1 0 '16' 28951893]]
[['chr16' 28936152 'G' 'T' 1 0 '16' 28951893]]
[['chr16' 28936152 'G' 'T' 1 0 '16' 28951893]]
[['chr16' 28936152 'G' 'T' 1 0 '16' 28951893]]
[['chr16' 29814331 'A' 'T' 1 0 '16' 29830072]]
[['chr16' 29814331 'A' 'T' 1 0 '16' 29830072]]
[['chr16' 29814331 'A' 'T' 1 0 '16' 29830072]]
[['chr16' 29814331 'A' 'T' 1 0 '16' 29830072]]
[['chr16' 30088841 'T' 'A' 1 0 '16' 30104582]]
[['chr16' 30751095 'G' 'A' 1 0 '16' 30766836]]
[['chr16' 30751548 'G' 'C' 1 0 '16' 30767289]]
[['chr16' 30751604 'G' 'A' 1 0 '16' 30767345]]
[['chr16' 30963431 'A' 'G' 1 0 '16' 30979172]]
[['chr16' 31190960 'TA' 'T' 1 0 '16' 31206701]]
[['chr16' 31190961 'A' 'G' 1 0 '16' 31206702]]
[['chr16' 31191397 'A' 'C' 1 0 '16' 31207138]
 ['chr16' 31191397 'A' 'T' 1 0 '16' 31207138]]
[['chr16' 31191397 'A' 'C' 1 0 '16' 31207138]
 ['chr16' 31191397 'A' 'T' 1 0 '16' 31207138]]
[['chr16' 3119

[['chr19' 11200714 'A' 'G' 1 0 '19' 11216455]]
[['chr19' 11200714 'A' 'G' 1 0 '19' 11216455]]
[['chr19' 11200714 'A' 'G' 1 0 '19' 11216455]]
[['chr19' 11200714 'A' 'G' 1 0 '19' 11216455]]
[['chr19' 11200714 'A' 'G' 1 0 '19' 11216455]]
[['chr19' 11200714 'A' 'G' 1 0 '19' 11216455]]
[['chr19' 11200714 'A' 'G' 1 0 '19' 11216455]]
[['chr19' 11200714 'A' 'G' 1 0 '19' 11216455]]
[['chr19' 11213175 'C' 'T' 1 0 '19' 11228916]]
[['chr19' 11214650 'C' 'G' 1 0 '19' 11230391]]
[['chr19' 11215382 'C' 'A' 1 0 '19' 11231123]]
[['chr19' 11437972 'G' 'C' 1 0 '19' 11453713]]
[['chr19' 11446267 'CGCAG' 'C' 1 0 '19' 11462008]]
[['chr19' 11446267 'CGCAG' 'C' 1 0 '19' 11462008]]
[['chr19' 11446267 'CGCAG' 'C' 1 0 '19' 11462008]]
[['chr19' 11446267 'CGCAG' 'C' 1 0 '19' 11462008]]
[['chr19' 11446352 'T' 'C' 1 0 '19' 11462093]]
[['chr19' 11446352 'T' 'C' 1 0 '19' 11462093]]
[['chr19' 11446352 'T' 'C' 1 0 '19' 11462093]]
[['chr19' 11446352 'T' 'C' 1 0 '19' 11462093]]
[['chr19' 11449074 'A' 'G' 1 0 '19' 11464815

[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039458 'G' 'A' 1 0 '6' 32055199]]
[['chr6' 32039747 'A' 'G' 1 0 '6' 32055488]]
[['chr6' 32039747 'A' 'G' 1 0 '6' 32055488]]
[['chr6' 32039747 'A' 'G' 1 0 '6' 32055488]]
[['chr6' 32039747 'A' 'G' 1 0 '6' 32055488]]
[['chr6' 32039747 'A' 'G' 1 0 '6' 32055488]]
[['chr6' 32039747 'A' 'G' 1 0 '6' 32055488]]
[['chr6' 32039747 'A' 'G' 1 0 '6' 32055488]]
[['chr6' 3

In [180]:
mendelian_variantsLimitedCols

Unnamed: 0,Chr,Position,Ref,Alt,OMIM,Gene,PMID,posEnd,chr_nr
0,chr1,21890663,G,A,MIM 241500,ALPL,10679946,21891116,1
1,chr1,209989478,C,CA,MIM 119300,IRF6,24442519,209989931,1
2,chr2,219524871,A,G,MIM 124000,BCS1L,19389488,219525324,2
3,chr7,156061506,C,T,MIM 142945,SHH,18836447,156061959,7
4,chr7,156583831,T,C,MIM 174500,SHH,17152067,156584284,7
5,chr7,156583949,G,C,MIM 174500,SHH,17152067,156584402,7
6,chr7,156583951,G,A,MIM 174500,SHH,22903933,156584404,7
7,chr7,156583968,T,TTAAGGAAGTGATT,MIM 174500,SHH,22495965,156584421,7
8,chr7,156584107,A,C,MIM 174500,SHH,20068592,156584560,7
9,chr7,156584153,T,C,MIM 174500,SHH,25382487,156584606,7
