In [28]:
import pandas as pd
from pybedtools import BedTool

In [11]:
# Input locations (relative paths).
# BED file holding all promoters:
all_promoters_bedfile = '../../data/FIMO/promoters_renamedChr.bed'
# Tab-separated list of the genes whose promoters should be kept:
select_genes_file = '../../data/genomes/ara_housekeeping_list.out'

In [20]:
# Gene list to keep: a header-less, tab-separated file with two columns
# (gene ID and a gene_type label).
select_genes = pd.read_table(select_genes_file, sep='\t', header=None)
cols = ['gene','gene_type']
select_genes.columns = cols
# Promoter BED file (no header row). Column 0 holds chromosome names that mix
# numeric-looking labels with text ones such as 'mitochondria'. It is read
# explicitly as str because pandas' default chunked parsing can otherwise infer
# mixed int/str values for such a column, which can make the later sort on
# 'chr' fail.
all_promoters = pd.read_table(all_promoters_bedfile, sep='\t', header=None, dtype={0: str})
cols = ['chr', 'start', 'stop', 'gene', 'dot', 'strand', 'source', 'type', 'dot2', 'details']
all_promoters.columns = cols

In [21]:
# Preview the selected genes: one gene ID per row with its gene_type label.
select_genes

Unnamed: 0,gene,gene_type
0,AT4G34270,housekeeping
1,AT3G32260,housekeeping
2,AT1G59830,housekeeping
3,AT4G33380,housekeeping
4,AT2G28390,housekeeping
...,...,...
401,AT1G65490,randCont
402,AT2G30360,randCont
403,AT5G56270,randCont
404,AT5G57785,randCont


In [17]:
# Preview the promoter table read from all_promoters_bedfile.
all_promoters

Unnamed: 0,chr,start,stop,gene,dot,strand,source,type,dot2,details
0,1,2630,3630,AT1G01010,.,+,Araport11,gene,.,"Alias=ANAC001,NAC domain containing protein 1;..."
1,1,9130,10130,AT1G01020,.,-,Araport11,gene,.,"Dbxref=PMID:15010618,locus:2200940;ID=AT1G0102..."
2,1,10100,11100,AT1G03987,.,+,Araport11,gene,.,ID=AT1G03987;Name=AT1G03987;locus_type=long_no...
3,1,13714,14714,AT1G01030,.,-,Araport11,gene,.,"Dbxref=PMID:11118137,PMID:15010618,PMID:166036..."
4,1,22120,23120,AT1G01040,.,+,Araport11,gene,.,"Alias=ASU1,ABNORMAL SUSPENSOR 1,ATDCL1,DICER-L..."
...,...,...,...,...,...,...,...,...,...,...
33336,mitochondria,361052,362052,ATmitochondriaG01370,.,-,Araport11,gene,.,Dbxref=locus:504954620;ID=ATmitochondriaG01370...
33337,mitochondria,361179,362179,ATmitochondriaG01380,.,-,Araport11,gene,.,"Dbxref=PmitochondriaID:15067383,PmitochondriaI..."
33338,mitochondria,362724,363724,ATmitochondriaG01400,.,+,Araport11,gene,.,Dbxref=locus:504954623;ID=ATmitochondriaG01400...
33339,mitochondria,363284,364284,ATmitochondriaG01390,.,-,Araport11,gene,.,"Dbxref=PmitochondriaID:14586555,PmitochondriaI..."


### Keep only the promoters of the selected genes

In [22]:
# Inner-join on the gene ID so that only promoters of the selected genes remain;
# each promoter row also carries the gene_type label from the gene list.
merged = select_genes.merge(all_promoters, how='inner', on='gene')

In [23]:
# Preview the merge result: gene and gene_type followed by the promoter columns.
merged

Unnamed: 0,gene,gene_type,chr,start,stop,dot,strand,source,type,dot2,details
0,AT4G34270,housekeeping,4,16406303,16407303,.,-,Araport11,gene,.,"Dbxref=PMID:8893552,PMID:16166256,PMID:1708550..."
1,AT3G32260,housekeeping,3,13218167,13219167,.,+,Araport11,gene,.,"Dbxref=PMID:16166256,locus:2084673;ID=AT3G3226..."
2,AT1G59830,housekeeping,1,22022503,22023503,.,-,Araport11,gene,.,"Dbxref=PMID:10091592,PMID:10080713,PMID:986939..."
3,AT4G33380,housekeeping,4,16070739,16071739,.,+,Araport11,gene,.,"Dbxref=PMID:16166256,PMID:17085508,PMID:179083..."
4,AT2G28390,housekeeping,2,12143665,12144665,.,-,Araport11,gene,.,"Alias=MON1,MONENSIN SENSITIVITY1;Dbxref=PMID:1..."
...,...,...,...,...,...,...,...,...,...,...,...
401,AT1G65490,randCont,1,24353645,24354645,.,+,Araport11,gene,.,"Dbxref=PMID:15894741,PMID:18650403,PMID:207364..."
402,AT2G30360,randCont,2,12938834,12939834,.,-,Araport11,gene,.,"Alias=CIPK11,CBL-INTERACTING PROTEIN KINASE 11..."
403,AT5G56270,randCont,5,22778613,22779613,.,+,Araport11,gene,.,"Alias=ATWRKY2,ARABIDOPSIS THALIANA WRKY DNA-BI..."
404,AT5G57785,randCont,5,23409715,23410715,.,-,Araport11,gene,.,"Dbxref=PMID:16258012,PMID:16792696,PMID:203512..."


In [27]:
#check if any NaN
# The merge is an inner join (the pandas default), so a selected gene that has
# no promoter entry is dropped rather than kept with NaN values; the NaN filter
# below cannot reveal it on its own. Report such genes explicitly first.
missing_genes = select_genes.loc[~select_genes['gene'].isin(merged['gene']), 'gene']
if not missing_genes.empty:
    print('{} selected gene(s) have no promoter entry: {}'.format(len(missing_genes), missing_genes.tolist()))
# Rows whose chromosome is missing (expected to be empty).
merged[pd.isna(merged.chr)]

Unnamed: 0,gene,gene_type,chr,start,stop,dot,strand,source,type,dot2,details


### Change column order to BED format and sort by position

In [30]:
# Select the columns in the order used for the BED output (chr, start, stop
# first); gene_type is not carried over.
bed_column_order = ['chr', 'start', 'stop', 'gene', 'dot', 'strand', 'source', 'type', 'dot2', 'details']
filtered_proms = merged[bed_column_order]
# Order the rows by chromosome, then by promoter start coordinate.
filtered_proms_sorted = filtered_proms.sort_values(by=['chr', 'start'])

In [31]:
# Preview the promoters after reordering the columns and sorting by chr, then start.
filtered_proms_sorted

Unnamed: 0,chr,start,stop,gene,dot,strand,source,type,dot2,details
229,1,33171,34171,AT1G01050,.,-,Araport11,gene,.,"Alias=AtPPa1,pyrophosphorylase 1;Dbxref=PMID:1..."
157,1,281760,282760,AT1G01780,.,+,Araport11,gene,.,"Dbxref=PMID:15642518,PMID:15659355,PMID:155317..."
45,1,316090,317090,AT1G01910,.,-,Araport11,gene,.,"Dbxref=PMID:18315867,PMID:18775970,PMID:207364..."
181,1,344515,345515,AT1G02000,.,+,Araport11,gene,.,"Dbxref=PMID:15247385,PMID:15911562,PMID:187759..."
149,1,728829,729829,AT1G03070,.,+,Araport11,gene,.,"Alias=AtLFG4,LFG4,LIFEGUARD 4;Dbxref=PMID:1708..."
...,...,...,...,...,...,...,...,...,...,...
346,5,26120581,26121581,AT5G65360,.,-,Araport11,gene,.,"Alias=H3.1,histone 3.1;Dbxref=PMID:12662315,PM..."
286,5,26171008,26172008,AT5G65470,.,+,Araport11,gene,.,"Dbxref=PMID:18775970,locus:2168297;ID=AT5G6547..."
360,5,26467531,26468531,AT5G66250,.,+,Araport11,gene,.,"Dbxref=PMID:18650403,PMID:18775970,PMID:207364..."
154,5,26817948,26818948,AT5G67210,.,+,Araport11,gene,.,"Dbxref=PMID:21251108,PMID:21288268,locus:21555..."


### Write out a BED file of the filtered, sorted promoters

In [32]:
# Convert the sorted promoter table to a BedTool, then save it to disk.
# NOTE(review): the output name says "responsive", but the gene list read above
# is labelled housekeeping/randCont — confirm this is the intended file name.
promoters_bed = BedTool.from_dataframe(filtered_proms_sorted)
promoters_bed.saveas('../../data/FIMO/responsivepromoters.bed')

<BedTool(../../data/FIMO/responsivepromoters.bed)>