In [20]:
import pandas as pd
from pybedtools import BedTool

In [21]:
# --- Gene lists (paths are relative to this notebook) ---
# Tab-separated gene list with no header row.
select_genes_file = '../../data/genomes/ara_housekeeping_list.out'
# CSV gene list with a header row.
select_genes_file_100_random = '../../data/genes/constitutive-variable-random_100_each.csv'

# --- Promoter BED files ---
genes_overlap_bedfile = '../../data/genomes/promoterandgenes_only_overlap.bed'
# NOTE: despite its name, this file is the "genome features overlap" set.
promoters_overlapping_bedfile = '../../data/genomes/promoters_overlapping.bed'

In [22]:
# Gene list: headerless, tab-separated, two columns (gene ID and its category).
select_genes = pd.read_table(select_genes_file, sep='\t', header=None)
select_genes.columns = ['gene', 'gene_type']

# Both promoter tables are headerless, tab-separated files that are given the
# same ten column names below.
cols = [
    'chr', 'start', 'stop', 'gene', 'dot',
    'strand', 'source', 'type', 'dot2', 'details',
]

genes_overlap = pd.read_table(genes_overlap_bedfile, sep='\t', header=None)
genes_overlap.columns = cols

promoters_overlapping = pd.read_table(promoters_overlapping_bedfile, sep='\t', header=None)
promoters_overlapping.columns = cols

In [28]:
# Load the random-100 gene list and normalise it to the same layout as
# select_genes, without mutating any frame in place:
#   * drop the saved index column ('Unnamed: 0') that the CSV carries
#   * rename 'promoter_AGI' to 'gene' so it can be merged on 'gene' later
random_100_only = (
    pd.read_csv(select_genes_file_100_random, header=0)
    .drop(columns='Unnamed: 0')  # raises KeyError if absent, like the original `del`
    .rename(columns={'promoter_AGI': 'gene'})
)
random_100_only

Unnamed: 0,gene,gene_type
0,AT4G34270,housekeeping
1,AT3G32260,housekeeping
2,AT1G59830,housekeeping
3,AT4G33380,housekeeping
4,AT2G28390,housekeeping
...,...,...
295,AT5G13410,randCont
296,AT5G56090,randCont
297,AT5G05170,randCont
298,AT3G18110,randCont


In [24]:
# Display the gene list read from select_genes_file (columns: gene, gene_type).
select_genes

Unnamed: 0,gene,gene_type
0,AT4G34270,housekeeping
1,AT3G32260,housekeeping
2,AT1G59830,housekeeping
3,AT4G33380,housekeeping
4,AT2G28390,housekeeping
...,...,...
401,AT1G65490,randCont
402,AT2G30360,randCont
403,AT5G56270,randCont
404,AT5G57785,randCont


### Keep only promoters belonging to the selected genes (inner merge on `gene`)

In [29]:
# Join each gene list to each promoter table on the shared 'gene' column.
# An inner join (the pandas default) keeps only genes present in both inputs.

# Full selected gene list
merged = select_genes.merge(genes_overlap, on='gene', how='inner')
merged2 = select_genes.merge(promoters_overlapping, on='gene', how='inner')

# Random-100 gene list
merged_random100 = random_100_only.merge(genes_overlap, on='gene', how='inner')
merged2_random100 = random_100_only.merge(promoters_overlapping, on='gene', how='inner')


In [7]:
# Display the merge of select_genes with genes_overlap (joined on 'gene').
merged

Unnamed: 0,gene,gene_type,chr,start,stop,dot,strand,source,type,dot2,details
0,AT4G34270,housekeeping,Chr4,16406303,16407303,.,-,Araport11,gene,.,"Dbxref=PMID:8893552,PMID:16166256,PMID:1708550..."
1,AT1G59830,housekeeping,Chr1,22022503,22023503,.,-,Araport11,gene,.,"Dbxref=PMID:10091592,PMID:10080713,PMID:986939..."
2,AT4G33380,housekeeping,Chr4,16070739,16071739,.,+,Araport11,gene,.,"Dbxref=PMID:16166256,PMID:17085508,PMID:179083..."
3,AT5G55840,housekeeping,Chr5,22596992,22597992,.,+,Araport11,gene,.,"Dbxref=PMID:15632092,PMID:15565108,PMID:161662..."
4,AT1G13320,housekeeping,Chr1,4567849,4568849,.,-,Araport11,gene,.,"Dbxref=PMID:7811971,PMID:14973165,PMID:1527645..."
...,...,...,...,...,...,...,...,...,...,...,...
249,AT1G27370,randCont,Chr1,9508542,9509542,.,-,Araport11,gene,.,"Dbxref=PMID:11118137,PMID:14972688,PMID:151310..."
250,AT1G10070,randCont,Chr1,3287086,3288086,.,+,Araport11,gene,.,"Alias=ATBCAT-2,branched-chain amino acid trans..."
251,AT1G65490,randCont,Chr1,24353645,24354645,.,+,Araport11,gene,.,"Dbxref=PMID:15894741,PMID:18650403,PMID:207364..."
252,AT2G30360,randCont,Chr2,12938834,12939834,.,-,Araport11,gene,.,"Alias=CIPK11,CBL-INTERACTING PROTEIN KINASE 11..."


In [30]:
# Rows per gene_type for the random-100 list after merging with genes_overlap.
merged_counts = merged_random100.gene_type.value_counts()
merged_counts

housekeeping    66
randCont        59
highVar         56
Name: gene_type, dtype: int64

In [31]:
# Rows per gene_type for the random-100 list after merging with
# promoters_overlapping (the "genome features overlap" set).
merged_counts2 = merged2_random100.gene_type.value_counts()
merged_counts2

housekeeping    85
highVar         73
randCont        73
Name: gene_type, dtype: int64

In [24]:
# Display the merge of select_genes with promoters_overlapping (joined on 'gene').
merged2


Unnamed: 0,gene,gene_type,chr,start,stop,dot,strand,source,type,dot2,details
0,AT4G34270,housekeeping,Chr4,16406303,16407303,.,-,Araport11,gene,.,"Dbxref=PMID:8893552,PMID:16166256,PMID:1708550..."
1,AT1G59830,housekeeping,Chr1,22022503,22023503,.,-,Araport11,gene,.,"Dbxref=PMID:10091592,PMID:10080713,PMID:986939..."
2,AT4G33380,housekeeping,Chr4,16070739,16071739,.,+,Araport11,gene,.,"Dbxref=PMID:16166256,PMID:17085508,PMID:179083..."
3,AT2G28390,housekeeping,Chr2,12143665,12144665,.,-,Araport11,gene,.,"Alias=MON1,MONENSIN SENSITIVITY1;Dbxref=PMID:1..."
4,AT5G55840,housekeeping,Chr5,22596992,22597992,.,+,Araport11,gene,.,"Dbxref=PMID:15632092,PMID:15565108,PMID:161662..."
...,...,...,...,...,...,...,...,...,...,...,...
315,AT3G43960,randCont,Chr3,15775873,15776873,.,-,Araport11,gene,.,"Dbxref=PMID:12068095,PMID:12805588,PMID:146972..."
316,AT1G65490,randCont,Chr1,24353645,24354645,.,+,Araport11,gene,.,"Dbxref=PMID:15894741,PMID:18650403,PMID:207364..."
317,AT2G30360,randCont,Chr2,12938834,12939834,.,-,Araport11,gene,.,"Alias=CIPK11,CBL-INTERACTING PROTEIN KINASE 11..."
318,AT5G56270,randCont,Chr5,22778613,22779613,.,+,Araport11,gene,.,"Alias=ATWRKY2,ARABIDOPSIS THALIANA WRKY DNA-BI..."


In [25]:
# Sanity check: show rows of `merged` whose chromosome is missing.
# None are expected, because the merge that produced `merged` is an inner join.
merged.loc[merged['chr'].isna()]

Unnamed: 0,gene,gene_type,chr,start,stop,dot,strand,source,type,dot2,details


### Reorder columns to BED format and sort by chromosome and start position

In [27]:
# Column order for the output tables. Selecting these columns also drops
# 'gene_type', which the merge with the gene list added.
bed_columns = ['chr', 'start', 'stop', 'gene', 'dot', 'strand', 'source', 'type', 'dot2', 'details']

filtered_proms = merged.loc[:, bed_columns]
filtered_proms2 = merged2.loc[:, bed_columns]

# Sort each table by chromosome, then by start coordinate.
# The original row index is kept, so it is no longer consecutive after sorting.
filtered_proms_sorted = filtered_proms.sort_values(['chr', 'start'])
filtered_proms_sorted2 = filtered_proms2.sort_values(['chr', 'start'])

In [19]:
# Display the column-reordered table built from `merged`, sorted by chr then start.
filtered_proms_sorted

Unnamed: 0,chr,start,stop,gene,dot,strand,source,type,dot2,details
144,Chr1,33171,34171,AT1G01050,.,-,Araport11,gene,.,"Alias=AtPPa1,pyrophosphorylase 1;Dbxref=PMID:1..."
94,Chr1,281760,282760,AT1G01780,.,+,Araport11,gene,.,"Dbxref=PMID:15642518,PMID:15659355,PMID:155317..."
32,Chr1,316090,317090,AT1G01910,.,-,Araport11,gene,.,"Dbxref=PMID:18315867,PMID:18775970,PMID:207364..."
238,Chr1,844251,845251,AT1G03400,.,-,Araport11,gene,.,"Dbxref=PMID:7579161,PMID:12805619,PMID:1761651..."
134,Chr1,892492,893492,AT1G03560,.,-,Araport11,gene,.,"Dbxref=PMID:16709272,PMID:16951057,locus:20208..."
...,...,...,...,...,...,...,...,...,...,...
173,Chr5,25155983,25156983,AT5G62650,.,-,Araport11,gene,.,Dbxref=locus:2172234;ID=AT5G62650;Name=AT5G626...
205,Chr5,25633209,25634209,AT5G64050,.,-,Araport11,gene,.,"Alias=ATERS,OVA3,OVULE ABORTION 3;Dbxref=PMID:..."
27,Chr5,25909014,25910014,AT5G64813,.,+,Araport11,gene,.,"Dbxref=PMID:17683937,PMID:23144185,locus:50500..."
230,Chr5,26467531,26468531,AT5G66250,.,+,Araport11,gene,.,"Dbxref=PMID:18650403,PMID:18775970,PMID:207364..."


### Write the filtered, sorted promoter tables to BED files

In [29]:
# Convert each sorted table to a BedTool and save it under data/genomes.
# Table built from the genes_overlap merge:
genes_overlap_bed = BedTool.from_dataframe(filtered_proms_sorted)
genes_overlap_bed.saveas('../../data/genomes/promoterandgenes_only_overlap_const_var.bed')

# Table built from the promoters_overlapping merge. This stays the last
# expression in the cell, so the saved BedTool is what the cell displays.
promoters_overlapping_bed = BedTool.from_dataframe(filtered_proms_sorted2)
promoters_overlapping_bed.saveas('../../data/genomes/promoters_overlapping__const_var.bed')

<BedTool(../../data/genomes/promoters_overlapping__const_var.bed)>