In [2]:
import pandas as pd
from pybedtools import BedTool
import numpy as np

## variables

In [3]:
fimo_file = "../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/FIMO/output/promoters_5UTR_FIMO/fimo.tsv"
promoter_bedfile = "../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/promoters_5UTR.bed"
motifs_bed = "../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/promoters_5UTR_motifs.bed"

## import data

In [4]:
def fimo_qfilter(fimo_file, q_value):
    """this uses a meme-suite version 5 fimo.tsv file, filters by a q-value, and returns a pandas df"""
    fimo = pd.read_table(fimo_file, sep='\t', engine='python')
    #rename sequence column to just the AGI code
    fimo.sequence_name = fimo.sequence_name.str.extract(r'(.*?)\:')
    #filter
    fimo_qfilter = fimo[fimo['q-value'] <= q_value]    
    return fimo_qfilter
    

In [5]:
def fimo2bed(filtered_fimo_df, promoters_bed, output_bed):
    """This function creates a bed file using fimo.tsv motif file, and the promoter.bed file (chromosome number is used from this). It sorts the bedfile by chromosome then by start"""
    promoters = pd.read_table(promoters_bed, sep='\t', header=None) #read in promoter bed file
    #add column names
    cols = ['chr', 'start', 'stop', 'gene', 'dot', 'strand', 'source', 'type', 'dot2', 'details'] 
    promoters.columns = cols
    #merge promoters.bed with the fimo motif file
    merged = pd.merge(filtered_fimo_df, promoters, left_on='sequence_name', right_on='gene')
    #add motif start position to the promoter start position, and minus 1 (bed file is 0-based), and motif end to promoter start pos (bed file end coord is non-inclusive). Then format with no decimal place
    merged['correct_start'] = (merged['start_x'] + merged['start_y'] -1).astype(np.int64)
    merged['correct_stop'] = (merged['stop_x'] + merged['start_y']).astype(np.int64)
    #create motifs df in bed file column order
    motifs_df = merged.loc[:, ['chr','correct_start','correct_stop', 'motif_id', 'score', 'strand_x', 'sequence_name', 'p-value', 'q-value', 'matched_sequence']]
    #sort the df by chromosome then by motif start position
    motifs_df_sorted = motifs_df.sort_values(['chr','correct_start'])
    #create motifs bed file
    motifs = BedTool.from_dataframe(motifs_df_sorted).saveas(output_bed)

    

In [11]:
fimo = pd.read_table(fimo_file, sep='\t')

In [12]:
fimo.head()

Unnamed: 0,motif_id,motif_alt_id,sequence_name,start,stop,strand,score,p-value,q-value,matched_sequence
0,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT5G14390::5:4639575-4640835,1.0,30.0,-,61.8974,9.83e-20,5.8e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
1,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G32530::4:15694958-15696283,2.0,31.0,-,61.8974,9.83e-20,5.8e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
2,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT5G14390::5:4639575-4640835,3.0,32.0,-,61.8974,9.83e-20,5.8e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
3,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G32530::4:15694958-15696283,4.0,33.0,-,61.8974,9.83e-20,5.8e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
4,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G32530::4:15694958-15696283,6.0,35.0,-,61.8974,9.83e-20,5.8e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG


In [13]:
fimo.sequence_name

0            AT5G14390::5:4639575-4640835
1          AT4G32530::4:15694958-15696283
2            AT5G14390::5:4639575-4640835
3          AT4G32530::4:15694958-15696283
4          AT4G32530::4:15694958-15696283
                        ...              
8661525    AT5G57565::5:23309283-23310871
8661526      AT1G20690::1:7173437-7175085
8661527                               NaN
8661528                               NaN
8661529                               NaN
Name: sequence_name, Length: 8661530, dtype: object

In [14]:
fimo.sequence_name = fimo.sequence_name.str.extract(r'(.*?)\:')

In [15]:
fimo

Unnamed: 0,motif_id,motif_alt_id,sequence_name,start,stop,strand,score,p-value,q-value,matched_sequence
0,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT5G14390,1.0,30.0,-,61.89740,9.830000e-20,5.800000e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
1,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G32530,2.0,31.0,-,61.89740,9.830000e-20,5.800000e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
2,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT5G14390,3.0,32.0,-,61.89740,9.830000e-20,5.800000e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
3,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G32530,4.0,33.0,-,61.89740,9.830000e-20,5.800000e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
4,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G32530,6.0,35.0,-,61.89740,9.830000e-20,5.800000e-15,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
...,...,...,...,...,...,...,...,...,...,...
8661525,AP2EREBP_tnt.ERF2_colamp_a_m1,MEME,AT5G57565,1338.0,1358.0,+,4.77778,1.000000e-04,2.940000e-01,CTGTGGCCAAATCATCCTCCA
8661526,AP2EREBP_tnt.ERF2_colamp_a_m1,MEME,AT1G20690,1399.0,1419.0,+,4.77778,1.000000e-04,2.940000e-01,CGGCTCATGCTCCGGCCTTCT
8661527,# FIMO (Find Individual Motif Occurrences): Ve...,,,,,,,,,
8661528,# The format of this file is described at http...,,,,,,,,,


In [11]:
len(fimo)

10803002

In [5]:
#filter qvalues below
fimo_qfilter = fimo[fimo['q-value'] <= 0.05]
fimo_qfilter.head()


Unnamed: 0,motif_id,motif_alt_id,sequence_name,start,stop,strand,score,p-value,q-value,matched_sequence
0,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G38760,1.0,30.0,+,61.3544,1.4599999999999998e-19,1.35e-14,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
1,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT1G33350,2.0,31.0,+,61.3544,1.4599999999999998e-19,1.35e-14,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
2,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT3G30580,2.0,31.0,+,61.3544,1.4599999999999998e-19,1.35e-14,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
3,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT3G47600,2.0,31.0,+,61.3544,1.4599999999999998e-19,1.35e-14,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG
4,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT3G55960,2.0,31.0,+,61.3544,1.4599999999999998e-19,1.35e-14,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG


In [12]:
len(fimo_qfilter)

1346100

In [6]:
#need to read in the promoter gff file to find chromosome no. of each promoter
promoters = pd.read_table(promoter_bedfile, sep='\t', header=None)
cols = ['chr', 'start', 'stop', 'gene', 'dot', 'strand', 'source', 'type', 'dot2', 'details']
promoters.columns = cols
promoters.head()

Unnamed: 0,chr,start,stop,gene,dot,strand,source,type,dot2,details
0,1,2630,3630,AT1G01010,.,+,Araport11,gene,.,"Alias=ANAC001,NAC domain containing protein 1;..."
1,1,9130,10130,AT1G01020,.,-,Araport11,gene,.,"Dbxref=PMID:15010618,locus:2200940;ID=AT1G0102..."
2,1,10100,11100,AT1G03987,.,+,Araport11,gene,.,ID=AT1G03987;Name=AT1G03987;locus_type=long_no...
3,1,13714,14714,AT1G01030,.,-,Araport11,gene,.,"Dbxref=PMID:11118137,PMID:15010618,PMID:166036..."
4,1,22120,23120,AT1G01040,.,+,Araport11,gene,.,"Alias=ASU1,ABNORMAL SUSPENSOR 1,ATDCL1,DICER-L..."


In [7]:
merged = pd.merge(fimo_qfilter, promoters, left_on='sequence_name', right_on='gene')
merged_outer = pd.merge(fimo_qfilter, promoters, left_on='sequence_name', right_on='gene', how='outer')
merged.head()

Unnamed: 0,motif_id,motif_alt_id,sequence_name,start_x,stop_x,strand_x,score,p-value,q-value,matched_sequence,chr,start_y,stop_y,gene,dot,strand_y,source,type,dot2,details
0,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G38760,1.0,30.0,+,61.3544,1.4599999999999998e-19,1.35e-14,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG,4,18096218,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...
1,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G38760,3.0,32.0,+,61.3544,1.4599999999999998e-19,1.35e-14,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG,4,18096218,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...
2,BBRBPC_tnt.BPC5_colamp_a_m1,MEME,AT4G38760,2.0,30.0,+,49.186,4.37e-19,3.86e-14,GAGAGAGAGAGAGAGAGAGAGAGAGAGAG,4,18096218,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...
3,BBRBPC_tnt.BPC5_colamp_a_m1,MEME,AT4G38760,4.0,32.0,+,49.186,4.37e-19,3.86e-14,GAGAGAGAGAGAGAGAGAGAGAGAGAGAG,4,18096218,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...
4,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G38760,5.0,34.0,+,55.443,2.63e-18,2.31e-13,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAT,4,18096218,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...


In [22]:
len(merged)

1346100

In [25]:
len(merged)

1346710

In [28]:
#these promoters had no motifs found in them - 610 promoters
no_motifs = merged[merged['sequence_name'].isnull()]

In [29]:
no_motifs.head()

Unnamed: 0,motif_id,motif_alt_id,sequence_name,start_x,stop_x,strand_x,score,p-value,q-value,matched_sequence,chr,start_y,stop_y,gene,dot,strand_y,source,type,dot2,details
1346100,,,,,,,,,,,1,201102,202102,AT1G01560,.,+,Araport11,gene,.,"Alias=ATMPK11,MAP kinase 11;Dbxref=PMID:115441..."
1346101,,,,,,,,,,,1,1914895,1915895,AT1G06250,.,-,Araport11,gene,.,"Dbxref=PMID:15130548,PMID:18433503,PMID:235053..."
1346102,,,,,,,,,,,1,1945745,1946745,AT1G06390,.,+,Araport11,gene,.,"Alias=ATGSK1,ATSK2-3,A. THALIANA SHAGGY-LIKE K..."
1346103,,,,,,,,,,,1,2011109,2012109,AT1G04483,.,-,Araport11,gene,.,ID=AT1G04483;Name=AT1G04483;Note=Natural antis...
1346104,,,,,,,,,,,1,2167560,2168560,AT1G07070,.,+,Araport11,gene,.,"Dbxref=PMID:15310832,PMID:16297073,PMID:179342..."


In [None]:
#not all promoters had motifs found in them!


In [8]:
#need to add motif start to the promoter start -1, and motif end to promoter start -1
merged['correct_start'] = merged['start_x'] + merged['start_y'] -1
merged['correct_stop'] = merged['stop_x'] + merged['start_y'] -1

merged.head()

Unnamed: 0,motif_id,motif_alt_id,sequence_name,start_x,stop_x,strand_x,score,p-value,q-value,matched_sequence,...,stop_y,gene,dot,strand_y,source,type,dot2,details,correct_start,correct_stop
0,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G38760,1.0,30.0,+,61.3544,1.4599999999999998e-19,1.35e-14,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG,...,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...,18096218.0,18096247.0
1,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G38760,3.0,32.0,+,61.3544,1.4599999999999998e-19,1.35e-14,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG,...,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...,18096220.0,18096249.0
2,BBRBPC_tnt.BPC5_colamp_a_m1,MEME,AT4G38760,2.0,30.0,+,49.186,4.37e-19,3.86e-14,GAGAGAGAGAGAGAGAGAGAGAGAGAGAG,...,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...,18096219.0,18096247.0
3,BBRBPC_tnt.BPC5_colamp_a_m1,MEME,AT4G38760,4.0,32.0,+,49.186,4.37e-19,3.86e-14,GAGAGAGAGAGAGAGAGAGAGAGAGAGAG,...,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...,18096221.0,18096249.0
4,BBRBPC_tnt.BPC5_col_a_m1,MEME,AT4G38760,5.0,34.0,+,55.443,2.63e-18,2.31e-13,AGAGAGAGAGAGAGAGAGAGAGAGAGAGAT,...,18097218,AT4G38760,.,-,Araport11,gene,.,Dbxref=locus:2141767;ID=AT4G38760;Name=AT4G387...,18096222.0,18096251.0


In [9]:
#create motifs df
motifs_df = merged.loc[:, ['chr','correct_start','correct_stop', 'motif_id', 'score', 'strand_x',]]
motifs_df.head()

Unnamed: 0,chr,correct_start,correct_stop,motif_id,score,strand_x
0,4,18096218.0,18096247.0,BBRBPC_tnt.BPC5_col_a_m1,61.3544,+
1,4,18096220.0,18096249.0,BBRBPC_tnt.BPC5_col_a_m1,61.3544,+
2,4,18096219.0,18096247.0,BBRBPC_tnt.BPC5_colamp_a_m1,49.186,+
3,4,18096221.0,18096249.0,BBRBPC_tnt.BPC5_colamp_a_m1,49.186,+
4,4,18096222.0,18096251.0,BBRBPC_tnt.BPC5_col_a_m1,55.443,+


In [14]:
#sort by chromosome then by start position
motifs_df_sorted = motifs_df.sort_values(['chr','correct_start'])

In [15]:
#create motifs bed file
motifs = BedTool.from_dataframe(motifs_df_sorted).saveas('../../data/FIMO/motifs.bed')

In [9]:
#same again using the functions
filtered_fimo = fimo_qfilter(fimo_file, 0.05)
fimo2bed(filtered_fimo, promoter_bedfile, motifs_bed)

PermissionError: [Errno 13] Permission denied

In [2]:
fimo_file = "../../data/FIMO/output/responsivepromoters_FIMO/fimo.tsv"
promoter_bedfile = "../../data/FIMO/responsivepromoters.bed"
motifs_bed = "../../data/FIMO/responsivepromoters_motifs.bed"