In [None]:
#try using https://daler.github.io/pybedtools/autodocs/pybedtools.bedtool.BedTool.window_maker.html

In [110]:
import os
import argparse
import pandas as pd
import numpy as np
import io
from pybedtools import BedTool

In [79]:
parser = argparse.ArgumentParser(description='TFBS_coverage_rw')
parser.add_argument('directory_path', type=str, help='Location of base directory')
parser.add_argument('file_names', type=str, help='Name of folder and filenames for the promoters extracted')
parser.add_argument('promoter_bedfile', type=str, help='Input location of promoter bedfile')
parser.add_argument('motifs_bed', type=str, help='Input location of motifs bed file')
parser.add_argument('TFBS_coverage_bed', type=str, help='Output location of rolling window bed file')
parser.add_argument('window_bed', type=str, help='Output location of rolling window bed file')
parser.add_argument('window_size', type=int, help='Size of the rolling window in bp')
parser.add_argument('step_size', type=int, help='Size of the window offset in bp')


_StoreAction(option_strings=[], dest='window_gap', nargs=None, const=None, default=None, type=<class 'int'>, choices=None, help='Size of the gap between each rolling window in bp', metavar=None)

In [2]:
def flag_overlapping_proms(promoter_bed,output_bed):
    #read in promoters
    proms_df = pd.read_table(promoter_bed, sep='\t', header=None)
    cols = ['chr','start','stop','AGI','dot1','strand','source','type','dot2','attributes']
    proms_df.columns = cols
    #create bedtools object of promoters
    proms_bed = BedTool(promoter_bed)
    #c = columns to apply function to
    #o = count number of merged promoters, name the first and last promoter that were merged
    merged = proms_bed.merge(c=4, o=['count_distinct','first', 'last'])
    #write to bufer
    merged_buffer = io.StringIO()
    merged_buffer.write(str(merged))
    merged_buffer.seek(0)
    #read as dataframe
    overlapping = pd.read_table(merged_buffer, sep='\t', header=None)
    cols2 = ['chr','start','stop', 'number_of_overlaps', 'first_overlap','second_overlap']
    overlapping.columns = cols2
    #select only features made of more than one promtoer that were merged as overlapping
    overlapping_only = overlapping[overlapping.number_of_overlaps >= 2]
    overlapping_only.to_csv(output_bed,index=False,sep='\t',header=False)  


In [64]:
def window_splitold(promoter_bed, motifs_bed, output_bed, window_size, step_size):
    #separate promoters into dfs by strand
    proms_df = pd.read_table(promoter_bed, sep='\t', header=None)
    cols1 = ['chr','start','stop','AGI','dot1','strand','source','type','dot2','attributes']
    proms_df.columns = cols1
#     proms_pos = proms_df[proms_df.strand == '+']
#     proms_neg = proms_df[proms_df.strand == '-']
    for i,data in proms_df.iterrows():
        #range function is start, stop, step
        for num in range(proms_df.loc[i, 'start'], proms_df.loc[i, 'start'] - window_size, step_size):
            chunk = proms_df[num:num + window_size]
            #assert len(chunk) == window_size
            yield chunk
    
      

In [65]:
proms = window_split(promoter_bedfile,motifs_bed, window_bed, window_size, step_size)
proms

<generator object window_split at 0x7fbb8e105dd0>

In [66]:
for chunk in proms:
    print(chunk)

In [181]:
def window_split(promoter_bed, motifs_bed, output_bed, window_size, step_size):
    """function to split promoters into rolling windows"""
    #separate promoters into dfs by strand
    proms_df = pd.read_table(promoter_bed, sep='\t', header=None)
    cols1 = ['chr','start','stop','AGI','dot1','strand','source','type','dot2','attributes']
    proms_df.columns = cols1
    proms_pos = proms_df[proms_df.strand == '+']
    proms_neg = proms_df[proms_df.strand == '-']
    #fool bedtools makewindow so that the first window is actually made from the ATG for the negative strand
    proms_neg_copy = proms_neg.copy()
    proms_neg_copy['length'] = proms_neg.stop-proms_neg.start
    proms_neg_copy['altered_start'] = proms_neg.stop - 1
    proms_neg_copy['altered_stop'] = proms_neg.start + 2*proms_neg_copy.length-1
    proms_changed = proms_neg_copy[['chr','altered_start','altered_stop','AGI','dot1','strand','source','type','dot2','attributes']]

   
    #write to temporary bed buffers
    pos_buffer = io.StringIO()
    neg_buffer = io.StringIO()
    proms_pos.to_csv(pos_buffer,index=False,sep='\t',header=None)
    proms_changed.to_csv(neg_buffer,index=False,sep='\t',header=None)
    pos_buffer.seek(0)
    neg_buffer.seek(0)
    
    pos_proms = BedTool(pos_buffer)
    neg_proms = BedTool(neg_buffer)
    #make the sliding windows
    #w = window size
    #s = step size
    #n = no. of windows - note there seems to be a bug, window size is one below what you put
    #i = srcwinnum (use source interval name with the window number)
    #reverse = reverse numbering of windows in output - used this for negative strand promoters so window 1 starts at the ATG
    #note - the windows in the reverse strand get cut off if I use a step_size bigger than 1, so I will manually remove windows of the incorrect step size
    windows_pos = BedTool().window_maker(b=pos_proms, w=window_size,s=50, i='srcwinnum')
    windows_neg = BedTool().window_maker(b=neg_proms, w=window_size,s=50, i='srcwinnum')#,reverse=True
    #make buffer
    window_pos_buffer = io.StringIO()
    window_neg_buffer = io.StringIO()
    #write bedfile_like buffer
    window_pos_buffer.write(str(windows_pos))
    window_pos_buffer.seek(0)
    window_neg_buffer.write(str(windows_neg))
    window_neg_buffer.seek(0)
    #create df
    window_pos_df = pd.read_table(window_pos_buffer, sep='\t', header=None)
    window_neg_df = pd.read_table(window_neg_buffer, sep='\t', header=None)
    cols = ['chr', 'start','stop','window']
    window_pos_df.columns = cols
    window_neg_df.columns = cols
    window_neg_df = window_neg_df.astype({'chr':'int','start': 'int', 'stop':'int', 'window':'str'})
    
    #reverse the start/stop changes that fooled bedtools makewindow
    neg_df_corrected = window_neg_df.copy()
    neg_df_corrected['distance'] = window_neg_df.start ???????? ### need to merge this df with proms_neg using AGI code. then make distance the original stop - 1 to the chunk stop
    neg_df_corrected['correct_start'] = window_neg_df.stop - 2*neg_df_corrected.distance + 1
    neg_df_corrected['correct_stop'] = window_neg_df.start + 1
    
    neg_df_corrected_renamed = neg_df_corrected[['chr','correct_start','correct_stop','window']]
    return neg_df_corrected_renamed

    
    
    
    
    
    
    
#     #Merge positive and negative strand windows
#     merged = pd.merge(window_pos_df,window_neg_df, how='outer')
#     merged.to_csv(output_bed,index=False,sep='\t',header=None)
#     return merged
   
    
#     #filter lengths so they are only = 100bp
#     window_lengths = merged.copy()
#     window_lengths =  merged.assign(length=((merged.stop-1) - merged.start))
#     removed= window_lengths.loc[window_lengths.length == 100]
#     #sort by chr, start
#     sorted_removed = removed.sort_values(['chr','start']).reset_index(drop=True)
#     #write to bed file
#     sorted_removed.to_csv(output_bed,index=False,sep='\t',header=None)
#     pos_buffer.close()
#     neg_buffer.close()
#     window_pos_buffer.close()
#     window_neg_buffer.close()
#     return sorted_removed

In [182]:
proms = window_split(promoter_bedfile,motifs_bed, window_bed, window_size, step_size)
proms

Unnamed: 0,chr,correct_start,correct_stop,window
0,1,,10130,AT1G01020_1
1,1,7352.0,10180,AT1G01020_2
2,1,6782.0,10230,AT1G01020_3
3,1,7616.0,10280,AT1G01020_4
4,1,8150.0,10330,AT1G01020_5
...,...,...,...,...
201035,5,,26972568,AT5G67640_19
201036,5,,26972618,AT5G67640_20
201037,5,,26972668,AT5G67640_21
201038,5,,26972718,AT5G67640_22


In [173]:
proms

Unnamed: 0,chr,start,stop,AGI,dot1,strand,source,type,dot2,attributes
1,1,8666,10130,AT1G01020,.,-,araport11,promoter,.,ID=gene:AT1G01020;Name=ARV1;biotype=protein_co...
2,1,12940,14714,AT1G01030,.,-,araport11,promoter,.,ID=gene:AT1G01030;Name=NGA3;biotype=protein_co...
3,1,37061,38443,AT1G01060,.,-,araport11,promoter,.,ID=gene:AT1G01060;Name=LHY;biotype=protein_cod...
4,1,40877,42017,AT1G01070,.,-,araport11,promoter,.,ID=gene:AT1G01070;biotype=protein_coding;descr...
5,1,46789,47233,AT1G01080,.,-,araport11,promoter,.,ID=gene:AT1G01080;biotype=protein_coding;descr...
...,...,...,...,...,...,...,...,...,...,...
20231,5,26957073,26957914,AT5G67580,.,-,araport11,promoter,.,ID=gene:AT5G67580;Name=TRB2;biotype=protein_co...
20232,5,26963614,26964550,AT5G67610,.,-,araport11,promoter,.,ID=gene:AT5G67610;biotype=protein_coding;descr...
20233,5,26965720,26967010,AT5G67620,.,-,araport11,promoter,.,ID=gene:AT5G67620;biotype=protein_coding;descr...
20234,5,26969306,26969515,AT5G67630,.,-,araport11,promoter,.,ID=gene:AT5G67630;biotype=protein_coding;descr...


In [171]:
proms

Unnamed: 0,chr,start,stop,window,length,correct_start,correct_stop
0,1,10129,10229,AT1G01020_1,100,10030,10130
1,1,10179,10279,AT1G01020_2,100,10080,10180
2,1,10229,10329,AT1G01020_3,100,10130,10230
3,1,10279,10379,AT1G01020_4,100,10180,10280
4,1,10329,10429,AT1G01020_5,100,10230,10330
...,...,...,...,...,...,...,...
201035,5,26972567,26972667,AT5G67640_19,100,26972468,26972568
201036,5,26972617,26972717,AT5G67640_20,100,26972518,26972618
201037,5,26972667,26972767,AT5G67640_21,100,26972568,26972668
201038,5,26972717,26972787,AT5G67640_22,70,26972648,26972718


In [30]:
file_names = 'non-overlapping_includingbidirectional_all_genes_newannotation'
directory_path = '../..'
promoter_bedfile = f'../../data/output/{file_names}/FIMO/promoters_5UTR.bed'
motifs_bed = f'../../data/output/{file_names}/FIMO/promoters_5UTR_motifs.bed'
TFBS_coverage_bed = f'{directory_path}/data/output/{file_names}/rolling_window/TFBS_coverage_rw/promoters_5UTR_TFBS_coverage_rw.bed'
window_bed = f'{directory_path}/data/output/{file_names}/rolling_window/promoters_5UTR_windows.bed'
overlapping_proms = f'{directory_path}/data/output/{file_names}/overlapping_promoters.bed'
window_size = 100
step_size = 50

In [5]:
#make directory for the output files to be exported to
#dirName = f'{args.directory_path}/data/output/{args.file_names}'
dirName = f'{directory_path}/data/output/{file_names}'
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " created") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation  already exists


In [6]:
#make directory for the output files to be exported to
#dirName = f'{args.directory_path}/data/output/{args.file_names}'
dirName = f'{directory_path}/data/output/{file_names}/rolling_window'
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " created") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/rolling_window  already exists


In [7]:
#make directory for the output files to be exported to
#dirName = f'{args.directory_path}/data/output/{args.file_names}'
dirName = f'{directory_path}/data/output/{file_names}/rolling_window/TFBS_coverage_rw'
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " created") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/rolling_window/TFBS_coverage_rw  already exists


In [8]:
flag_overlapping_proms(promoter_bedfile,overlapping_proms)

In [15]:
proms = window_split(promoter_bedfile,motifs_bed, window_bed, window_size, step_size)

In [16]:
proms

Unnamed: 0,chr,start,stop,window
0,1,2630,2730,AT1G01010_1
1,1,2680,2780,AT1G01010_2
2,1,2730,2830,AT1G01010_3
3,1,2780,2880,AT1G01010_4
4,1,2830,2930,AT1G01010_5
...,...,...,...,...
401442,5,26971448,26971548,AT5G67640_5
401443,5,26971498,26971598,AT5G67640_4
401444,5,26971548,26971648,AT5G67640_3
401445,5,26971598,26971668,AT5G67640_2
