In [None]:
#try using https://daler.github.io/pybedtools/autodocs/pybedtools.bedtool.BedTool.window_maker.html

In [123]:
import os
import argparse
import pandas as pd
import numpy as np
import io
from pybedtools import BedTool

In [79]:
# Command-line interface used when this notebook is run as a script.
# NOTE(review): parse_args() reads sys.argv, so it presumably fails when run
# inside a Jupyter kernel - confirm. The cells further down use hard-coded
# values rather than `args`.
parser = argparse.ArgumentParser(description='rolling_window')
#parser.add_argument('directory_path', type=str, help='Location of base directory')
#better to use relative path
parser.add_argument('file_names', type=str, help='Name of folder and filenames for the promoters extracted')
parser.add_argument('promoter_bedfile', type=str, help='Input location of promoter bedfile')
parser.add_argument('motifs_bed', type=str, help='Input location of motifs bed file')
# help text previously duplicated the window_bed description
parser.add_argument('TFBS_coverage_bed', type=str, help='Output location of rolling window TFBS coverage bed file')
parser.add_argument('window_bed', type=str, help='Output location of rolling window bed file')
parser.add_argument('window_size', type=int, help='Size of the rolling window in bp')
parser.add_argument('step_size', type=int, help='Size of the window offset in bp')
args = parser.parse_args()


_StoreAction(option_strings=[], dest='window_gap', nargs=None, const=None, default=None, type=<class 'int'>, choices=None, help='Size of the gap between each rolling window in bp', metavar=None)

In [124]:
def flag_overlapping_proms(promoter_bed,output_bed):
    """Write a bed file listing the merged regions where promoters overlap.

    The promoters are merged with bedtools merge, and every merged region
    built from two or more distinct promoter names (column 4) is kept.

    Parameters
    ----------
    promoter_bed : str
        Path to the tab-separated promoter bed file; column 4 is used as the
        promoter name.
    output_bed : str
        Path of the tab-separated, header-less file to write. Its columns are
        chr, start, stop, number of distinct merged promoters, name of the
        first merged promoter, name of the last merged promoter.

    Returns
    -------
    None
    """
    #create bedtools object of promoters
    #bedtools merge expects input sorted by chromosome then start, so sort first
    proms_bed = BedTool(promoter_bed).sort()
    #c = columns to apply function to
    #o = count number of merged promoters, name the first and last promoter that were merged
    merged = proms_bed.merge(c=4, o=['count_distinct','first', 'last'])
    #read the merged intervals back in as a dataframe via an in-memory buffer
    cols = ['chr','start','stop', 'number_of_overlaps', 'first_overlap','second_overlap']
    with io.StringIO(str(merged)) as merged_buffer:
        overlapping = pd.read_table(merged_buffer, sep='\t', header=None)
    overlapping.columns = cols
    #select only features made of more than one promoter that were merged as overlapping
    overlapping_only = overlapping[overlapping.number_of_overlaps >= 2]
    overlapping_only.to_csv(output_bed,index=False,sep='\t',header=False)


In [125]:
# def window_splitold(promoter_bed, motifs_bed, output_bed, window_size, step_size):
#     #separate promoters into dfs by strand
#     proms_df = pd.read_table(promoter_bed, sep='\t', header=None)
#     cols1 = ['chr','start','stop','AGI','dot1','strand','source','type','dot2','attributes']
#     proms_df.columns = cols1
# #     proms_pos = proms_df[proms_df.strand == '+']
# #     proms_neg = proms_df[proms_df.strand == '-']
#     for i,data in proms_df.iterrows():
#         #range function is start, stop, step
#         for num in range(proms_df.loc[i, 'start'], proms_df.loc[i, 'start'] - window_size, step_size):
#             chunk = proms_df[num:num + window_size]
#             #assert len(chunk) == window_size
#             yield chunk
    
      

In [134]:
def window_split(promoter_bed, output_bed, window_size, step_size):
    """Split promoters into rolling windows and write them to a bed file.

    Positive-strand promoters are windowed directly with bedtools makewindows.
    Negative-strand promoters are first mirrored about their stop coordinate,
    windowed, and then mirrored back, so that window 1 begins at the stop
    coordinate of the promoter rather than at its start. Only windows whose
    length equals ``window_size`` are kept.

    Parameters
    ----------
    promoter_bed : str
        Path to a tab-separated, header-less promoter file with the ten
        columns chr, start, stop, AGI, dot1, strand, source, type, dot2,
        attributes.
    output_bed : str
        Path of the tab-separated, header-less file to write
        (columns chr, start, stop, window).
    window_size : int
        Width of each window in bp.
    step_size : int
        Offset in bp between the starts of consecutive windows.

    Returns
    -------
    pandas.DataFrame
        The windows that were written to ``output_bed``, sorted by chr and
        start.
    """
    #separate promoters into dfs by strand
    prom_cols = ['chr','start','stop','AGI','dot1','strand','source','type','dot2','attributes']
    proms_df = pd.read_table(promoter_bed, sep='\t', header=None)
    proms_df.columns = prom_cols
    proms_pos = proms_df[proms_df.strand == '+']
    proms_neg = proms_df[proms_df.strand == '-']

    #fool bedtools makewindows so that the first window of a negative strand
    #promoter is made from its stop: [start, stop] becomes [stop, stop + length]
    proms_neg_copy = proms_neg.copy()
    proms_neg_copy['length'] = proms_neg.stop-proms_neg.start
    proms_neg_copy['altered_start'] = proms_neg.stop
    proms_neg_copy['altered_stop'] = proms_neg.start + 2*proms_neg_copy.length
    proms_changed = proms_neg_copy[['chr','altered_start','altered_stop','AGI','dot1','strand','source','type','dot2','attributes']]

    pos_buffer = io.StringIO()
    neg_buffer = io.StringIO()
    window_pos_buffer = io.StringIO()
    window_neg_buffer = io.StringIO()
    try:
        #write to temporary bed buffers
        proms_pos.to_csv(pos_buffer,index=False,sep='\t',header=None)
        proms_changed.to_csv(neg_buffer,index=False,sep='\t',header=None)
        pos_buffer.seek(0)
        neg_buffer.seek(0)

        pos_proms = BedTool(pos_buffer)
        neg_proms = BedTool(neg_buffer)
        #make the sliding windows
        #w = window size
        #s = step size (taken from the step_size argument, no longer fixed at 50)
        #i = srcwinnum (use source interval name with the window number)
        #windows at the end of a promoter can be shorter than window_size;
        #those are filtered out further down
        windows_pos = BedTool().window_maker(b=pos_proms, w=window_size, s=step_size, i='srcwinnum')
        windows_neg = BedTool().window_maker(b=neg_proms, w=window_size, s=step_size, i='srcwinnum')

        #write bedfile_like buffers
        window_pos_buffer.write(str(windows_pos))
        window_pos_buffer.seek(0)
        window_neg_buffer.write(str(windows_neg))
        window_neg_buffer.seek(0)
        #create dfs
        window_pos_df = pd.read_table(window_pos_buffer, sep='\t', header=None)
        window_neg_df = pd.read_table(window_neg_buffer, sep='\t', header=None)
    finally:
        #release the in-memory buffers even if bedtools or pandas raised
        pos_buffer.close()
        neg_buffer.close()
        window_pos_buffer.close()
        window_neg_buffer.close()

    cols = ['chr', 'start','stop','window']
    window_pos_df.columns = cols
    window_neg_df.columns = cols
    window_neg_df = window_neg_df.astype({'chr':'int','start': 'int', 'stop':'int', 'window':'str'})

    #reverse the start/stop changes that fooled bedtools makewindows
    neg_df_corrected = window_neg_df.copy()
    #Make AGI column: the window name is '<AGI>_<window number>'
    neg_df_corrected = neg_df_corrected.assign(AGI=neg_df_corrected.window.str.extract(r'(.*?)\_'))

    #Merge with proms_neg using AGI code to recover the original promoter stop
    merged = pd.merge(neg_df_corrected, proms_neg, on='AGI', how='left',suffixes=('','_wholeprom'))

    #distance of the mirrored window end from the original promoter stop
    merged['distance'] = merged.stop-merged.stop_wholeprom
    #create window length column
    merged['window_length'] = merged.stop-merged.start

    #mirror back about the original stop: [ws, we] becomes [2*stop - we, 2*stop - ws]
    merged['correct_start'] = merged.stop - 2*merged.distance
    merged['correct_stop'] = merged.correct_start + merged.window_length

    #keep the corrected coordinates under the standard column names
    merged = merged[['chr','correct_start','correct_stop','window']].rename(
        columns={'correct_start':'start', 'correct_stop':'stop'})

    #Merge positive and negative strand windows
    merged_all = pd.merge(window_pos_df,merged, how='outer')

    #filter lengths so only full-size windows (length == window_size) remain
    window_lengths = merged_all.assign(length=((merged_all.stop) - merged_all.start))
    removed = window_lengths.loc[window_lengths.length == window_size]

    #sort by chr, start
    sorted_removed = removed.sort_values(['chr','start']).reset_index(drop=True)
    #remove length column
    sorted_removed = sorted_removed[['chr','start','stop','window']]
    #write to bed file
    sorted_removed.to_csv(output_bed,index=False,sep='\t',header=None)
    return sorted_removed

In [127]:
# Hard-coded run parameters used by the cells below.
file_names = 'non-overlapping_includingbidirectional_all_genes_newannotation'
# relative path to the base directory that contains data/output
directory_path = '../..'
# every path is derived from directory_path so that changing it moves them all
promoter_bedfile = f'{directory_path}/data/output/{file_names}/FIMO/promoters_5UTR.bed'
#motifs_bed = f'../../data/output/{file_names}/FIMO/promoters_5UTR_motifs.bed'
TFBS_coverage_bed = f'{directory_path}/data/output/{file_names}/rolling_window/TFBS_coverage_rw/promoters_5UTR_TFBS_coverage_rw.bed'
window_bed = f'{directory_path}/data/output/{file_names}/rolling_window/promoters_5UTR_windows.bed'
overlapping_proms = f'{directory_path}/data/output/{file_names}/overlapping_promoters.bed'
# rolling window width and offset between consecutive windows, in bp
window_size = 100
step_size = 50

In [6]:
#make directory for the output files to be exported to
#dirName = f'{args.directory_path}/data/output/{args.file_names}'
dirName = f'{directory_path}/data/output/{file_names}'
try:
    # Create target directory. makedirs also creates any missing parent
    # directories, where os.mkdir would raise FileNotFoundError; it still
    # raises FileExistsError when the directory is already there.
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation  already exists


In [7]:
#make directory for the rolling window output files to be exported to
#dirName = f'{args.directory_path}/data/output/{args.file_names}'
dirName = f'{directory_path}/data/output/{file_names}/rolling_window'
try:
    # Create target directory. makedirs also creates any missing parent
    # directories, where os.mkdir would raise FileNotFoundError; it still
    # raises FileExistsError when the directory is already there.
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/rolling_window  already exists


Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/rolling_window/TFBS_coverage_rw  already exists


In [8]:
# write the list of overlapping promoters to the overlapping_proms bed file
flag_overlapping_proms(
    promoter_bed=promoter_bedfile,
    output_bed=overlapping_proms,
)

In [135]:
# split the promoters into rolling windows and write them to window_bed
proms = window_split(
    promoter_bed=promoter_bedfile,
    output_bed=window_bed,
    window_size=window_size,
    step_size=step_size,
)

In [136]:
# Show the object returned by window_split in the previous cell.
# NOTE(review): the saved output below contains columns (AGI, distance,
# correct_start, ...) that window_split does not write to its output bed, so
# it appears to come from an earlier debugging revision - re-run to refresh.
proms

Unnamed: 0,chr,start,stop,window,AGI,chr_wholeprom,start_wholeprom,stop_wholeprom,dot1,strand,source,type,dot2,attributes,distance,window_length,correct_start,correct_stop
0,1,10130,10230,AT1G01020_1,AT1G01020,1,8666,10130,.,-,araport11,promoter,.,ID=gene:AT1G01020;Name=ARV1;biotype=protein_co...,100,100,10030,10130
1,1,10180,10280,AT1G01020_2,AT1G01020,1,8666,10130,.,-,araport11,promoter,.,ID=gene:AT1G01020;Name=ARV1;biotype=protein_co...,150,100,9980,10080
2,1,10230,10330,AT1G01020_3,AT1G01020,1,8666,10130,.,-,araport11,promoter,.,ID=gene:AT1G01020;Name=ARV1;biotype=protein_co...,200,100,9930,10030
3,1,10280,10380,AT1G01020_4,AT1G01020,1,8666,10130,.,-,araport11,promoter,.,ID=gene:AT1G01020;Name=ARV1;biotype=protein_co...,250,100,9880,9980
4,1,10330,10430,AT1G01020_5,AT1G01020,1,8666,10130,.,-,araport11,promoter,.,ID=gene:AT1G01020;Name=ARV1;biotype=protein_co...,300,100,9830,9930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201035,5,26972568,26972668,AT5G67640_19,AT5G67640,5,26970548,26971668,.,-,araport11,promoter,.,ID=gene:AT5G67640;biotype=protein_coding;descr...,1000,100,26970668,26970768
201036,5,26972618,26972718,AT5G67640_20,AT5G67640,5,26970548,26971668,.,-,araport11,promoter,.,ID=gene:AT5G67640;biotype=protein_coding;descr...,1050,100,26970618,26970718
201037,5,26972668,26972768,AT5G67640_21,AT5G67640,5,26970548,26971668,.,-,araport11,promoter,.,ID=gene:AT5G67640;biotype=protein_coding;descr...,1100,100,26970568,26970668
201038,5,26972718,26972788,AT5G67640_22,AT5G67640,5,26970548,26971668,.,-,araport11,promoter,.,ID=gene:AT5G67640;biotype=protein_coding;descr...,1120,70,26970548,26970618
