In [1]:
import pandas as pd
from pybedtools import BedTool
import argparse
import numpy as np

In [2]:
# parser = argparse.ArgumentParser(description='filter_mappedmotifs_by_geneID')
# parser.add_argument('mapped_motifs', type=str, help='Input location of mapped motif file')
# parser.add_argument('promoter_AGI', type=str, help='Input location of list of promoter AGIs csv file')
# parser.add_argument('TF_AGI', type=str, help='Input location of list of TF AGIs csv file')
# parser.add_argument('output_mappedmotifs', type=str, help='Output location of mapped_motifs file ')
# parser.add_argument('output_csv', type=str, help='Output location of csv file with TF and targets')
# args = parser.parse_args()

In [3]:
# Name of the output sub-directory for this promoter set; every path below is built from it.
file_names = 'non-overlapping_includingbidirectional_all_genes_newannotation_3KB'

# ---- inputs ----
# Mapped motifs file from the FIMO scan (q0_05 variant in use; q1 variant left disabled).
mapped_motifs_q0_05 = f'../../data/output/{file_names}/FIMO/promoters_5UTR_motifs_mapped_q0_05.bed'
#mapped_motifs_q1 = f'../../data/output/{file_names}/FIMO/promoters_5UTR_motifs_mapped_q1.bed'
# List of promoter (target gene) AGIs of interest (4_TGs list in use; 387_TGs list left disabled).
promoter_AGI = f'../../data/output/{file_names}/TG_AGI_4_TGs.csv'
#promoter_AGI = f'../../data/output/{file_names}/TG_AGI_387_TGs.csv'
# List of transcription factor AGIs of interest (disabled: no TF filtering is applied in this run).
#TF_AGI = f'../../data/output/{file_names}/TF_AGI_458_TFs.csv'

# ---- outputs ----
# csv listing the TFs and their targets.
output_csv_q0_05 = f'../../data/output/{file_names}/FIMO/filtered_promoters_5UTR_q0_05.csv'
#output_csv_q1 = f'../../data/output/{file_names}/FIMO/filtered_promoters_5UTR_q1.csv'
# Filtered mapped motifs file.
output_mappedmotifs_q0_05 = f'../../data/output/{file_names}/FIMO/filtered_promoters_5UTR_motifs_mapped_q0_05.bed'
#output_mappedmotifs_q1 = f'../../data/output/{file_names}/FIMO/filtered_promoters_5UTR_motifs_mapped_q1.bed'

In [4]:
def filter_mapped_motifs(mapped_motifs,promoter_AGI,output_mappedmotifs,output_csv,TF_AGI=None):
    """Filter a FIMO mapped motifs file to the promoters (and optionally TFs) of interest.

    Two files are written:

    * ``output_csv``: comma-separated with header ``TF_AGI,interaction_type,TG_AGI``,
      sorted by TF_AGI then TG_AGI. There is one row per retained motif hit, so a
      TF/target pair is repeated when several hits link the same pair.
      ``interaction_type`` is always 'PD'.
    * ``output_mappedmotifs``: the retained motif rows in the original 13-column
      layout, sorted by chr then start, saved through ``BedTool.from_dataframe``.

    Parameters
    ----------
    mapped_motifs : str
        Path to the tab-separated, header-less mapped motifs file with 13 columns
        (chr, start, stop, name_rep, score, strand, promoter_AGI, p-value, q-value,
        matched_sequence, TF_name, TF_family, TF_AGI).
    promoter_AGI : str
        Path to a tab-separated file with a header row containing a ``TG_AGI`` column.
    output_mappedmotifs : str
        Output location of the filtered mapped motifs file.
    output_csv : str
        Output location of the csv file with TFs and targets.
    TF_AGI : str, optional
        Path to a tab-separated file with a header row containing a ``TF_AGI`` column.
        When None (default) the motifs are filtered by promoter only.

    Raises
    ------
    KeyError
        If the promoter list has no ``TG_AGI`` column or the TF list has no
        ``TF_AGI`` column.

    Notes
    -----
    Only the ID column of each list file is used, and blank or repeated IDs are
    ignored. Merging the full list frames would multiply motif rows for repeated
    IDs and would rename clashing columns (e.g. an extra ``score`` column) with
    ``_x``/``_y`` suffixes, breaking the column selection below.
    """
    bed_cols = ['chr','start','stop','name_rep','score','strand','promoter_AGI','p-value','q-value','matched_sequence','TF_name','TF_family','TF_AGI']
    #read in mapped motifs; assigning the names afterwards raises if the column count is not 13
    mapped_motifs_df = pd.read_table(mapped_motifs, sep='\t', header=None)
    mapped_motifs_df.columns = bed_cols

    #read in promoter_AGI list and keep only the unique, non-null IDs
    promoter_AGI_df = pd.read_table(promoter_AGI, sep='\t', header=0)
    if 'TG_AGI' not in promoter_AGI_df.columns:
        raise KeyError(f"promoter AGI file {promoter_AGI!r} has no 'TG_AGI' column")
    target_ids = promoter_AGI_df[['TG_AGI']].dropna().drop_duplicates()

    if TF_AGI is not None:
        #read in TF_AGI list and keep only the unique, non-null IDs
        TF_AGI_df = pd.read_table(TF_AGI, sep='\t', header=0)
        if 'TF_AGI' not in TF_AGI_df.columns:
            raise KeyError(f"TF AGI file {TF_AGI!r} has no 'TF_AGI' column")
        tf_ids = TF_AGI_df[['TF_AGI']].dropna().drop_duplicates()
        #keep only motifs belonging to the TFs of interest (inner join keeps the TF list order)
        candidates = pd.merge(tf_ids, mapped_motifs_df, how='inner', on='TF_AGI')
    else:
        candidates = mapped_motifs_df

    #keep only motifs lying in the promoters of interest (inner join keeps the promoter list order)
    merged = pd.merge(target_ids, candidates, how='inner', left_on='TG_AGI', right_on='promoter_AGI')
    #drop motif rows that have no TF name
    merged = merged[merged['TF_name'].notnull()]

    #create the TF/target table for the csv
    edges = merged[['TF_AGI','TG_AGI']].copy()
    edges['interaction_type'] = 'PD'
    edges = edges[['TF_AGI','interaction_type','TG_AGI']]
    #sort by TF_AGI then TG_AGI
    edges = edges.sort_values(['TF_AGI','TG_AGI'], ignore_index=True)
    #write table to csv with a header row and without the index
    edges.to_csv(output_csv, header=True, index=False)

    #restore the original column layout and sort by chr then start
    #(sorted into a new frame: an in-place sort on a column slice triggers SettingWithCopyWarning)
    filtered = merged[bed_cols].sort_values(['chr','start'], ignore_index=True)
    #make columns integers or floats
    filtered = filtered.astype({'chr': 'int','start':'int','stop':'int','q-value':'float32','p-value':'float32', 'score':'float32'})
    #write out bed file
    BedTool.from_dataframe(filtered).saveas(output_mappedmotifs)

In [5]:
#run the filter on the q0_05 mapped motifs; this writes the filtered bed file and the TF/target csv
filter_mapped_motifs(
    mapped_motifs=mapped_motifs_q0_05,
    promoter_AGI=promoter_AGI,
    output_mappedmotifs=output_mappedmotifs_q0_05,
    output_csv=output_csv_q0_05,
)
#q1 variant (disabled). TF_AGI is the last parameter of filter_mapped_motifs, so it must be passed
#by keyword: given as the third positional argument it would be taken as output_mappedmotifs.
#filter_mapped_motifs(mapped_motifs_q1,promoter_AGI,output_mappedmotifs_q1,output_csv_q1,TF_AGI=TF_AGI)