In [1]:
import pandas as pd
import re
import numpy as np


In [28]:
def rename_motif(motifs_csv):
    """Load a motif enrichment table and add an upper-case ``TF`` name column.

    The table is read with ``pd.read_csv`` (first row treated as the header)
    and its columns are renamed to the 19 fixed names listed below. A ``TF``
    column is then derived from ``TF_name`` by:

    1. upper-casing the name,
    2. removing everything up to and including the first ``'.'``,
    3. removing everything from the first remaining ``'_'`` onwards.

    For example ``'C2H2_tnt.At5g66730_col_m1'`` becomes ``'AT5G66730'``.
    Intended for DAP-seq cistrome motif names only.

    Parameters
    ----------
    motifs_csv : str, path object or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The renamed table with the extra ``TF`` column appended.

    Raises
    ------
    ValueError
        Raised by pandas if the table does not have exactly 19 columns.
    """
    column_names = [
        'TF_ID', 'TF_name', 'deficit_no', 'search_sites',
        'mean_sites_per_search_gene', 'no_background_sites',
        'mean_sites_per_background_gene', 'site_representation',
        'Mann_whitney_U', 'site_p_value', 'total_no_search_genes',
        'no_TF_search_genes', 'total_background_genes',
        'no_TF_background_genes', 'gene_representation', 'gene_p_value',
        'average_log2_proportion_bound', 'log2_enrichment',
        'significance_score',
    ]
    motifs = pd.read_csv(motifs_csv, header=0)
    motifs.columns = column_names

    # The vectorised .str methods pass missing names through as missing,
    # whereas calling x.upper() per element raised AttributeError on NaN.
    # Raw strings keep the regex backslashes from being parsed as (invalid)
    # Python escape sequences.
    motifs['TF'] = (
        motifs['TF_name']
        .str.upper()
        # drop the family prefix: everything up to and including the first '.'
        .str.replace(r'^[^\.]*\.', '', regex=True)
        # drop the trailing annotation: everything from the first '_' onwards
        .str.replace(r'_.*$', '', regex=True)
    )
    return motifs

In [3]:
def TF_family(motifs_df):
    """Add an upper-case ``TF_family`` column derived from ``TF_name``.

    The family is the part of ``TF_name`` before the first ``'_'``,
    upper-cased; e.g. ``'C2C2dof_tnt.dof24_col_a_m1'`` gives ``'C2C2DOF'``.
    Intended for DAP-seq cistrome motif names only.

    Parameters
    ----------
    motifs_df : pandas.DataFrame
        Must contain a ``TF_name`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``motifs_df`` object, now carrying the ``TF_family`` column.
    """
    # The vectorised .str methods leave missing names missing; the previous
    # per-element x.upper() raised AttributeError on NaN.
    motifs_df['TF_family'] = (
        motifs_df['TF_name']
        .str.upper()
        # remove the first '_' and everything after it
        .str.replace(r'_.*$', '', regex=True)
    )
    return motifs_df

In [86]:
def map_ID2(motifs, geneIDtable, output_tsv):
    """Attach Arabidopsis gene IDs (AGI) to a motif table and save it as TSV.

    A trailing ``'_m1'`` is stripped from each ``TF_ID`` and the result is
    inner-joined to the lookup table on ``TF_ID``, so motifs without an entry
    in the lookup table are dropped. For DAP-seq cistrome motifs only.

    Parameters
    ----------
    motifs : pandas.DataFrame
        Motif table with a ``TF_ID`` column. It is not modified.
    geneIDtable : pandas.DataFrame, str, path object or file-like
        Either an already-loaded two-column lookup table, or the location of
        a tab-separated, header-less file holding one. In both cases the two
        columns are treated as ``TF_ID`` and ``AGI``, in that order.
    output_tsv : str, path object or file-like
        Destination for the merged table, written tab-separated with a header
        row and the DataFrame index as the first column.

    Returns
    -------
    pandas.DataFrame
        The merged table, including the ``AGI`` column.
    """
    if isinstance(geneIDtable, pd.DataFrame):
        # Callers in this notebook pass a loaded table; pd.read_table cannot
        # take a DataFrame, so use it directly. Copy so that renaming the
        # columns does not touch the caller's object.
        id_table = geneIDtable.copy()
    else:
        id_table = pd.read_table(geneIDtable, sep='\t', header=None)
    id_table.columns = ['TF_ID', 'AGI']

    # Strip the suffix only at the end of the ID, and state regex=True
    # explicitly because the default of Series.str.replace differs between
    # pandas versions. assign() returns a new frame, leaving `motifs` intact.
    trimmed = motifs.assign(
        TF_ID=motifs['TF_ID'].str.replace(r'_m1$', '', regex=True)
    )
    merged = pd.merge(trimmed, id_table, on='TF_ID')

    merged.to_csv(output_tsv, sep='\t', header=True)
    return merged
    

In [80]:
# Load the tab-separated, header-less motif_map_IDs.txt lookup table into a DataFrame.
# NOTE(review): absolute, user-specific path — it will not resolve on another machine.
geneIDtable = pd.read_table('/home/witham/Documents/pipeline_new/PromoterArchitecture/data/FIMO/motif_data/motif_map_IDs.txt', sep='\t', header=None)

In [81]:
# Location of the enrichment table that rename_motif() reads.
# NOTE(review): absolute, user-specific path — it will not resolve on another machine.
motifs_csv = '/home/witham/Documents/pipeline_new/PromoterArchitecture/data/output/non-overlapping_includingbidirectional_all_genes_newannotation/CiiiDER/Czechowski_400bp_promoters_5UTR_variable/enrichmentoutput_MostSigDeficit.txt'

In [82]:
# Label the lookup table's two columns so it can be joined to the motif table on TF_ID.
cols = ['TF_ID', 'AGI']
geneIDtable.columns = cols

In [83]:
# Read the enrichment table at motifs_csv and add the derived TF column.
motifs_renamed = rename_motif(motifs_csv)

In [84]:
# Add the TF_family column. TF_family() modifies and returns the same DataFrame,
# so motifs_renamed gains the column as well.
motifs_renamed_family = TF_family(motifs_renamed)

In [85]:
# Merge in the AGI gene IDs. The merged table is written to a file named 'hi' in
# the current working directory and is also displayed as this cell's output.
# NOTE(review): 'hi' looks like a placeholder output name — confirm the intended path.
map_ID2(motifs_renamed_family, geneIDtable, 'hi')

Unnamed: 0,TF_ID,TF_name,deficit_no,search_sites,mean_sites_per_search_gene,no_background_sites,mean_sites_per_background_gene,site_representation,Mann_whitney_U,site_p_value,...,total_background_genes,no_TF_background_genes,gene_representation,gene_p_value,average_log2_proportion_bound,log2_enrichment,significance_score,TF,TF_family,AGI
0,C2H2_tnt.At5g66730_col,C2H2_tnt.At5g66730_col_m1,0.13,15,0.2,4,0.0,Up,5550.0,0.178992,...,100,4,Up,0.013989,-3.588991,1.784271,1.854199,AT5G66730,C2H2,AT5G66730
1,Homeobox_tnt.HAT2_colamp_a,Homeobox_tnt.HAT2_colamp_a_m1,0.11,159,1.6,132,1.3,Up,5691.0,0.091338,...,100,57,Up,0.040956,-0.648371,0.314381,1.387677,HAT2,HOMEOBOX,AT5G47370
2,C2H2_tnt.IDD7_col_a,C2H2_tnt.IDD7_col_a_m1,0.15,19,0.2,6,0.1,Up,5690.5,0.091573,...,100,5,Up,0.003872,-3.278635,1.825971,2.412075,IDD7,C2H2,AT1G55110
3,C2C2dof_tnt.At4g38000_colamp_a,C2C2dof_tnt.At4g38000_colamp_a_m1,0.12,260,2.6,176,1.8,Up,6739.0,0.000021,...,100,76,Up,0.000014,-0.218693,0.349942,4.840088,AT4G38000,C2C2DOF,AT4G38000
4,C2C2dof_tnt.At1g64620_colamp_a,C2C2dof_tnt.At1g64620_colamp_a_m1,0.06,163,1.6,89,0.9,Up,6950.0,0.000002,...,100,54,Up,0.000007,-0.566520,0.632695,5.160354,AT1G64620,C2C2DOF,AT1G64620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,G2like_tnt.At3g12730_col_a,G2like_tnt.At3g12730_col_a_m1,0.09,13,0.1,0,0.0,Up,5350.0,0.392448,...,100,0,Up,0.014018,-5.697606,3.906891,1.853321,AT3G12730,G2LIKE,AT3G12730
181,C2C2dof_tnt.dof24_col_a,C2C2dof_tnt.dof24_col_a_m1,0.11,139,1.4,82,0.8,Up,6673.0,0.000044,...,100,43,Up,0.000001,-0.791518,0.833181,5.834819,DOF24,C2C2DOF,AT2G37590
182,G2like_tnt.AT4G37180_colamp_a,G2like_tnt.AT4G37180_colamp_a_m1,0.01,16,0.2,4,0.0,Up,5502.0,0.219980,...,100,4,Up,0.023810,-3.637099,1.688056,1.623247,AT4G37180,G2LIKE,AT4G37180
183,WRKY_tnt.WRKY25_col_a,WRKY_tnt.WRKY25_col_a_m1,0.08,17,0.2,7,0.1,Up,5490.0,0.231205,...,100,5,Up,0.031720,-3.444238,1.494765,1.498666,WRKY25,WRKY,AT2G30250


In [None]:
# Same pipeline as the cells above, but with every path taken from an `args` object.
# NOTE(review): `args` is not defined in any cell visible here and this cell has no
# execution count — presumably it is an argparse namespace carried over from a
# command-line version of this code; confirm before running.

# Read the lookup table into a DataFrame and name its two columns.
geneIDtable = pd.read_table(args.geneIDtable, sep='\t', header=None)
cols = ['TF_ID', 'AGI']
geneIDtable.columns = cols
    
# Load the enrichment table, add the TF and TF_family columns, then merge in the
# gene IDs and write the result to args.motifs_csv_mapped.
motifs_renamed = rename_motif(args.motifs_csv)
motifs_renamed_family = TF_family(motifs_renamed)
map_ID2(motifs_renamed_family, geneIDtable, args.motifs_csv_mapped)