In [1]:
import pandas as pd
import argparse
import os

In [None]:
# Command-line interface: eleven positional arguments, declared in the order
# in which they have to be supplied. Each entry is (name, type, help text).
_cli_spec = [
    ('file_names', str, 'Name of folder and filenames for the promoters extracted'),
    ('promoter_bedfile', str, 'Input location of promoter bedfile'),
    ('Czechowski_rankedcv', str, 'Input location of Czechowski et al 2005 ranked cv dataset reanalysed by Will Nash'),
    ('Mergner_rankedcv', str, 'Input location of Mergner et al 2020 ranked cv dataset'),
    ('no_of_genes', int, 'Number of genes in each category to subset'),
    ('Czechowski_gene_categories', str, 'Output location of microarray gene category subsets'),
    ('Mergner_gene_categories', str, 'Output location of RNAseq gene category subsets'),
    ('promoter_mapped_motifs', str, 'Input location of promoter mapped motifs bed file'),
    ('promoters_filtered_contain_motifs', str, 'output location of the promoter bed file filtered so that each promoter contains at least one TFBS'),
    ('Czechowski_allgenes', str, 'Output location of all filtered microarray genes'),
    ('Mergner_allgenes', str, 'Output location of all filtered RNAseq genes'),
]

parser = argparse.ArgumentParser(description='choose_genes_cv')
for _arg_name, _arg_type, _arg_help in _cli_spec:
    parser.add_argument(_arg_name, type=_arg_type, help=_arg_help)

# NOTE(review): parse_args() reads sys.argv; inside a notebook kernel that is
# presumably the kernel's own command line, so this cell looks intended for the
# script export only - confirm before running it interactively.
args = parser.parse_args()

In [12]:
def remove_proms_no_TFBS(promoter_bedfile, promoter_mapped_motifs,promoters_filtered_contain_motifs):
    """Write a promoter BED file restricted to promoters with at least one mapped TFBS.

    promoter_bedfile: tab-separated, header-less file with ten columns
        (chr, start, stop, promoter_AGI, dot1, strand, source, type, dot2, attributes).
    promoter_mapped_motifs: tab-separated, header-less file with thirteen columns,
        one row per mapped motif; it names the promoter in its 7th column and the
        TF gene in its last column.
    promoters_filtered_contain_motifs: destination of the filtered promoter table
        (tab-separated, no header, no index).

    Returns None; the result is only written to disk.
    """
    bed_columns = ['chr','start','stop','promoter_AGI','dot1', 'strand','source','type','dot2','attributes']
    motif_columns = ['chr', 'start', 'stop', 'name_rep', 'score', 'strand', 'promoter_AGI', 'p-value', 'q-value', 'matched_sequence', 'TF_name', 'TF_family', 'TF_AGI']

    promoter_table = pd.read_csv(promoter_bedfile, sep='\t', header=None)
    promoter_table.columns = bed_columns
    motif_table = pd.read_csv(promoter_mapped_motifs, sep='\t', header=None)
    motif_table.columns = motif_columns

    promoters_with_motifs = (
        # the left join keeps every promoter; those without a motif get NaN in TF_AGI
        promoter_table.merge(motif_table, on='promoter_AGI', how='left', suffixes=('', '_y'))
        # drop the promoters that matched nothing
        .dropna(subset=['TF_AGI'])
        # back to the ten BED columns (the un-suffixed ones come from the promoter file)
        .loc[:, bed_columns]
        # one row per promoter: keep the first of the per-motif repeats
        .drop_duplicates(subset='promoter_AGI')
    )
    promoters_with_motifs.to_csv(promoters_filtered_contain_motifs, sep='\t', header=None, index=False)

In [13]:
def filter_genes_czechowski(promoter_bed, select_genes_file, output_file=None):
    """Keep the promoters whose gene has a CV value in the microarray dataset.

    Parameters
    ----------
    promoter_bed : str
        Tab-separated, header-less promoter file with ten columns
        (chr, start, stop, AGI, dot1, strand, source, type, dot2, attributes).
    select_genes_file : str
        Tab-separated, header-less ranked CV table with nine columns
        (rank, probe_id, AGI, expression_mean, expression_SD, expression_CV,
        proportion_of_values_present_in_mas5, presence_in_araport11,
        constitutive_in_araport11).
    output_file : str, optional
        Where to save the filtered table (tab-separated, no header, row index
        written as the first column). When omitted, a module-level variable
        called ``Czechowski_allgenes`` is used if one exists; if there is none,
        nothing is written.

    Returns
    -------
    pandas.DataFrame
        Promoter rows joined with their microarray statistics, in promoter-file
        order, restricted to rows with a non-null expression_CV.
    """
    select_genes = pd.read_table(select_genes_file, sep='\t', header=None)
    cols = ['rank','probe_id','AGI','expression_mean','expression_SD','expression_CV','proportion_of_values_present_in_mas5','presence_in_araport11','constitutive_in_araport11']
    select_genes.columns = cols

    promoters = pd.read_table(promoter_bed, sep='\t', header=None)
    col = ['chr','start','stop','AGI','dot1', 'strand','source','type','dot2','attributes']
    promoters.columns = col

    # left join keeps every promoter; genes absent from the microarray table
    # end up with NaN statistics and are removed on the next line
    merged = pd.merge(promoters, select_genes, on='AGI', how='left')
    filtered_df = merged[merged.expression_CV.notnull()]

    # The destination used to be read from an undeclared global, which raised
    # NameError whenever that global had not been set. Prefer the explicit
    # argument and only fall back to the legacy global when it is defined.
    if output_file is None:
        output_file = globals().get('Czechowski_allgenes')
    if output_file is not None:
        filtered_df.to_csv(output_file,sep='\t',header=None)

    return filtered_df

In [21]:
def filter_genes_mergner(promoter_bed, select_genes_file, output_file=None):
    """Keep the promoters whose gene has a CV value in the RNA-seq dataset.

    Parameters
    ----------
    promoter_bed : str
        Tab-separated, header-less promoter file with ten columns
        (chr, start, stop, AGI, dot1, strand, source, type, dot2, attributes).
    select_genes_file : str
        Comma-separated table with a header row and four columns, relabelled
        here as AGI, transcription_class, transcription_family, expression_CV.
    output_file : str, optional
        Where to save the filtered table (tab-separated, no header, row index
        written as the first column). When omitted, a module-level variable
        called ``Mergner_allgenes`` is used if one exists; if there is none,
        nothing is written.

    Returns
    -------
    pandas.DataFrame
        Promoter rows joined with their RNA-seq annotations, in promoter-file
        order, restricted to rows with a non-null expression_CV. A
        presence_in_araport11 column (set to 1 for every RNA-seq gene) is
        included so the frame can be handed to subSet_onCV.
    """
    select_genes = pd.read_csv(select_genes_file, header=0)
    cols = ['AGI','transcription_class','transcription_family','expression_CV']
    select_genes.columns = cols
    # every gene of this dataset is flagged as present in Araport11
    select_genes['presence_in_araport11'] = 1

    promoters = pd.read_table(promoter_bed, sep='\t', header=None)
    col = ['chr','start','stop','AGI','dot1', 'strand','source','type','dot2','attributes']
    promoters.columns = col

    # left join keeps every promoter; genes absent from the RNA-seq table end
    # up with NaN in expression_CV and are removed on the next line
    merged = pd.merge(promoters, select_genes, on='AGI', how='left')
    filtered_df = merged[merged.expression_CV.notnull()]

    # The destination used to be read from an undeclared global, which raised
    # NameError whenever that global had not been set. Prefer the explicit
    # argument and only fall back to the legacy global when it is defined.
    if output_file is None:
        output_file = globals().get('Mergner_allgenes')
    if output_file is not None:
        filtered_df.to_csv(output_file,sep='\t',header=None)

    return filtered_df

In [14]:
def subSet_onCV(in_df, out_dir, no_of_genes):
    '''
    Extract the constitutive, variable, and control subsets based on CV values

    in_df:       DataFrame with at least the columns AGI, expression_CV and
                 presence_in_araport11. Row order does not matter: the rows are
                 ranked by expression_CV here. in_df itself is not modified.
    out_dir:     path of the output FILE (tab-separated, no header, no index)
                 holding two columns: AGI and state.
    no_of_genes: size of the constitutive set (lowest CVs) and of the variable
                 set (highest CVs).

    The control set is 10 rows drawn (random_state = 2) from each of 10
    CV-quantile bins of the remaining genes. DataFrame.sample raises ValueError
    if a bin holds fewer than 10 rows.

    Returns None; the result is only written to disk.
    '''
    #fliltering based on presence in the Araport11 annotation, then rank by CV.
    #The frames produced by the filter_genes_* functions are in promoter-file
    #order, so taking "first n / last n" rows is only meaningful after sorting.
    #mergesort is stable: a table that is already ranked keeps its order.
    ranked = in_df[in_df.presence_in_araport11 == 1].sort_values('expression_CV', kind = 'mergesort')

    #define the first n rows as the constitutive set and add label
    #(.copy() so the label is not assigned onto a slice of in_df)
    constitutive          = ranked.head(no_of_genes).copy()
    constitutive['state'] = 'constitutive'

    #define the last n rows as the variable set and add label
    #(tail(0) is empty, whereas the slice [-0:] would return every row)
    variable          = ranked.tail(no_of_genes).copy()
    variable['state'] = 'variable'

    #extract the rest of the rows as the control search space
    #NOTE(review): the +1 offsets leave one row on each side outside all three
    #sets; kept as in the original - confirm whether that gap is intended
    mid_range    = ranked.iloc[(no_of_genes+1):-(no_of_genes+1)].copy()

    #create 10 labelled bins
    mid_range['bins'] = pd.qcut(mid_range['expression_CV'], q = 10, precision = 2)

    #extract 10 random rows from these bins and label as the control set
    #(explicit loop instead of groupby.apply: same per-bin draw, no dependence
    #on how apply treats the grouping column)
    samples_from_bins = pd.concat(
        [bin_rows.sample(10, random_state = 2)
         for _, bin_rows in mid_range.groupby('bins', observed = True)]
    )
    samples_from_bins['state'] = 'control'

    #concatenate and write as output
    output_set = pd.concat([constitutive[['AGI', 'state']], variable[['AGI', 'state']], samples_from_bins[['AGI', 'state']]], ignore_index = True)
    output_set.to_csv(out_dir, sep = '\t', index = False, header=False)
    
    #function from expressionVar_subsets_plot.py
    #__author__ = "Will Nash"
    # __copyright__ = "Copyright 2020, The Earlham Institute"
    # __credits__ = ["Will Nash", "Wilfried Haerty"]
    # __license__ = "GPL"
    # __version__ = "1.0"
    # __maintainer__ = "Will Nash"
    # __email__ = "will.nash@earlham.ac.uk"
    # __status__ = "Testing"
    #__modified_by__ "Sam Witham"

In [9]:

# Notebook configuration: inputs and outputs, all relative to this notebook.
file_names = 'non-overlapping_includingbidirectional_all_genes_newannotation'
# ranked CV tables (inputs)
Czechowski_rankedcv = '../../data/genes/AtGE_dev_gcRMA__all_probes__CV.tsv'
Mergner_rankedcv = '../../data/genes/RNA_CVs.csv'
# promoter / motif files (inputs) and the filtered promoter file (output)
promoter_bedfile = f'../../data/output/{file_names}/FIMO/promoters_5UTR.bed'
promoter_mapped_motifs = f'../../data/output/{file_names}/FIMO/promoters_5UTR_motifs_mapped.bed'
promoters_filtered_contain_motifs = f'../../data/output/{file_names}/FIMO/promoters_5UTR_filtered_contain_motifs.bed'
# gene-category subsets (outputs)
Czechowski_gene_categories = f'../../data/output/{file_names}/genes/promoters_5UTR_czechowski_constitutive_variable_random.txt'
Mergner_gene_categories = f'../../data/output/{file_names}/genes/promoters_5UTR_mergner_constitutive_variable_random.txt'
# number of genes in the constitutive and in the variable subset
no_of_genes = 100
# All filtered genes (outputs). filter_genes_czechowski / filter_genes_mergner
# write to these two names, which were declared as command-line arguments but
# never assigned in the notebook.
# NOTE(review): the file names below were chosen for this fix - adjust them to
# the project's naming convention if it differs.
Czechowski_allgenes = f'../../data/output/{file_names}/genes/promoters_5UTR_czechowski_allfilteredgenes.txt'
Mergner_allgenes = f'../../data/output/{file_names}/genes/promoters_5UTR_mergner_allfilteredgenes.txt'

In [7]:
#make directory for the output files to be exported to
#os.makedirs also creates any missing parent folder, where os.mkdir would stop
#with FileNotFoundError; an existing target still raises FileExistsError
dirName = f'../../data/output/{file_names}/genes'
try:
    # Create target Directory
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/genes  already exists


In [10]:
# write the promoter BED file restricted to promoters that contain a mapped TFBS
remove_proms_no_TFBS(
    promoter_bedfile=promoter_bedfile,
    promoter_mapped_motifs=promoter_mapped_motifs,
    promoters_filtered_contain_motifs=promoters_filtered_contain_motifs,
)

In [25]:
# The promoter file written by remove_proms_no_TFBS is configured under the
# name promoters_filtered_contain_motifs; promoters_filtered_no_motifs is not
# defined anywhere in this notebook and raised NameError on a fresh kernel.
filtered_czechowski = filter_genes_czechowski(promoters_filtered_contain_motifs,Czechowski_rankedcv)
filtered_czechowski

Unnamed: 0,chr,start,stop,AGI,dot1,strand,source,type,dot2,attributes,rank,probe_id,expression_mean,expression_SD,expression_CV,proportion_of_values_present_in_mas5,presence_in_araport11,constitutive_in_araport11
3,1,37061,38443,AT1G01060,.,-,araport11,promoter,.,ID=gene:AT1G01060;Name=LHY;biotype=protein_cod...,8752.0,261569_at,659.659794,490.970711,0.744279,99.0,1.0,0.0
5,1,46789,47233,AT1G01080,.,-,araport11,promoter,.,ID=gene:AT1G01080;biotype=protein_coding;descr...,8901.0,261577_at,384.087612,292.389851,0.761258,96.0,1.0,0.0
6,1,49166,49908,AT1G01090,.,-,araport11,promoter,.,ID=gene:AT1G01090;Name=PDH-E1 ALPHA;biotype=pr...,7123.0,261583_at,2541.430002,1529.913875,0.601989,100.0,1.0,0.0
7,1,50954,51952,AT1G01100,.,-,araport11,promoter,.,ID=gene:AT1G01100;Name=RPP1A;biotype=protein_c...,4823.0,261578_at,11304.396095,5166.610692,0.457044,100.0,1.0,0.0
9,1,58978,60215,AT1G01120,.,-,araport11,promoter,.,ID=gene:AT1G01120;Name=KCS1;biotype=protein_co...,6864.0,261570_at,991.741578,578.774828,0.583594,96.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19568,5,26923274,26924488,AT5G67460,.,-,araport11,promoter,.,ID=gene:AT5G67460;biotype=protein_coding;descr...,10771.0,246994_at,143.437529,158.091065,1.102160,94.0,1.0,0.0
19569,5,26934694,26935074,AT5G67490,.,-,araport11,promoter,.,ID=gene:AT5G67490;biotype=protein_coding;descr...,5261.0,247006_at,937.252790,449.548526,0.479645,100.0,1.0,0.0
19572,5,26949184,26950578,AT5G67560,.,+,araport11,promoter,.,ID=gene:AT5G67560;Name=ARL8B;biotype=protein_c...,3814.0,247008_at,711.205117,287.266875,0.403916,97.0,1.0,0.0
19573,5,26957073,26957914,AT5G67580,.,-,araport11,promoter,.,ID=gene:AT5G67580;Name=TRB2;biotype=protein_co...,206.0,247012_at,61.548600,13.354799,0.216980,100.0,1.0,0.0


In [23]:
# The promoter file written by remove_proms_no_TFBS is configured under the
# name promoters_filtered_contain_motifs; promoters_filtered_no_motifs is not
# defined anywhere in this notebook and raised NameError on a fresh kernel.
filtered_mergner = filter_genes_mergner(promoters_filtered_contain_motifs,Mergner_rankedcv)
filtered_mergner

Unnamed: 0,chr,start,stop,AGI,dot1,strand,source,type,dot2,attributes,transcription_class,transcription_family,expression_CV,presence_in_araport11
0,1,2630,3759,AT1G01010,.,+,araport11,promoter,.,ID=gene:AT1G01010;Name=NAC001;biotype=protein_...,TF,NAC,0.945805,1.0
1,1,8666,10130,AT1G01020,.,-,araport11,promoter,.,ID=gene:AT1G01020;Name=ARV1;biotype=protein_co...,,,0.384746,1.0
2,1,12940,14714,AT1G01030,.,-,araport11,promoter,.,ID=gene:AT1G01030;Name=NGA3;biotype=protein_co...,TF,B3,1.094770,1.0
3,1,37061,38443,AT1G01060,.,-,araport11,promoter,.,ID=gene:AT1G01060;Name=LHY;biotype=protein_cod...,TF,MYB-related,0.877298,1.0
4,1,40877,42017,AT1G01070,.,-,araport11,promoter,.,ID=gene:AT1G01070;biotype=protein_coding;descr...,,,1.202535,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19573,5,26957073,26957914,AT5G67580,.,-,araport11,promoter,.,ID=gene:AT5G67580;Name=TRB2;biotype=protein_co...,TF,MYB-related,1.039561,1.0
19574,5,26963614,26964550,AT5G67610,.,-,araport11,promoter,.,ID=gene:AT5G67610;biotype=protein_coding;descr...,,,0.347671,1.0
19575,5,26965720,26967010,AT5G67620,.,-,araport11,promoter,.,ID=gene:AT5G67620;biotype=protein_coding;descr...,,,1.288362,1.0
19576,5,26969306,26969515,AT5G67630,.,-,araport11,promoter,.,ID=gene:AT5G67630;biotype=protein_coding;descr...,,,0.927208,1.0


In [21]:
# Czechowski (microarray) gene categories
subSet_onCV(
    in_df=filtered_czechowski,
    out_dir=Czechowski_gene_categories,
    no_of_genes=no_of_genes,
)

In [27]:
# Mergner (RNA-seq) gene categories
subSet_onCV(
    in_df=filtered_mergner,
    out_dir=Mergner_gene_categories,
    no_of_genes=no_of_genes,
)