In [2]:
import pandas as pd
#import argparse
import os

In [4]:

# Configuration: every input/output path used by the cells below is built from
# this output sub-folder name.
file_names = 'non-overlapping_includingbidirectional_all_genes_newannotation'
# Tau table passed to filter_genes_schmid as select_genes_file.
Schmid_rankedtau = f'../../data/output/{file_names}/genes/tissue_specific/promoters_5UTR_schmid_allfilteredgenes_TAU.txt'
# Promoter bed file; remove_only5UTR rewrites it in place and
# remove_proms_no_TFBS reads it afterwards.
promoter_bedfile = f'../../data/output/{file_names}/FIMO/promoters_5UTR.bed'
# NOTE(review): identical path to promoters_gff3 below and not referenced by any
# cell visible here - confirm whether this name is still needed.
promoterno5UTR_bedfile = f'../../data/output/{file_names}/promoters.gff3'
# Output path handed to subSet_ontau (tab-separated AGI/state table).
Schmid_gene_categories = f'../../data/output/{file_names}/genes/promoters_5UTR_schmid_tissuespecific_variable_random_300.txt'
# Size of each gene subset taken by subSet_ontau.
no_of_genes = 300
# Motif table read by remove_proms_no_TFBS.
promoter_mapped_motifs = f'../../data/output/{file_names}/FIMO/promoters_5UTR_motifs_mapped.bed'
# Output of remove_proms_no_TFBS; input of filter_genes_schmid.
promoters_filtered_contain_motifs = f'../../data/output/{file_names}/FIMO/promoters_5UTR_filtered_contain_motifs.bed'
# Promoter-only gff3 used by remove_only5UTR to decide which genes to keep.
promoters_gff3 = f'../../data/output/{file_names}/promoters.gff3'
# File that filter_genes_schmid writes its chr/start-sorted table to.
Schmid_allgenes = f'../../data/output/{file_names}/genes/promoters_5UTR_schmid_allfilteredgenes.txt'
# Gene categories read by subSet_ontau to exclude non-control genes from the
# control search space.
CV_gene_categories = f'../../data/output/{file_names}/genes/promoters_5UTR_czechowski_constitutive_variable_random.txt'

In [5]:
def remove_proms_no_TFBS(promoter_bedfile, promoter_mapped_motifs,promoters_filtered_contain_motifs):
    """Keep only the promoters that have at least one mapped motif and save them.

    promoter_bedfile: headerless tab-separated file with 10 columns, the
        fourth being the promoter AGI.
    promoter_mapped_motifs: headerless tab-separated file with 13 columns,
        the seventh being the promoter AGI and the last the TF AGI.
    promoters_filtered_contain_motifs: path the surviving promoter rows are
        written to (tab-separated, no header, no index).
    """
    bed_columns = ['chr','start','stop','promoter_AGI','dot1', 'strand','source','type','dot2','attributes']
    motif_columns = ['chr', 'start', 'stop', 'name_rep', 'score', 'strand', 'promoter_AGI', 'p-value', 'q-value', 'matched_sequence', 'TF_name', 'TF_family', 'TF_AGI']

    promoter_df = pd.read_table(promoter_bedfile, sep='\t', header=None)
    promoter_df.columns = bed_columns
    motif_df = pd.read_table(promoter_mapped_motifs, sep='\t', header=None)
    motif_df.columns = motif_columns

    promoters_with_motifs = (
        # left join: promoters without any motif get NaN in the motif columns
        pd.merge(promoter_df, motif_df, on='promoter_AGI', how='left', suffixes=['', '_y'])
        # a missing TF_AGI means no motif was mapped to that promoter
        .dropna(subset=['TF_AGI'])
        # back to the original bed columns only
        .loc[:, bed_columns]
        # one row per promoter (the first occurrence is kept)
        .drop_duplicates(subset='promoter_AGI')
    )
    promoters_with_motifs.to_csv(promoters_filtered_contain_motifs, sep='\t', header=None, index=False)

In [6]:
def remove_only5UTR(promoter5UTR_bedfile, promoters_gff3):
    """Remove genes where only the Araport11 5'UTR is present due to the promoter overlapping other genes.

    Rows of promoter5UTR_bedfile whose AGI has no entry in promoters_gff3 are
    dropped. The unfiltered file is kept next to it under the same name with
    '_incl_only5UTR' inserted before the extension, and the filtered rows are
    written back to promoter5UTR_bedfile (tab-separated, no header, no index).

    An existing '_incl_only5UTR' copy is never overwritten, so running this
    function again on the already-filtered file does not destroy the
    unfiltered data.

    promoter5UTR_bedfile: headerless tab-separated file with 10 columns, the
        fourth being the AGI.
    promoters_gff3: headerless tab-separated file with 9 columns whose last
        column contains 'ID=gene:<AGI>'.
    """
    #read in df
    bed_columns = ['chr','start','stop','AGI','dot1', 'strand','source','type','dot2','attributes']
    promoter5UTR = pd.read_table(promoter5UTR_bedfile, sep='\t', header=None)
    promoter5UTR.columns = bed_columns

    # read in promoter only gff3
    gff3_columns = ['chr', 'source', 'type', 'start','stop', 'dot1','strand','dot2','attributes']
    promoter_no_5UTR_df = pd.read_table(promoters_gff3, sep='\t', header=None)
    promoter_no_5UTR_df.columns = gff3_columns

    #extract the AGI; '[^;]+' also matches an ID at the very end of the
    #attributes field, where there is no trailing ';'
    promoter_AGIs = promoter_no_5UTR_df['attributes'].str.extract(r'ID=gene:([^;]+)', expand=False)

    #keep promoters in promoter5UTR that are also in the promoter only gff3
    filtered = promoter5UTR[promoter5UTR['AGI'].isin(promoter_AGIs.dropna())]

    #keep the unfiltered file as including genes with only non-overlapping 5'UTRs
    root, extension = os.path.splitext(promoter5UTR_bedfile)
    backup_path = root + '_incl_only5UTR' + extension
    if not os.path.exists(backup_path):
        os.rename(promoter5UTR_bedfile, backup_path)

    #make a new file called the same name as promoter5UTR_bedfile
    filtered.to_csv(promoter5UTR_bedfile, sep='\t', header=False, index=False)

In [7]:
def filter_genes_schmid(promoter_bed, select_genes_file, output_file=None):
    """Filter out genes from the microarray data which aren't in the promoter_bed.

    promoter_bed: headerless tab-separated file with 10 columns, the fourth
        being the AGI.
    select_genes_file: tab-separated file with a header row and exactly two
        columns, which are renamed to AGI and tau.
    output_file: where the table sorted by chr and start is written
        (tab-separated, with header). When omitted, the module-level
        Schmid_allgenes path is used.

    Returns the promoters that have a tau value, with the tau column added,
    sorted by ascending tau and with a fresh 0..n-1 index.
    """
    if output_file is None:
        #fall back to the notebook-level path used before this parameter existed
        output_file = Schmid_allgenes

    select_genes = pd.read_table(select_genes_file, sep='\t', header=0)
    select_genes.columns = ['AGI','tau']
    #make AGI capitalised so it matches the AGIs in the bed file
    select_genes['AGI'] = select_genes['AGI'].str.upper()

    promoters = pd.read_table(promoter_bed, sep='\t', header=None)
    promoters.columns = ['chr','start','stop','AGI','dot1', 'strand','source','type','dot2','attributes']

    merged = pd.merge(promoters, select_genes, on='AGI', how='left')
    #remove NaNs in tau column, i.e. promoters without a tau value
    filtered_df = merged[merged['tau'].notnull()]

    #sort by chr and start
    filtered_df = filtered_df.sort_values(['chr','start'], ignore_index=True)

    #save df
    filtered_df.to_csv(output_file, sep='\t', columns=filtered_df.columns, index=False)

    #sort by tau value
    filtered_df = filtered_df.sort_values('tau', ignore_index=True)

    return filtered_df

In [33]:
def subSet_ontau(in_df, out_dir, no_of_genes,CV_gene_categories):
    '''
    Extract the constitutive, tissue_specific, and control subsets based on tau values

    in_df: DataFrame with at least the columns AGI and tau, already sorted by
        ascending tau (rows are taken by position).
    out_dir: path of the output file (tab-separated AGI/state rows, no header).
    no_of_genes: size of each subset; must be a multiple of 10 because the
        control set is drawn as no_of_genes/10 genes from each of 10 tau bins.
    CV_gene_categories: headerless tab-separated file with the columns AGI and
        gene_type; genes listed there with any gene_type other than 'control'
        are excluded from the control search space.

    Raises ValueError if no_of_genes is not a multiple of 10. pandas raises
    ValueError as well when a bin holds fewer genes than have to be drawn.
    '''
    if no_of_genes % 10 != 0:
        raise ValueError(f'no_of_genes must be a multiple of 10, got {no_of_genes}')
    genes_per_bin = no_of_genes // 10
    total_rows = len(in_df)

    #filtering based on presence in the Araport11 annotation, define the first
    #n rows as the constitutive set and add label
    constitutive          = in_df.iloc[:no_of_genes].copy()
    constitutive['state'] = 'constitutive'

    #define the last n rows as the variable set and add label
    variable          = in_df.iloc[total_rows - no_of_genes:].copy()
    variable['state'] = 'tissue_specific'

    #read in czechowski gene categories so the midrange samples exclude the constitutive and variable genes from there
    CV_categories = pd.read_table(CV_gene_categories,sep='\t',header=None)
    #Name columns
    CV_categories.columns = ['AGI','gene_type']
    #exclude control genes
    CV_categories_nocontrol = CV_categories[CV_categories['gene_type'] != 'control']

    #extract the rest of the rows as the control search space: every row that
    #is in neither the constitutive nor the variable set
    mid_range_full = in_df.iloc[no_of_genes:total_rows - no_of_genes]
    #remove constitutive and variable genes
    mid_range = mid_range_full[~mid_range_full['AGI'].isin(CV_categories_nocontrol['AGI'])]

    #create 10 labelled bins (assign returns a new frame, the input is not modified)
    mid_range = mid_range.assign(bins=pd.qcut(mid_range['tau'], q = 10, precision = 2))

    #extract random rows from each bin (same seed for every bin) and label as the control set
    samples_from_bins = pd.concat([
        bin_rows.sample(n = genes_per_bin, random_state = 2)
        for _, bin_rows in mid_range.groupby('bins', observed = True)
    ])
    samples_from_bins['state'] = 'control'

    #concatenate and write as output
    output_set = pd.concat([constitutive[['AGI', 'state']], variable[['AGI', 'state']], samples_from_bins[['AGI', 'state']]], ignore_index = True)
    output_set.to_csv(out_dir, sep = '\t', index = False, header=False)
    
    #function from expressionVar_subsets_plot.py
    #__author__ = "Will Nash"
    # __copyright__ = "Copyright 2020, The Earlham Institute"
    # __credits__ = ["Will Nash", "Wilfried Haerty"]
    # __license__ = "GPL"
    # __version__ = "1.0"
    # __maintainer__ = "Will Nash"
    # __email__ = "will.nash@earlham.ac.uk"
    # __status__ = "Testing"
    #__modified_by__ "Sam Witham"

In [9]:
#make directory for the output files to be exported to
#dirName = f'{args.directory_path}/data/output/{args.file_names}'
dirName = f'../../data/output/{file_names}/genes'
try:
    # Create target Directory; makedirs also creates any missing parent
    # folders, and still raises FileExistsError when dirName itself exists
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/genes  already exists


In [10]:
# Drop promoters that have no entry in the promoter-only gff3. The unfiltered
# file is kept with an '_incl_only5UTR' suffix and promoter_bedfile is rewritten.
remove_only5UTR(promoter_bedfile, promoters_gff3)    

In [11]:
# Write the promoters that have at least one mapped motif to
# promoters_filtered_contain_motifs.
remove_proms_no_TFBS(promoter_bedfile,promoter_mapped_motifs,promoters_filtered_contain_motifs)


In [12]:
# Keep the motif-containing promoters that have a tau value; the returned
# frame is sorted by ascending tau.
filtered_schmid = filter_genes_schmid(promoters_filtered_contain_motifs,Schmid_rankedtau)


In [34]:
# Label the lowest-tau and highest-tau genes plus a binned random control
# sample, and write the AGI/state table to Schmid_gene_categories.
subSet_ontau(filtered_schmid,Schmid_gene_categories,no_of_genes,CV_gene_categories)

Unnamed: 0,chr,start,stop,AGI,dot1,strand,source,type,dot2,attributes,tau
301,1,19498145,19499281,AT1G52360,.,+,araport11,promoter,.,ID=gene:AT1G52360;biotype=protein_coding;descr...,0.0308
302,1,609391,610362,AT1G02780,.,-,araport11,promoter,.,ID=gene:AT1G02780;Name=RPL19A;biotype=protein_...,0.0308
303,3,22133950,22135156,AT3G59920,.,+,araport11,promoter,.,ID=gene:AT3G59920;Name=GDI2;biotype=protein_co...,0.0308
304,3,17511858,17513656,AT3G47520,.,+,araport11,promoter,.,ID=gene:AT3G47520;Name=MDH;biotype=protein_cod...,0.0308
305,5,20880036,20880557,AT5G51400,.,-,araport11,promoter,.,ID=gene:AT5G51400;biotype=protein_coding;descr...,0.0309
...,...,...,...,...,...,...,...,...,...,...,...
13594,4,13112360,13113482,AT4G25750,.,-,araport11,promoter,.,ID=gene:AT4G25750;Name=ABCG4;biotype=protein_c...,0.4320
13595,4,16922963,16923350,AT4G35700,.,+,araport11,promoter,.,ID=gene:AT4G35700;biotype=protein_coding;descr...,0.4322
13596,4,16788283,16789283,AT4G35280,.,-,araport11,promoter,.,ID=gene:AT4G35280;Name=ZAT3;biotype=protein_co...,0.4322
13597,1,6342460,6343329,AT1G18410,.,-,araport11,promoter,.,ID=gene:AT1G18410;biotype=protein_coding;descr...,0.4324
