In [1]:
import pandas as pd
from pybedtools import BedTool
import os
import io

In [None]:
# Command-line interface: six positional string arguments describing the
# input files and naming prefixes for the TATA enrichment analysis.
# NOTE(review): under a Jupyter kernel sys.argv holds the kernel's own
# arguments, so parse_args() is expected to fail there; this cell looks
# intended for running the notebook as a script - confirm.
# NOTE(review): `args` is not read anywhere in the visible notebook; the
# following cell assigns the same-named variables directly.
import argparse  # not imported by the first cell, so bring it into scope here

parser = argparse.ArgumentParser(description='TATA_enrichment')
parser.add_argument('file_names', type=str, help='Name of folder and filenames for the promoters extracted')
parser.add_argument('promoterpref', type=str, help='Promoter prefix name')
parser.add_argument('Czechowski_gene_categories', type=str, help='Input location of Czechowski gene categories text file')
parser.add_argument('promoter_bed_file', type=str, help='Input location of promoters bed file')
parser.add_argument('output_genecat_prefix', type=str, help='Gene category prefix (eg. Czechowski)')
parser.add_argument('TATA_box_locations', type=str, help='Input location of TATAbox_location bed file (from Eukaryotic promoter database)')

#parser.add_argument('output_folder_name', type=str, help='Optional output folder name ending in a forward slash',default = '')
args = parser.parse_args()

In [2]:
# --- Naming ---------------------------------------------------------------
# Name of the output folder (and filename stem) of the extracted promoters
file_names = 'non-overlapping_includingbidirectional_all_genes_newannotation'
# Promoter prefix name used in output filenames
promoterpref = 'promoters_5UTR'
# Gene category prefix used in output filenames
output_genecat_prefix = 'Czechowski'

# --- Input files ----------------------------------------------------------
# Promoters bed file
promoter_bed_file = f'../../data/output/{file_names}/FIMO/promoters_5UTR.bed'
# Czechowski gene categories text file
Czechowski_gene_categories = f'../../data/output/{file_names}/genes/promoters_5UTR_czechowski_constitutive_variable_random.txt'
# TATA box locations bed file (from the Eukaryotic Promoter Database)
TATA_box_locations = '../../data/EPD_promoter_analysis/EPDnew_promoters/TATAbox_location_renamed.bed'

#output_folder_name = 'promoters_5UTR_400bp/'

In [3]:
# Path of the promoter/TATA-box intersect bed file, inside the TATA output folder
promoter_TATA_intersect_bed = (
    f'../../data/output/{file_names}/TATA/'
    f'{promoterpref}_TATA_intersect.bed'
)

In [4]:
#make the output directory for the TATA analysis files
dirName = f'../../data/output/{file_names}/TATA'
try:
    # Create target directory. makedirs also creates any missing parent
    # folders and, like mkdir, raises FileExistsError if the target exists.
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created")
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/TATA  already exists


In [5]:
#make directory for the plots to be exported to
dirName = f'../../data/output/{file_names}/TATA/plots'
try:
    # Create target directory. makedirs also creates any missing parent
    # folders and, like mkdir, raises FileExistsError if the target exists.
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created")
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/TATA/plots  already exists


In [6]:
#make directory for the gat analysis input files (workspace and per-category bed files)
dirName = f'../../data/output/{file_names}/TATA/gat_analysis'
try:
    # Create target directory. makedirs also creates any missing parent
    # folders and, like mkdir, raises FileExistsError if the target exists.
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created")
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/TATA/gat_analysis  already exists


In [7]:
def sort_data(promoter_bed_file, gene_categories,
              core_motifs_file='../../data/EPD_promoter_analysis/EPDnew_promoters/db/promoter_motifs.txt'):
    """Combine promoters, gene categories and EPD core-motif flags into one table.

    Parameters
    ----------
    promoter_bed_file : str
        Tab-separated, header-less promoter file with 10 columns, read as
        chr, start, stop, AGI, dot1, strand, source, type, dot2, attributes.
    gene_categories : str
        Tab-separated, header-less file with 2 columns: AGI and gene_type.
    core_motifs_file : str, optional
        Tab-separated EPD motif table whose first row is a header and which
        has 5 columns (gene id, TATA, Inr, CCAAT box, GC box). Defaults to
        the EPD download location previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        The promoter columns followed by gene_type, TATA_present,
        Inr_present, CCAAT_box_present and GC_box_present. Both merges are
        left joins on AGI, so every promoter row is kept and promoters
        without a match get NaN in the added columns.
    """
    promoters = pd.read_table(promoter_bed_file, sep='\t', header=None)
    promoters.columns = ['chr', 'start', 'stop', 'AGI', 'dot1', 'strand',
                         'source', 'type', 'dot2', 'attributes']

    #read in gene categories
    gene_cats = pd.read_table(gene_categories, sep='\t', header=None)
    gene_cats.columns = ['AGI', 'gene_type']

    #merge with promoters
    promoters = pd.merge(promoters, gene_cats, on='AGI', how='left')

    #EPD downloaded file motifs (all genes); its own header row is replaced
    core_motifs = pd.read_table(core_motifs_file, sep='\t', header=0)
    core_motifs.columns = ['AGI', 'TATA_present', 'Inr_present',
                           'CCAAT_box_present', 'GC_box_present']

    #remove the last 2 characters of the identifier so it can be matched
    #against the AGI column of the promoters
    core_motifs['AGI'] = core_motifs['AGI'].str.slice(0, -2)

    #merge them with the extracted promoters
    return pd.merge(promoters, core_motifs, how='left', on='AGI')

In [8]:
# def remove_characters_linestart(input_location,output_location,oldcharacters,newcharacters,linestart):
#     """this function removes characters from the start of each line in the input file and sends modified lines to output"""
#     output = open(output_location, 'w') #make output file with write capability
#     #open input file
#     with open(input_location, 'r') as infile:  
#         #iterate over lines in file
#         for line in infile:
#             line = line.strip() # removes hidden characters/spaces
#             if line[0] == linestart:
                                 
#                 line = line.replace(oldcharacters, newcharacters) #remove characters from start of line, replace with new characters        
#             output.write(line + '\n') #output to new file
#     output.close()

In [9]:
def prepare_gat(df):
    """Write the bed files used for the gat TATA enrichment analysis.

    Reads the module-level settings file_names, output_genecat_prefix,
    promoterpref, TATA_box_locations and promoter_TATA_intersect_bed.

    Files written:
      * promoter_TATA_intersect_bed - bedtools intersect (wao=True) of every
        promoter in ``df`` (all gene types) against the TATA box locations.
      * TATA/<genecat>_<promoterpref>_nocontrol.bed - constitutive and
        variable promoters, sorted by chr and start.
      * TATA/gat_analysis/<genecat>_<promoterpref>_workspace.bed - first three
        columns (chr, start, stop) of those same promoters.
      * TATA/gat_analysis/<genecat>_<promoterpref>_variable.bed and
        ..._constitutive.bed - one file per gene type.

    Parameters
    ----------
    df : pandas.DataFrame
        Promoter table; must contain at least the columns chr, start, stop,
        gene_type, strand, source, attributes and AGI.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose gene_type is 'constitutive' or 'variable',
        with all original columns. (Previously nothing was returned even
        though the caller stores the result.)
    """
    #make buffer holding every promoter (taken before any filtering)
    buffer = io.StringIO()
    df.to_csv(buffer, sep='\t', header=False, index=False)
    buffer.seek(0)

    #select only constitutive and variable genes
    df_norandom = df[df.gene_type.isin(['constitutive', 'variable'])]

    #reorder columns, then sort by chromosome and start
    sorted_motifs = df_norandom[
        ['chr', 'start', 'stop', 'gene_type', 'strand', 'source', 'attributes', 'AGI']
    ].sort_values(['chr', 'start'])

    #save bed file of the constitutive + variable promoters
    BedTool.from_dataframe(sorted_motifs).saveas(
        f'../../data/output/{file_names}/TATA/{output_genecat_prefix}_{promoterpref}_nocontrol.bed'
    )

    #run bedtools intersect between the TATA box locations and all the
    #extracted promoters; the result is written to promoter_TATA_intersect_bed
    TATAlocations = BedTool(TATA_box_locations)
    promoters = BedTool(buffer)
    promoters.intersect(TATAlocations, wao=True, output=promoter_TATA_intersect_bed)

    gat_prefix = f'../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}'

    #make a new gat workspace file (first 3 columns)
    # NOTE(review): the workspace is built from the constitutive + variable
    # promoters only, although it was originally described as containing
    # "all promoters" - confirm which is intended.
    BedTool.from_dataframe(sorted_motifs[['chr', 'start', 'stop']]).saveas(
        f'{gat_prefix}_workspace.bed'
    )

    #one bed file per gene type: variable only, then constitutive only
    for category in ('variable', 'constitutive'):
        category_promoters = sorted_motifs[sorted_motifs['gene_type'] == category]
        sorted_category = category_promoters.sort_values(['chr', 'start'])
        BedTool.from_dataframe(sorted_category).saveas(f'{gat_prefix}_{category}.bed')

    return df_norandom

   

In [10]:
# #read in promoter TATA intersect bed
# promoter_TATA_intersect_df = pd.read_table(promoter_TATA_intersect_bed,sep='\t', header=None)
# cols = ['chr','start','stop','AGI','dot1','strand','source','type','dot2','attributes',
#         'gene_type','TATA_present','Inr_present','CCAAT_box_present' ,'GC_box_present','chr_TATA','start_TATA','stop_TATA','type','strand_TATA','bpoverlap']
# promoter_TATA_intersect_df.columns = cols

In [11]:
#commented this out as I only have to perform it once
# remove_characters_linestart('../../data/EPD_promoter_analysis/EPDnew_promoters/TATAbox_location.bed', 
#                             '../../data/EPD_promoter_analysis/EPDnew_promoters/TATAbox_location_renamed.bed',
#                            'chr','','c')

# #read in TATAbox_location_renamed.bed to df. Extend length of each box to 15bp as stated in EPD
# TATA_locations_df = pd.read_table('../../data/EPD_promoter_analysis/EPDnew_promoters/TATAbox_location_renamed.bed',sep='\t', header=None,skiprows=10)
# cols = ['chr','start','stop','type','number','strand']
# TATA_locations_df.columns = cols
# #Make TATA box segment the actual size - I will set all to 15 bp
# TATA_locations_df.loc[TATA_locations_df.strand =='+', 'stop'] = TATA_locations_df.stop + 14
# TATA_locations_df.loc[TATA_locations_df.strand =='-', 'start'] = TATA_locations_df.start - 14
# #filter columns
# TATA_locations_df = TATA_locations_df[['chr','start','stop','type','strand']]
# #save file
# TATA_locations_df.to_csv('../../data/EPD_promoter_analysis/EPDnew_promoters/TATAbox_location_renamed.bed',sep='\t',header=None,index=None)

In [12]:
# Build the combined promoter table: promoters + gene categories + EPD motif flags
merged = sort_data(
    promoter_bed_file=promoter_bed_file,
    gene_categories=Czechowski_gene_categories,
)

In [225]:
# Write the bed files for the gat analysis and keep whatever prepare_gat returns
merged_norandom = prepare_gat(df=merged)

In [114]:
# ## how many TATA
# merged_TATA = merged.groupby('gene_type')['TATA_present'].value_counts()
# merged_TATA

gene_type     TATA_present
constitutive  0.0             84
              1.0             12
control       0.0             77
              1.0             22
variable      0.0             65
              1.0             30
Name: TATA_present, dtype: int64

In [115]:
# ## how many Inr
# merged_Inr = merged.groupby('gene_type')['Inr_present'].value_counts()
# merged_Inr

gene_type     Inr_present
constitutive  0.0            68
              1.0            28
control       0.0            69
              1.0            30
variable      0.0            68
              1.0            27
Name: Inr_present, dtype: int64

In [116]:
# ## how many GC_box
# merged_GC_box = merged.groupby('gene_type')['GC_box_present'].value_counts()
# merged_GC_box

gene_type     GC_box_present
constitutive  0.0               90
              1.0                6
control       0.0               97
              1.0                2
variable      0.0               91
              1.0                4
Name: GC_box_present, dtype: int64

In [117]:
# ## how many CCAAT_box
# merged_CCAAT_box = merged.groupby('gene_type')['CCAAT_box_present'].value_counts()
# merged_CCAAT_box

gene_type     CCAAT_box_present
constitutive  0.0                  79
              1.0                  17
control       0.0                  81
              1.0                  18
variable      0.0                  72
              1.0                  23
Name: CCAAT_box_present, dtype: int64

In [118]:
# make separate dfs based on gene type

In [119]:
# #get names of each promoter type
# pd.Categorical(merged.gene_type)
# names = merged.gene_type.unique()
# for name in names:
#     print(name)

nan
variable
control
constitutive


In [120]:
#Chi squared: H0:There is no statistically significant relationship between gene type and the presence of the feature.

#Ha:There is a statistically significant relationship between gene type and the presence of the feature.

In [122]:

# # Constructing the Contingency Table

# # The next step is to format the data into a frequency count table. This is called a Contingency Table, we can accomplish this by using the pd.crosstab() function in pandas.
# contingency_table = pd.crosstab(
#     merged_norandom['gene_type'],
#     merged_norandom['TATA_present'],
#     margins = True
# )
# contingency_table

TATA_present,0.0,1.0,All
gene_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
constitutive,84,12,96
variable,65,30,95
All,149,42,191


In [123]:
# # First, let's put the observed values into a one dimensional array, reading the contingency table from left to right then top to bottom.
# f_obs = np.append(contingency_table.iloc[0][0:2].values, contingency_table.iloc[1][0:2].values)
# f_obs

array([84, 12, 65, 30])

In [124]:
#Next, we need to calculate the expected values. The expected values assume that the null hypothesis is true, i.e. that the proportion of promoters with and without the feature is the same for each gene type.

In [125]:
# row_sums = contingency_table.iloc[0:2,2].values
# row_sums

array([96, 95])

In [126]:
# col_sums = contingency_table.iloc[2,0:2].values
# col_sums

array([149,  42])

In [127]:
# total = contingency_table.loc['All', 'All']

# f_expected = []
# for j in range(2):
#     for i in col_sums:
#         f_expected.append(i*row_sums[j]/total)
# f_expected

[74.89005235602095, 21.10994764397906, 74.10994764397905, 20.89005235602094]

In [128]:
# Now that we have all our observed and expected values, we can just plug everything into the Chi-squared test formula.

In [129]:
# chi_squared_statistic = ((f_obs - f_expected)**2/f_expected).sum()
# print('Chi-squared Statistic: {}'.format(chi_squared_statistic))

Chi-squared Statistic: 10.132146641772216


In [130]:
#Degrees of Freedom

# Similar to the Welch's t-test, we would have to calculate the degrees of freedom before we can determine the p-value.

#DoF=(Number of rows−1)∗(Number of columns−1)

In [131]:
# dof = (len(row_sums)-1)*(len(col_sums)-1)
# print("Degrees of Freedom: {}".format(dof))

Degrees of Freedom: 1


In [132]:
# #make function for doing chi-square on contingency table
# def chi_stat(data, column, apply_correction):
#     """function to calculate chi squared values from a dataset. You provide the column of interest containing your categories. Choose whether Yates' correction is true or false"""
#     contingency_table = pd.crosstab(
#     data['gene_type'],
#     data[column],
#     margins = True
#     )
    
#     f_obs = np.array([contingency_table.iloc[0][0:2].values,
#                   contingency_table.iloc[1][0:2].values])
    
#     statistics = stats.chi2_contingency(f_obs, correction=apply_correction)[0:3]
#     return statistics

In [133]:
# #With no correction, identical to manual chi-square above
# chi_stat(merged_norandom, 'TATA_present', False)

(10.132146641772216, 0.001457056195141269, 1)

In [134]:
# #with Yates' correction
# chi_stat(merged_norandom, 'TATA_present', True)

(9.050461510144208, 0.002626285171560698, 1)

In [135]:
#chi_stat(merged_norandom, 'TF_ID', True)

In [137]:
# #Get column names of interest
# cols = list(merged_norandom.columns.values)
# cols_of_interest = cols[11:]
# cols_of_interest

['TATA_present', 'Inr_present', 'CCAAT_box_present', 'GC_box_present']

In [138]:
# Motif-presence columns to report. The cell that used to derive this list is
# commented out, so it is defined here (values taken from that cell's recorded
# output) to let this cell run on a fresh kernel.
cols_of_interest = ['TATA_present', 'Inr_present', 'CCAAT_box_present', 'GC_box_present']
for col in cols_of_interest:
    print(col)

TATA_present
Inr_present
CCAAT_box_present
GC_box_present


In [139]:
# #calculate chi-squared, p-value and degrees of freedom with Yates' correction for all four columns
# #if p-value < 0.05, reject null hypothesis
# for col in cols_of_interest:
#     if chi_stat(merged_norandom, col, True)[1] > 0.05:
#         print(f'{col}: {chi_stat(merged_norandom, col, True)}, NOT SIGNIFICANT')
#     elif chi_stat(merged_norandom, col, True)[1] < 0.05:
#         print(f'{col}: {chi_stat(merged_norandom, col, True)}, SIGNIFICANT')    
        

TATA_present: (9.050461510144208, 0.002626285171560698, 1), SIGNIFICANT
Inr_present: (0.0021173971233229825, 0.9632981479521568, 1), NOT SIGNIFICANT
CCAAT_box_present: (0.858198518466075, 0.35424331486024774, 1), NOT SIGNIFICANT
GC_box_present: (0.0947669956140352, 0.7582021919495565, 1), NOT SIGNIFICANT


## <b>Now I need to rerun the analyses using gat enrichment</b>

If the binding sites you are mapping are small, you need the mappability track containing all regions of the genome that are uniquely mappable with reads of 24 bases: https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg38&g=mappability
See https://gat.readthedocs.io/en/latest/tutorialGenomicAnnotation.html

Downloaded TATAbox_location.bed from EPD.
The following search parameters were used for the download:
```
## FindM Genome Assembly : A. thaliana (Feb 2011 TAIR10/araTha1)
##Series : EPDnew, the Arabidopsis Curated Promoter Database
##Sample : TSS from EPDnew rel 004
##Repeat masking: off
##5' border: -100     3' border: 100
##Search mode: forward
##Selection mode : all matches 
```

Copied the chromsizes.chr to data/EPD_promoter_analysis/TATA and converted it into a BED file for the workspace.

0