In [203]:
import pandas as pd
import numpy as np

In [204]:
# Input paths: the table of 100 random genes and the housekeeping gene list
_data_root = '../../data'
random_100 = f'{_data_root}/genes/random_100.csv'
constitutive_variable = f'{_data_root}/genomes/ara_housekeeping_list.out'

In [205]:
# Column names for the promoter list, which is read without a header row
cols2 = ['promoter_AGI', 'gene_type']

# Tab-separated promoter list (no header) and the random-gene CSV (header in first row)
promoters = pd.read_csv(constitutive_variable, sep='\t', header=None)
promoters.columns = cols2
random_genes = pd.read_csv(random_100, header=0)

In [206]:
# Keep only the two columns that the promoter list also has
shared_columns = ['promoter_AGI', 'gene_type']
random_genes = random_genes[shared_columns]

In [207]:
# Independent copy of the promoter list without the rows labelled 'randCont'
not_rand_control = promoters['gene_type'] != 'randCont'
promoters_filtered = promoters.copy()[not_rand_control]
promoters_filtered

Unnamed: 0,promoter_AGI,gene_type
0,AT4G34270,housekeeping
1,AT3G32260,housekeeping
2,AT1G59830,housekeeping
3,AT4G33380,housekeeping
4,AT2G28390,housekeeping
...,...,...
195,AT3G44070,highVar
196,AT5G61360,highVar
197,AT5G16100,highVar
198,AT3G60570,highVar


In [208]:
# Append the random genes to the non-random promoters and renumber the index.
# Rows already labelled 'randCont' are removed first so that re-running this cell
# does not append the random genes a second time. On a first run this is a no-op,
# because the previous cell already dropped 'randCont' rows.
# NOTE(review): idempotence relies on random_genes being labelled 'randCont'
# (as in the displayed merged table) - confirm.
non_random = promoters_filtered[promoters_filtered.gene_type != 'randCont']
promoters_filtered = pd.concat([non_random, random_genes], axis=0, ignore_index=True)

# write to file for future
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if downstream readers do not expect it.
promoters_filtered.to_csv('../../data/genes/constitutive-variable-random_100_each.csv')

In [209]:
# Motif-positive BED files from the EPD promoter analysis, plus the TF-only gene list
_epd_dir = '../../data/EPD_promoter_analysis'
TATA_presence = f'{_epd_dir}/responsive_housekeeping_TATA_box_positive.bed'
GC_box_file = f'{_epd_dir}/responsive_housekeeping_GCbox_positive.bed'
Inr_file = f'{_epd_dir}/responsive_housekeeping_Inr_positive.bed'
CCAAT_box_file = f'{_epd_dir}/responsive_housekeeping_CCAAT_box_positive.bed'
TFs = '../../data/genes/housekeeping_variable_TFs_only.txt'

In [210]:
# The four motif BED files share one layout: tab-separated, no header row,
# and the first two lines of each file are skipped.
bed_read_options = dict(sep='\t', header=None, skiprows=2)
TATA = pd.read_csv(TATA_presence, **bed_read_options)
GC_box = pd.read_csv(GC_box_file, **bed_read_options)
Inr = pd.read_csv(Inr_file, **bed_read_options)
CCAAT_box = pd.read_csv(CCAAT_box_file, **bed_read_options)

# TF list: comma-separated with a header row
TF_promoters = pd.read_csv(TFs, header=0)

In [211]:
# Keep only the first row for every Gene_ID in the TF table
repeated_gene = TF_promoters.duplicated(subset='Gene_ID')
TF_promoters = TF_promoters[~repeated_gene]

In [212]:
# Column names applied to each of the four motif tables
cols = ['chr', 'start', 'stop', 'gene_ID', 'number', 'strand']

# Name the columns of every motif table and add a constant 'yes' flag column;
# promoters absent from a table end up as NaN after the left merges further down.
for motif_table, flag_column in (
    (TATA, 'TATA_present'),
    (GC_box, 'GC_box_present'),
    (Inr, 'Inr_present'),
    (CCAAT_box, 'CCAAT_box_present'),
):
    motif_table.columns = cols
    motif_table[flag_column] = 'yes'

# Rename the key to match the promoter tables and flag every row as a TF.
# Built as a new frame rather than mutated in place: TF_promoters comes from
# drop_duplicates(), and in-place edits on such a frame can raise
# SettingWithCopyWarning.
TF_promoters = (
    TF_promoters
    .rename(columns={'Gene_ID': 'promoter_AGI'})
    .assign(is_TF='yes')
)

In [213]:
# Build promoter_AGI in each motif table by dropping the last two characters of gene_ID
# (presumably a per-promoter suffix - verify against the BED files)
for motif_table in (TATA, GC_box, Inr, CCAAT_box):
    motif_table['promoter_AGI'] = motif_table['gene_ID'].str[:-2]

In [214]:
# Attach the TATA flag to every promoter (NaN where no TATA entry exists).
# promoter_AGI was derived by trimming gene_ID, so several TATA rows may share one
# AGI; reduce to one row per AGI so the left merge cannot duplicate promoter rows
# and inflate the per-gene_type counts.
tata_flags = TATA[['promoter_AGI', 'TATA_present']].drop_duplicates(subset='promoter_AGI')
merged = pd.merge(promoters_filtered, tata_flags, how='left', on='promoter_AGI')

In [215]:
# Attach the GC-box flag (NaN where absent). The flag table is reduced to one row
# per promoter_AGI first, so a gene with several GC-box entries cannot duplicate
# rows in the left merge and inflate later counts.
gc_box_flags = GC_box[['promoter_AGI', 'GC_box_present']].drop_duplicates(subset='promoter_AGI')
merged = pd.merge(merged, gc_box_flags, how='left', on='promoter_AGI')

In [216]:
# Attach the Inr flag (NaN where absent). The flag table is reduced to one row
# per promoter_AGI first, so a gene with several Inr entries cannot duplicate
# rows in the left merge and inflate later counts.
inr_flags = Inr[['promoter_AGI', 'Inr_present']].drop_duplicates(subset='promoter_AGI')
merged = pd.merge(merged, inr_flags, how='left', on='promoter_AGI')

In [217]:
# Attach the CCAAT-box flag (NaN where absent). The flag table is reduced to one
# row per promoter_AGI first, so a gene with several CCAAT-box entries cannot
# duplicate rows in the left merge and inflate later counts.
ccaat_box_flags = CCAAT_box[['promoter_AGI', 'CCAAT_box_present']].drop_duplicates(subset='promoter_AGI')
merged = pd.merge(merged, ccaat_box_flags, how='left', on='promoter_AGI')

In [218]:
# Add all TF annotation columns; promoters missing from the TF table get NaN
merged = merged.merge(TF_promoters, on='promoter_AGI', how='left')

In [219]:
# Show the combined table: one motif/TF flag column per merge, NaN where a promoter had no match
merged

Unnamed: 0,promoter_AGI,gene_type,TATA_present,GC_box_present,Inr_present,CCAAT_box_present,TF_ID,Family,is_TF
0,AT4G34270,housekeeping,,,,,,,
1,AT3G32260,housekeeping,,,,,,,
2,AT1G59830,housekeeping,,,,,,,
3,AT4G33380,housekeeping,,,,,,,
4,AT2G28390,housekeeping,,,,,,,
...,...,...,...,...,...,...,...,...,...
295,AT5G13410,randCont,,,,,,,
296,AT5G56090,randCont,yes,,,,,,
297,AT5G05170,randCont,,,,yes,,,
298,AT3G18110,randCont,,,yes,,,,


In [221]:
## TATA-flagged promoters per gene_type
# value_counts() ignores NaN, so only rows flagged 'yes' are tallied
tata_flag_by_type = merged.groupby('gene_type')['TATA_present']
merged_TATA = tata_flag_by_type.value_counts()
merged_TATA

gene_type     TATA_present
highVar       yes             26
housekeeping  yes             11
randCont      yes             25
Name: TATA_present, dtype: int64

In [222]:
## Inr-flagged promoters per gene_type
# value_counts() ignores NaN, so only rows flagged 'yes' are tallied
inr_flag_by_type = merged.groupby('gene_type')['Inr_present']
merged_Inr = inr_flag_by_type.value_counts()
merged_Inr

gene_type     Inr_present
highVar       yes            25
housekeeping  yes            29
randCont      yes            37
Name: Inr_present, dtype: int64

In [223]:
## GC-box-flagged promoters per gene_type
# value_counts() ignores NaN, so only rows flagged 'yes' are tallied
gc_box_flag_by_type = merged.groupby('gene_type')['GC_box_present']
merged_GC_box = gc_box_flag_by_type.value_counts()
merged_GC_box

gene_type     GC_box_present
highVar       yes               4
housekeeping  yes               6
randCont      yes               1
Name: GC_box_present, dtype: int64

In [224]:
## CCAAT-box-flagged promoters per gene_type
# value_counts() ignores NaN, so only rows flagged 'yes' are tallied
ccaat_box_flag_by_type = merged.groupby('gene_type')['CCAAT_box_present']
merged_CCAAT_box = ccaat_box_flag_by_type.value_counts()
merged_CCAAT_box

gene_type     CCAAT_box_present
highVar       yes                  18
housekeeping  yes                  20
randCont      yes                  16
Name: CCAAT_box_present, dtype: int64

In [225]:
## Promoters flagged as transcription factors per gene_type
# value_counts() ignores NaN, so only rows flagged 'yes' are tallied
tf_flag_by_type = merged.groupby('gene_type')['is_TF']
merged_TF = tf_flag_by_type.value_counts()
merged_TF

gene_type     is_TF
highVar       yes      5
housekeeping  yes      1
randCont      yes      5
Name: is_TF, dtype: int64