In [18]:
from pybedtools import BedTool
import pandas as pd
from Bio import SeqIO

In [19]:
# Prerequisite: before running this notebook, extract the FASTA files from the BED files with bedtools, e.g.:
# bedtools getfasta -fi input_fasta_file -bed input_bed_file -fo output.fasta -name

In [20]:
def fasta2csv(fasta, output_tsv):
    """Convert a FASTA file into a tab-separated text file.

    The records parsed from `fasta` are written to `output_tsv` using
    Biopython's 'tab' output format. Nothing is returned.
    """
    with open(fasta, 'r') as fasta_handle:
        with open(output_tsv, 'w') as table_handle:
            # SeqIO.parse is lazy: records are streamed from the input handle
            # straight into the writer.
            records = SeqIO.parse(fasta_handle, "fasta")
            SeqIO.write(records, table_handle, 'tab')

In [21]:
def change_column_names(tsv, type_column, colour, min_length=3):
    """Add the column names needed for a csv upload on benchling.com.

    Reads the header-less, tab-separated file at `tsv` as the two columns
    `Name` and `Feature`, adds the constant columns `Type`, `Color` and an
    empty `Match type`, drops rows whose `Feature` is shorter than
    `min_length` characters, and overwrites `tsv` with the result as a
    comma-separated file with a header row.

    Parameters
    ----------
    tsv : str or path-like
        Path of the tab-separated input; it is overwritten with the output.
    type_column : object
        Value stored in every row of the `Type` column.
    colour : object
        Value stored in every row of the `Color` column.
    min_length : int, default 3
        Minimum number of characters a `Feature` value must have to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered table (row labels of the kept rows are preserved).

    Notes
    -----
    Because the input file is replaced by a comma-separated file with a
    header, calling this function a second time on the same path will not
    reproduce the result; regenerate the tab-separated file first.
    """
    # Read everything as plain text and switch off pandas' default NA
    # markers: otherwise an empty field or a literal "NA" sequence becomes
    # NaN, which astype(str) would turn into the 3-character string "nan"
    # and let it slip through the length filter below.
    csv_df = pd.read_csv(
        tsv,
        sep='\t',
        header=None,
        names=["Name", "Feature"],
        dtype=str,
        keep_default_na=False,
    )
    csv_df['Type'] = type_column
    csv_df['Color'] = colour
    csv_df['Match type'] = ''

    # A line with no second field is still filled with NaN; treat it as an
    # empty sequence so that it is removed by the length filter.
    csv_df['Feature'] = csv_df['Feature'].fillna('').astype(str)

    # keep only sequences that are at least `min_length` characters long
    long_enough = csv_df['Feature'].str.len() >= min_length
    csv_df = csv_df.loc[long_enough].copy()
    csv_df.to_csv(tsv, header=True, index=False)

    return csv_df

In [22]:
# root peaks (potter2018 ATAC-seq directory): FASTA input and table output
roots_fasta = '../../data/ATAC-seq/potter2018/Roots_NaOH_peaks_all.fasta'
roots_csv = '../../data/ATAC-seq/potter2018/Roots_NaOH_peaks_all.csv'

# shoot peaks (potter2018 ATAC-seq directory): FASTA input and table output
shoots_fasta = '../../data/ATAC-seq/potter2018/Shoots_NaOH_peaks_all.fasta'
shoots_csv = '../../data/ATAC-seq/potter2018/Shoots_NaOH_peaks_all.csv'

# promoter + 5'UTR sequences (FIMO directory): FASTA input and table output
promoter_5UTR_fasta = '../../data/FIMO/non-overlapping_includingbidirectional_all_genes/promoters_5UTR_renamedChr.fasta'
promoter_5UTR_csv = '../../data/FIMO/non-overlapping_includingbidirectional_all_genes/promoters_5UTR_renamedChr.csv'

In [23]:
# process the files: write the root peak sequences out as a tab-separated table
fasta2csv(fasta=roots_fasta, output_tsv=roots_csv)

In [24]:
# write the shoot peak sequences out as a tab-separated table
fasta2csv(fasta=shoots_fasta, output_tsv=shoots_csv)

In [25]:
# write the promoter/5'UTR sequences out as a tab-separated table
fasta2csv(fasta=promoter_5UTR_fasta, output_tsv=promoter_5UTR_csv)

In [26]:
# add the upload columns to each table; each file is rewritten in place
shoot = change_column_names(
    shoots_csv,
    type_column='ATAC-seq Potter 2018',
    colour='green',
)
root = change_column_names(
    roots_csv,
    type_column='ATAC-seq Potter 2018',
    colour='brown',
)
promoter_5UTR = change_column_names(
    promoter_5UTR_csv,
    type_column='Sam_promoter_extracted_4.6.20',
    colour='blue',
)

In [27]:
# display the shoot table returned by change_column_names
shoot

Unnamed: 0,Name,Feature,Type,Color,Match type
0,::1:2504-2918,CCAACGACCATGATAAATCCAAAAAGTAGAAACAATCTATTATCTA...,ATAC-seq Potter 2018,green,
1,::1:2921-3661,CCATTTGCGACAAGCAAAACAACATGGTCAAAATTAAAAGCTAACA...,ATAC-seq Potter 2018,green,
2,::1:8571-8978,CATTTCATGAGACGAATGTTACCCGGAGAGTATTGAATGAACAATG...,ATAC-seq Potter 2018,green,
3,::1:13546-13899,CAGGACTTAGAACTCAAATTGGGTTCTTGCCAAACAAGAGGATCTC...,ATAC-seq Potter 2018,green,
4,::1:14008-14377,TGTTAGTTAGTAACTGACCCGGGAATCAGGATATGTATTTATACTA...,ATAC-seq Potter 2018,green,
...,...,...,...,...,...
32515,::5:26967011-26967530,AGAAATTCAAGGACACACGTTTATTCATAAATTTAGATAATATTTC...,ATAC-seq Potter 2018,green,
32516,::5:26969097-26970085,AGAATAGCCCGACCCGCGATTTTCCCTTCTCTAATCATCTGAAGGA...,ATAC-seq Potter 2018,green,
32517,::5:26970545-26970745,CATCGTCGGCACTTGGCAGCGAAATCTCCGTCTAATTCATAAGATG...,ATAC-seq Potter 2018,green,
32518,::5:26971007-26971353,TAATTACAATACTAAATTACGTCTTTTCAAATAAGAGAAAGATAAA...,ATAC-seq Potter 2018,green,


In [28]:
# display the promoter/5'UTR table returned by change_column_names
promoter_5UTR

Unnamed: 0,Name,Feature,Type,Color,Match type
0,AT1G01040::1:22120-23518,ATATAATATCCATAAAAAATAGCCTATGCGTGTTGGATGCTTACAA...,Sam_promoter_extracted_4.6.20,blue,
1,AT1G01050::1:32670-34171,CTGTCAAAATCAGAACGTTTCATCACTCATTGATATTGACTGAATC...,Sam_promoter_extracted_4.6.20,blue,
2,AT1G01060::1:37061-38443,AACAGGACCGGTGCAGCTATTCGCTGCTTCAAATCCTCTCTAACAA...,Sam_promoter_extracted_4.6.20,blue,
3,AT1G01070::1:40877-42017,ATCTCTCTCTATATGTGAGTTATTTTTGTGTGTATTACTTAGAGAG...,Sam_promoter_extracted_4.6.20,blue,
4,AT1G01080::1:46789-47233,TATCGAAGGACCAGAAAGTAAATTATTTGAGAAGAATGATTAAAAA...,Sam_promoter_extracted_4.6.20,blue,
...,...,...,...,...,...
26907,AT5G67590::5:26957774-26958072,ATCAAATTACACAATAAAACCTACCAACTAAATGAATAGGATCAAG...,Sam_promoter_extracted_4.6.20,blue,
26908,AT5G67600::5:26960226-26960903,CTTTCTTCTTCTTCTGAGTTCTTTGAGCTTTTTTTTCTTTCTCTCT...,Sam_promoter_extracted_4.6.20,blue,
26909,AT5G67610::5:26963614-26964550,CGATTTCTTCCTTGTGGGTTTCAGGAATAAAAAAAGTTTCAGGGCG...,Sam_promoter_extracted_4.6.20,blue,
26910,AT5G67620::5:26965720-26967010,TTTTATTTTTTTTGCAAAAAAAAAGAAAAGAGAAAAGAGTTTTGAT...,Sam_promoter_extracted_4.6.20,blue,


In [29]:
# edit shoot name: reload the saved table, put a shoot label in front of every
# peak name, and save the table back to the same file
shoot = pd.read_csv(shoots_csv, header=0).assign(
    Name=lambda table: 'shoot_open_chromatin_peaks' + table['Name'].astype(str)
)
shoot.to_csv(shoots_csv, index=False, header=True)

In [30]:
#edit root name
# Reload the saved root table, put a root-specific label in front of every
# peak name, and save the table back to the same file. The new names are built
# from the root table's own Name column, so every root row keeps its own
# coordinates and no name is borrowed from the shoot table.
# NOTE: this cell reads and overwrites roots_csv, so running it again stacks
# the prefix a second time; regenerate the file with the earlier cells first.
root = pd.read_csv(roots_csv, header=0)
root.Name = 'root_open_chromatin_peaks' + root.Name.astype(str)
root.to_csv(roots_csv, header=True, index=False)