In [1]:
# q1
from Bio import Entrez, SeqIO 
def download_save(acc_nos, output_file, email="sargun22450@iiitd.ac.in"):
    """Download FASTA records from NCBI and save them into one FASTA file.

    Each accession in ``acc_nos`` is fetched from the Entrez ``nucleotide``
    database as FASTA text and parsed as a single record with ``SeqIO.read``.
    All records are then written to ``output_file`` in the order given.

    Args:
        acc_nos: Iterable of accession identifiers to fetch.
        output_file: Path of the FASTA file to write (opened in "w" mode, so
            an existing file is overwritten).
        email: Contact address assigned to ``Entrez.email`` before fetching.

    Returns:
        list: The parsed records, in the same order as ``acc_nos``.
    """
    Entrez.email = email
    chromosome_seqs = []
    for accno in acc_nos:
        handle = Entrez.efetch(db="nucleotide", id=accno, rettype="fasta", retmode="text")
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Close the network handle even when parsing the response fails.
            handle.close()

        chromosome_seqs.append(record)

    with open(output_file, "w") as f:
        SeqIO.write(chromosome_seqs, f, "fasta")

    print(f"Chromosomes downloaded and saved to {output_file}")
    return chromosome_seqs

# Accession numbers to fetch; records are written to the FASTA file in this order.
accession_list = [
    "NC_001133.9",
    "NC_001134.8",
    "NC_001135.5",
    "NC_001136.10",
    "NC_001137.3",
    "NC_001138.5",
    "NC_001139.9",
    "NC_001140.6",
    "NC_001141.2",
    "NC_001142.9",
    "NC_001143.9",
    "NC_001144.5",
    "NC_001145.3",
    "NC_001146.8",
    "NC_001147.5",
    "NC_001148.4",
]
output_file = "bakersyeast.fasta"

# Bare call as the last expression, so the cell also displays the returned records.
download_save(accession_list, output_file)


Chromosomes downloaded and saved to bakersyeast.fasta


[SeqRecord(seq=Seq('CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACA...GGG'), id='NC_001133.9', name='NC_001133.9', description='NC_001133.9 Saccharomyces cerevisiae S288C chromosome I, complete sequence', dbxrefs=[]),
 SeqRecord(seq=Seq('AAATAGCCCTCATGTACGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTT...TGT'), id='NC_001134.8', name='NC_001134.8', description='NC_001134.8 Saccharomyces cerevisiae S288C chromosome II, complete sequence', dbxrefs=[]),
 SeqRecord(seq=Seq('CCCACACACCACACCCACACCACACCCACACACCACACACACCACACCCACACA...GTG'), id='NC_001135.5', name='NC_001135.5', description='NC_001135.5 Saccharomyces cerevisiae S288C chromosome III, complete sequence', dbxrefs=[]),
 SeqRecord(seq=Seq('ACACCACACCCACACCACACCCACACACACCACACCCACACACCACACCCACAC...TGG'), id='NC_001136.10', name='NC_001136.10', description='NC_001136.10 Saccharomyces cerevisiae S288C chromosome IV, complete sequence', dbxrefs=[]),
 SeqRecord(seq=Seq('CGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTTCAACCAAAAGCTACTT...TTT'), id='NC_00

In [4]:
# q2
from Bio import Entrez, SeqIO 
import regex as re

# ARS consensus used for the scan: (A/T)TTTA(C/T)(A/G)TTT(A/T), 11 bases long.
# Compiled once here so it is not rebuilt for every chromosome.
_ARS_PATTERN = re.compile("[AT]TTTA[CT][AG]TTT[AT]")

def find_ori_using_ars(accession_number, record=None):
    """Locate ARS consensus matches in one sequence record.

    Args:
        accession_number: Identifier fetched from the Entrez ``nucleotide``
            database when ``record`` is not supplied. Ignored otherwise.
        record: Optional already-parsed record exposing ``.id`` and ``.seq``.
            Passing it skips the network download entirely.

    Returns:
        tuple: ``(matching_entries, first_ori_position)`` where
        ``matching_entries`` is a list of ``(record_id, matched_sequence,
        start_index)`` tuples in left-to-right order, and
        ``first_ori_position`` is the start index of the first match, or
        ``None`` when nothing matched. Matches come from ``finditer`` and are
        therefore non-overlapping; start indices are 0-based.
    """
    if record is None:
        handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta", retmode="text")
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Close the network handle even when parsing the response fails.
            handle.close()

    dna_seq = str(record.seq)

    # match.group() is the matched text itself, so no hard-coded motif length is needed.
    matching_entries = [
        (record.id, match.group(), match.start())
        for match in _ARS_PATTERN.finditer(dna_seq)
    ]
    first_ori_position = matching_entries[0][2] if matching_entries else None

    return matching_entries, first_ori_position

def ori_for_chromosomes(f):
    """Scan every record of a FASTA file for ARS consensus matches.

    Args:
        f: Path (or handle) of the FASTA file, as accepted by ``SeqIO.parse``.

    Returns:
        dict: Maps each record id to a dict with the keys
        ``'matching_entries'``, ``'first_ori_position'`` and
        ``'total_entries'`` (the number of matches).
    """
    records = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
    ori_results = {}  # per-chromosome results, keyed by record id

    for chromosome_id, record in records.items():
        # Reuse the sequence already read from the file instead of
        # downloading the same chromosome again from NCBI.
        matching_entries, first_ori_position = find_ori_using_ars(record.id, record=record)
        ori_results[chromosome_id] = {
            'matching_entries': matching_entries,
            'first_ori_position': first_ori_position,
            'total_entries': len(matching_entries)
        }
    return ori_results

# Report file that will receive the first ORI position of each chromosome.
out_file = "out.txt"

# Scan the FASTA file (same name the q1 cell writes to) for ARS matches.
f = "bakersyeast.fasta"
ori_results = ori_for_chromosomes(f)
def store_first_ori_positions(file_path, ori_results):
    """Write one "First ORI position" line per chromosome to a text file.

    Args:
        file_path: Destination path; opened in "w" mode, so existing content
            is replaced.
        ori_results: Mapping of chromosome id to a dict that contains the key
            ``'first_ori_position'``.
    """
    report_lines = (
        f"Chromosome {chromosome_id}: First ORI position: {results['first_ori_position']}\n"
        for chromosome_id, results in ori_results.items()
    )
    with open(file_path, "w") as out_handle:
        out_handle.writelines(report_lines)

# Persist the first ORI position of every chromosome to the report file.
store_first_ori_positions(out_file, ori_results)

# Print every ARS match per chromosome, followed by its first ORI position.
for chromosome_id, results in ori_results.items():
    matching_entries = results['matching_entries']
    first_ori_position = results['first_ori_position']

    if matching_entries:
        print(f"\nChromosome {chromosome_id}: Matching entries:")
        start_idxs = [entry[2] for entry in matching_entries]
        print("all the starting idxs of seqs that match the ars_seq :", start_idxs)
        # The loop target is named start_idx so it does not overwrite the
        # start_idxs list above with a single int.
        for entry_id, sequence, start_idx in matching_entries:
            print(f" Sequence: {sequence} - Start Index: {start_idx}")

        print(f" First ORI position: {first_ori_position}")
    else:
        print(f"\nChromosome {chromosome_id}: No matching entries found.")




Chromosome NC_001133.9: Matching entries:
all the starting idxs of seqs that match the ars_seq : [17149, 159953, 171816, 176236, 176522, 208605, 229450]
 Sequence: ATTTATGTTTA - Start Index: 17149
 Sequence: ATTTATATTTA - Start Index: 159953
 Sequence: ATTTACGTTTA - Start Index: 171816
 Sequence: TTTTATGTTTT - Start Index: 176236
 Sequence: ATTTATATTTA - Start Index: 176522
 Sequence: TTTTATATTTA - Start Index: 208605
 Sequence: TTTTATATTTA - Start Index: 229450
 First ORI position: 17149

Chromosome NC_001134.8: Matching entries:
all the starting idxs of seqs that match the ars_seq : [80, 53415, 122598, 189470, 195767, 231686, 238293, 246606, 256898, 326080, 326195, 368745, 381151, 403313, 420235, 424981, 543395, 562508, 568821, 603190, 622760, 632052, 665038, 676293, 755032, 777821, 784662, 792466, 812416]
 Sequence: TTTTATGTTTA - Start Index: 80
 Sequence: TTTTACGTTTT - Start Index: 53415
 Sequence: TTTTATATTTT - Start Index: 122598
 Sequence: TTTTACATTTA - Start Index: 189470
 Seq