In [17]:
from Bio import Entrez, SeqIO

def download_and_save_chromosomes(accession_numbers, output_file):
    # Set your email for Entrez
    Entrez.email = "sargun22450@iiitd.ac.in"

    # List to store the downloaded sequences
    chromosome_sequences = [] 
    # Loop through each accession number and download the sequence
    for accession_number in accession_numbers:
        handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta", retmode="text")
        record = SeqIO.read(handle, "fasta")
        handle.close()

        # Append the sequence to the list
        chromosome_sequences.append(record)

    # Save all sequences to a single file
    with open(output_file, "w") as f:
        SeqIO.write(chromosome_sequences, f, "fasta")

    print(f"Chromosomes downloaded and saved to {output_file}")
    return chromosome_sequences

accession_list = ["NC_001133.9", "NC_001134.8", "NC_001135.5", "NC_001136.10", "NC_001137.3",
                 "NC_001138.5", "NC_001139.9", "NC_001140.6", "NC_001141.2", "NC_001142.9",
                 "NC_001143.9", "NC_001144.5", "NC_001145.3", "NC_001146.8", "NC_001147.5", "NC_001148.4"]
output_file = "yeast_genome.fasta"

# Download and save chromosomes to a single file
download_and_save_chromosomes(accession_list, output_file)


Chromosomes downloaded and saved to yeast_genome.fasta


[SeqRecord(seq=Seq('CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACA...GGG'), id='NC_001133.9', name='NC_001133.9', description='NC_001133.9 Saccharomyces cerevisiae S288C chromosome I, complete sequence', dbxrefs=[]),
 SeqRecord(seq=Seq('AAATAGCCCTCATGTACGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTT...TGT'), id='NC_001134.8', name='NC_001134.8', description='NC_001134.8 Saccharomyces cerevisiae S288C chromosome II, complete sequence', dbxrefs=[]),
 SeqRecord(seq=Seq('CCCACACACCACACCCACACCACACCCACACACCACACACACCACACCCACACA...GTG'), id='NC_001135.5', name='NC_001135.5', description='NC_001135.5 Saccharomyces cerevisiae S288C chromosome III, complete sequence', dbxrefs=[]),
 SeqRecord(seq=Seq('ACACCACACCCACACCACACCCACACACACCACACCCACACACCACACCCACAC...TGG'), id='NC_001136.10', name='NC_001136.10', description='NC_001136.10 Saccharomyces cerevisiae S288C chromosome IV, complete sequence', dbxrefs=[]),
 SeqRecord(seq=Seq('CGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTTCAACCAAAAGCTACTT...TTT'), id='NC_00

In [19]:

def find_ori_using_ars(accession_number): 
    # Use Entrez to fetch the genome sequence
    handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta", retmode="text")
    record = SeqIO.read(handle, "fasta")
    handle.close()

    # Get the DNA sequence
    dna_seq = str(record.seq)

    # Define ARS sequence pattern
    ars_seq = "(A/T)TTTA(C/T)(A/G)TTT(A/T)"
    ars_pattern = re.compile("[AT]TTTA[CT][AG]TTT[AT]")

    # Find ARS sequence matches
    ars_matches = [match.start() for match in ars_pattern.finditer(dna_seq)]

    matching_entries = [] 
    first_ori_position = None

    # Process matching entries
    if ars_matches:
        for ori_posn in ars_matches:
            matching_entry = record[ori_posn : ori_posn + len(ars_seq)]
            matching_entries.append((matching_entry.id, str(matching_entry.seq), ori_posn))

            if first_ori_position is None:
                first_ori_position = ori_posn

    return matching_entries, first_ori_position

def find_ori_for_all_chromosomes(file_path):
    # Read the FASTA file containing all chromosomes
    records = SeqIO.to_dict(SeqIO.parse(file_path, "fasta"))

    # Initialize a dictionary to store results for each chromosome
    ori_results = {}

    for chromosome_id, record in records.items():
        # Find ORI for the current chromosome
        matching_entries, first_ori_position = find_ori_using_ars(record.id)

        # Print total entries for the current chromosome
        total_entries = len(matching_entries)
        print(f"\nChromosome {chromosome_id}: Total Entries: {total_entries}")

        # Store results for the current chromosome in the dictionary
        ori_results[chromosome_id] = {
            'matching_entries': matching_entries,
            'first_ori_position': first_ori_position,
            'total_entries': total_entries
        }
        
    return ori_results

# Usage
file_path = "yeast_genome.fasta"
ori_results = find_ori_for_all_chromosomes(file_path)
# Print results for each chromosome
for chromosome_id, results in ori_results.items():
    matching_entries = results['matching_entries']
    first_ori_position = results['first_ori_position']

    if matching_entries:
        print(f"\nChromosome {chromosome_id}: Matching entries:")
        for entry_id, sequence, ori_posn in matching_entries:
            print(f"  Sequence: {sequence} - Start Index: {ori_posn}")

        print(f" First Origin of Replication (ORI) position: {first_ori_position}")
    else:
        print(f"\nChromosome {chromosome_id}: No matching entries found.")



Chromosome NC_001133.9: Total Entries: 7

Chromosome NC_001134.8: Total Entries: 29

Chromosome NC_001135.5: Total Entries: 11

Chromosome NC_001136.10: Total Entries: 57

Chromosome NC_001137.3: Total Entries: 25

Chromosome NC_001138.5: Total Entries: 6

Chromosome NC_001139.9: Total Entries: 30

Chromosome NC_001140.6: Total Entries: 15

Chromosome NC_001141.2: Total Entries: 25

Chromosome NC_001142.9: Total Entries: 31

Chromosome NC_001143.9: Total Entries: 23

Chromosome NC_001144.5: Total Entries: 32

Chromosome NC_001145.3: Total Entries: 31

Chromosome NC_001146.8: Total Entries: 27

Chromosome NC_001147.5: Total Entries: 30

Chromosome NC_001148.4: Total Entries: 27

Chromosome NC_001133.9: Matching entries:
  Sequence: ATTTATGTTTAGAGAGTTTATGGTCAG - Start Index: 17149
  Sequence: ATTTATATTTAGTGGGAGCAAAACAGT - Start Index: 159953
  Sequence: ATTTACGTTTAACCTAATTAGGAAACG - Start Index: 171816
  Sequence: TTTTATGTTTTCTTATGATTGAATTAT - Start Index: 176236
  Sequence: ATTTATATTTA