In [37]:
from Bio import Entrez, SeqIO

def download_and_save_chromosomes(accession_numbers, output_file):
    # Set your email for Entrez
    Entrez.email = "your_email@example.com"

    # List to store the downloaded sequences
    chromosome_sequences = [] 
    # Loop through each accession number and download the sequence
    for accession_number in accession_numbers:
        handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta", retmode="text")
        record = SeqIO.read(handle, "fasta")
        handle.close()

        # Append the sequence to the list
        chromosome_sequences.append(record)

    # Save all sequences to a single file
    with open(output_file, "w") as f:
        SeqIO.write(chromosome_sequences, f, "fasta")

    print(f"Chromosomes downloaded and saved to {output_file}")
    return chromosome_sequences

accession_list = ["NC_001133.9", "NC_001134.8", "NC_001135.5", "NC_001136.10", "NC_001137.3",
                 "NC_001138.5", "NC_001139.9", "NC_001140.6", "NC_001141.2", "NC_001142.9",
                 "NC_001143.9", "NC_001144.5", "NC_001145.3", "NC_001146.8", "NC_001147.5", "NC_001148.4"]
output_file = "yeast_genome.fasta"

# Download and save chromosomes to a single file
download_and_save_chromosomes(accession_list, output_file)


Genome sequence downloaded and saved to yeast_genome.fasta
Accession number: NG_013229.2


In [38]:
import re
from Bio import SeqIO
from Bio import Entrez

def find_ori_using_ars_for_all_chromosomes(file_path):
    # Read the FASTA file containing all chromosomes
    records = SeqIO.to_dict(SeqIO.parse(file_path, "fasta"))

    # Initialize a dictionary to store results for each chromosome
    ori_results = {}

    for chromosome_id, record in records.items():
        # Get the DNA sequence for the current chromosome
        dna_seq = str(record.seq)

        # Define ARS sequence pattern
        ars_seq = "(A/T)TTTA(C/T)(A/G)TTT(A/T)"
        ars_pattern = re.compile("[AT]TTTA[CT][AG]TTT[AT]")

        # Find ARS sequence matches
        ars_matches = [match.start() for match in ars_pattern.finditer(dna_seq)]

        matching_entries = []
        first_ori_position = None

        # Process matching entries for the current chromosome
        if ars_matches:
            print(f"Chromosome {chromosome_id}: ARS sequence ({ars_seq}) found at positions (1-based indexing): {ars_matches}")

            for ori_posn in ars_matches:
                matching_entry = record[ori_posn: ori_posn + len(ars_seq)]
                matching_entries.append((matching_entry.id, str(matching_entry.seq), ori_posn))

                if first_ori_position is None:
                    first_ori_position = ori_posn

        else:
            print(f"Chromosome {chromosome_id}: No ARS sequence found.")

        # Store results for the current chromosome in the dictionary
        ori_results[chromosome_id] = {
            'matching_entries': matching_entries,
            'first_ori_position': first_ori_position
        }

    return ori_results

# Usage
file_path = "yeast_genome.fasta"
ori_results = find_ori_using_ars_for_all_chromosomes(file_path)

# Print results for each chromosome
for chromosome_id, results in ori_results.items():
    matching_entries = results['matching_entries']
    first_ori_position = results['first_ori_position']

    if matching_entries:
        print(f"\nChromosome {chromosome_id}: Matching entries:")
        for entry_id, sequence, ori_posn in matching_entries:
            print(f"  Sequence: {sequence} - Start Index: {ori_posn}")

        print(f"  First Origin of Replication (ORI) position: {first_ori_position}")
    else:
        print(f"\nChromosome {chromosome_id}: No matching entries found.")


ARS sequence ((A/T)TTTA(C/T)(A/G)TTT(A/T)) found at positions (1-based indexing): [159, 423, 10087, 23067, 24105, 36462, 38702, 55048, 82587, 90507, 101874, 108868]
Matching entries:
Sequence: TTTTATATTTTTAACTTTTCCCACAAT - Start Index: 159
Sequence: ATTTATATTTTATTCAGGTCCAGGATT - Start Index: 423
Sequence: TTTTATATTTTTAGTAGAGACATTGTT - Start Index: 10087
Sequence: TTTTATGTTTTTGTGGAGGCAAGGTCT - Start Index: 23067
Sequence: TTTTATATTTTTTCAGAGAAAGGGTCT - Start Index: 24105
Sequence: ATTTATATTTAATGATATTCAGTATGA - Start Index: 36462
Sequence: TTTTATATTTTTAGTAGAGACGGGGTT - Start Index: 38702
Sequence: TTTTACATTTTGATGGCTCCTATACAC - Start Index: 55048
Sequence: ATTTATGTTTTTATAAGCTGGTACATT - Start Index: 82587
Sequence: ATTTACATTTTTAAAATATTGTGTGTT - Start Index: 90507
Sequence: TTTTACATTTTTTAATGGTTAAAAAAA - Start Index: 101874
Sequence: TTTTATGTTTTTTAAAAATGTATGTAG - Start Index: 108868
First Origin of Replication (ORI) position: 159




- Reference:
1. https://en.wikipedia.org/wiki/Autonomously_replicating_sequence-:~:text=An%20autonomously%20replicating%20sequence%20(ARS,their%20effect%20on%20plasmid%20stability.
2. https://pubmed.ncbi.nlm.nih.gov/9441849/#:~:text=Autonomously%20replicating%20sequence%20(ARS)%20elements%20were%20first%20identified%20in%20the,extrachromosomal%20maintenance%20of%20plasmid%20DNA.
3. https://www.sciencedirect.com/topics/medicine-and-dentistry/autonomously-replicating-sequence
4. https://en.wikipedia.org/wiki/Baker%27s_yeast
5. https://en.wikipedia.org/wiki/Autonomously_replicating_sequence
6. https://www.ncbi.nlm.nih.gov/search/all/?term=Saccharomyces+cerevisiae
- using the hint given in the q, (S. cerevisiae has an Autonomously Replicating Sequence with features you can search.)
- the ARS that contains the origin of replication in the yeast genome is - "(A/T)TTTA(C/T)(A/G)TTT(A/T)" found in Saccharomyces cerevisiae which i derived from the sequence given from the above links
 where 
- the "/" slash shows that it could be either the first base or the second one which will be checked by the code for matching the seq from yeastgenome.gb file.

- "T" is thymine.
- "A" is adenine
- "C" is cytosine (C) 
- "G" is guanine (G)

- So after runnin above code ouput is coming out to be-
 


- This means that there are three occurences matching the ars_seq "(A/T)TTTA(C/T)(A/G)TTT(A/T)" and this seq is present in three different chromosomes of the yeast genome, starting at positions 56946, 65503, and 68265.
- The first copy starts at 56946 which after crosschecking from yeast_genome.gb file, is correct- TTTTATGTTTA which matches our ars sequence.
 
 this is a short snippet frm yeast_genome.gb file-

     56941 ctgtat'tttt atgttta'att ataacccctt taggattata atttaaatta atttaaatat
 
 where 56941 denoted the first nucleotides seq starting idx(1 based) and now if we use slicing just to crosscheck our ans 
 "ttttatgttta" is extracted , which starts on the "t" at position 56946.

- Then i used the nt_search function as done in tuts - from Bio.SeqUtils module to search for occurrences of ars_seq "(A/T)TTTA(C/T)(A/G)TTT(A/T)" in the DNA sequence.

- If matches are found, i store them in "matching_entries" list and also their coresponding position from the .gb file

- finally for the ans, I hv taken the first position as the Origin of Replication (ori).
 