In [22]:
# Download Genome Sequence: First, download the genome sequence of Saccharomyces cerevisiae from the NCBI
# database using the provided species name.

# Extract DNA Sequence: After obtaining the genome record, I extract the DNA sequence from the record.

# ARS Consensus Sequence: 
# The chosen ARS consensus sequence is "WTTTAYRTTTW."
# This sequence represents a common motif found in Autonomously Replicating Sequences (ARS) in Saccharomyces cerevisiae.
# The use of degenerate bases (W, Y, R) allows for flexibility in base recognition at certain positions, accommodating variations in the 
# actual ARS sequences.
# The sequence "WTTTAYRTTTW" is a consensus sequence representing an Autonomously Replicating Sequence (ARS) 
# in Saccharomyces cerevisiae. The ARS consensus sequence is a pattern that tends to be present in the origins
#  of replication in yeast genomes.

# "W" represents either adenine (A) or thymine (T).
# "T" is thymine.
# "A" is adenine.
# "Y" represents pyrimidine, which is either cytosine (C) or thymine (T).
# "R" represents purine, which is either adenine (A) or guanine (G).
# "T" is thymine.
# "T" is thymine.
# "T" is thymine.
# "W" represents either adenine (A) or thymine (T).
# This consensus sequence allows for some degree of variation in the bases at certain positions, making it more flexible and capturing 
# potential variations in the actual ARS sequences found in the Saccharomyces cerevisiae genome. It is a commonly used approach in 
# bioinformatics to define consensus sequences that capture the essential features of a motif or pattern while 
# allowing for some variability in the actual sequences.

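# Search for ARS in DNA Sequence: The chosen ARS consensus sequence is written as the regular expression
# "[AT]TTTA[CT][AG]TTT[AT]" and matched against each downloaded DNA sequence with Python's re module.
# (An earlier draft, kept commented out at the end of this notebook, used the nt_search function from Biopython's
# SeqUtils module for the same search.)
# Each degenerate base becomes a character class, so the search still allows for variations in the actual sequences.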

# Identification of ORI: If matches are found, the positions of the ARS consensus sequence are extracted. 
# The script assumes the first position as the potential Origin of Replication (ORI).
 
# Set up an email address for NCBI access.
# Use the Biopython library to search the NCBI nucleotide database for the species name "Saccharomyces cerevisiae".
# Download the matching records in FASTA format and save them to the file yeast_genome.fasta.

In [23]:
from Bio import Entrez, SeqIO
def download_genome_seq(species, op_file):
    """Search the NCBI nucleotide database for ``species`` and save the hits as FASTA.

    Parameters:
    - species: Search term passed to Entrez.esearch (the species name).
    - op_file: Path of the FASTA file to write; an existing file is overwritten.

    Returns:
    - The list of accession IDs returned by the search, or None when the search
      returned no IDs (in that case nothing is written to ``op_file``).
    """
    Entrez.email = "sargun22450@iiitd.ac.in"

    # NOTE(review): the term carries no field qualifier such as [Organism]; the accessions
    # printed by this notebook (NG_/NM_ prefixes) do not look like yeast chromosome
    # records -- confirm that the query returns the intended genome.
    handle = Entrez.esearch(db="nucleotide", term=species, idtype="acc")
    try:
        record = Entrez.read(handle)
    finally:
        # Close the search handle even if parsing the response fails.
        handle.close()

    acc_no = record['IdList']
    if not acc_no:
        print(f"no genome sequence found for {species}")
        return None

    # The whole ID list is passed to efetch, so every hit ends up in the same FASTA file.
    fasta_handle = Entrez.efetch(db="nucleotide", id=acc_no, rettype="fasta", retmode="text")
    try:
        with open(op_file, "w") as f:
            f.write(fasta_handle.read())
    finally:
        # Close the download handle even if reading or writing fails.
        fasta_handle.close()

    print(f"genome sequence downloaded and saved to {op_file}")
    print(f"accession number: {acc_no}")

    return acc_no

# Search NCBI for the species name and save the fetched records to yeast_genome.fasta.
# acc_no receives the accession IDs returned by the search, or None if nothing matched.
acc_no = download_genome_seq("Saccharomyces cerevisiae", "yeast_genome.fasta") 

genome sequence downloaded and saved to yeast_genome.fasta
accession number: ['NG_013229.2', 'NG_031987.2', 'NM_005732.4', 'NM_002911.4', 'NM_001318511.2', 'NM_015148.4', 'NM_001349238.2', 'NM_015972.4', 'NM_001362877.2', 'NM_001351614.2', 'NM_001297549.2', 'NM_006395.3', 'NM_001351617.2', 'NM_001104546.2', 'NM_001317930.2', 'NM_001126130.2', 'NM_001126129.2', 'NM_001007468.3', 'NM_001351632.2', 'NM_001252119.2']


In [24]:
# import regex as re
# from Bio import SeqIO
# from Bio import Entrez

# def find_ori_using_ars(accession_number): 
#     handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta", retmode="text")
#     record = SeqIO.read(handle, "fasta")
#     handle.close()

#     dna_seq = str(record.seq)
  
#     ars_seq = "(A/T)TTTA(C/T)(A/G)TTT(A/T)" 
 
#     ars_pattern = re.compile("[AT]TTTA[CT][AG]TTT[AT]")
 
#     ars_matches = [match.start() for match in ars_pattern.finditer(dna_seq)]

#     matching_entries = [] 
#     first_ori_position = None

#     if ars_matches: #if the ars_seq matches a dna seq entry 
#         print(f"ARS sequence ({ars_seq})found at positions(1 based idxing as given in yeast_genome.fasta file ): {ars_matches}")

#         for ori_posn in ars_matches: 
#             # matching_entry = record[ori_posn : ori_posn + len(ars_seq)] 
#             matching_entry = record[ori_posn : ori_posn + 11] 
#             matching_entries.append((matching_entry.id, str(matching_entry.seq), ori_posn))

#             if first_ori_position is None: #first as ori posn
#                 first_ori_position = ori_posn

#     else:
#         print("No ARS sequence found in the given sequence.") # if no ARS sequence was found in that sequenc

#     return matching_entries, first_ori_position

# acc_no = download_genome_seq("Saccharomyces cerevisiae", "yeast_genome.fasta")
# # //acc_no is a list of all entries
# for i in range(len(acc_no)): 
#     accession_number=acc_no[i]
#     matching_entries, first_ori_position = find_ori_using_ars(accession_number)
    
#     if matching_entries:
#         print("Matching entries:")
#         for entry_id, sequence, ori_posn in matching_entries:
#             print(f"Sequence: {sequence} - Start Index: {ori_posn}")
    
#         print(f"First Origin of Replication (ORI) position : {first_ori_position }")
#     else:
#         print("no matching entries found")

import re
from Bio import SeqIO
from Bio import Entrez

def find_ori_using_ars_for_all_chromosomes(file_path):
    """Scan every record of a FASTA file for the ARS consensus motif WTTTAYRTTTW.

    Parameters:
    - file_path: Path to a FASTA file; each record in it is scanned independently.

    Returns:
    - A dict keyed by record ID. Each value is a dict with
      'matching_entries': a list of (record ID, matched sequence, 0-based start index)
      tuples, one per match, and
      'first_ori_position': the 0-based start index of the first match, or None
      when the record contains no match.
    """
    # Human-readable form of the consensus; used only in the progress message below.
    ars_seq = "(A/T)TTTA(C/T)(A/G)TTT(A/T)"
    # Compiled once for all records. IGNORECASE so lower-case bases in the file also match.
    ars_pattern = re.compile("[AT]TTTA[CT][AG]TTT[AT]", re.IGNORECASE)

    # Read the FASTA file containing all chromosomes
    records = SeqIO.to_dict(SeqIO.parse(file_path, "fasta"))

    # Results for each chromosome, keyed by record ID
    ori_results = {}

    for chromosome_id, record in records.items():
        dna_seq = str(record.seq)

        # (start, end) of every non-overlapping match; start is 0-based, end is exclusive.
        ars_spans = [match.span() for match in ars_pattern.finditer(dna_seq)]
        ars_matches = [start for start, _ in ars_spans]

        if ars_matches:
            print(f"Chromosome {chromosome_id}: ARS sequence ({ars_seq}) found at positions (0-based indexing): {ars_matches}")
        else:
            print(f"Chromosome {chromosome_id}: No ARS sequence found.")

        # Slice by the match span so exactly the matched motif is reported; the length of
        # the display string ars_seq is longer than the motif and must not be used here.
        matching_entries = []
        for start, end in ars_spans:
            matching_entry = record[start:end]
            matching_entries.append((matching_entry.id, str(matching_entry.seq), start))

        # The first match (if any) is taken as the putative origin of replication.
        ori_results[chromosome_id] = {
            'matching_entries': matching_entries,
            'first_ori_position': ars_matches[0] if ars_matches else None
        }

    return ori_results

# Run the ARS scan over the FASTA file written by the download cell.
file_path = "yeast_genome.fasta"
ori_results = find_ori_using_ars_for_all_chromosomes(file_path)

# Report, record by record, every hit followed by the first hit (taken as the putative ORI).
for chromosome_id, results in ori_results.items():
    matching_entries = results['matching_entries']
    first_ori_position = results['first_ori_position']

    # Records without any hit get a one-line notice and nothing else.
    if not matching_entries:
        print(f"\nChromosome {chromosome_id}: No matching entries found.")
        continue

    print(f"\nChromosome {chromosome_id}: Matching entries:")
    for entry_id, sequence, ori_posn in matching_entries:
        print(f"  Sequence: {sequence} - Start Index: {ori_posn}")

    print(f"  First Origin of Replication (ORI) position: {first_ori_position}")


Chromosome NG_013229.2: ARS sequence ((A/T)TTTA(C/T)(A/G)TTT(A/T)) found at positions (1-based indexing): [159, 423, 10087, 23067, 24105, 36462, 38702, 55048, 82587, 90507, 101874, 108868]
Chromosome NG_031987.2: No ARS sequence found.
Chromosome NM_005732.4: No ARS sequence found.
Chromosome NM_002911.4: No ARS sequence found.
Chromosome NM_001318511.2: No ARS sequence found.
Chromosome NM_015148.4: No ARS sequence found.
Chromosome NM_001349238.2: No ARS sequence found.
Chromosome NM_015972.4: No ARS sequence found.
Chromosome NM_001362877.2: No ARS sequence found.
Chromosome NM_001351614.2: No ARS sequence found.
Chromosome NM_001297549.2: No ARS sequence found.
Chromosome NM_006395.3: No ARS sequence found.
Chromosome NM_001351617.2: No ARS sequence found.
Chromosome NM_001104546.2: No ARS sequence found.
Chromosome NM_001317930.2: No ARS sequence found.
Chromosome NM_001126130.2: No ARS sequence found.
Chromosome NM_001126129.2: No ARS sequence found.
Chromosome NM_001007468.3: No

In [25]:
 
# Reference:(https://en.wikipedia.org/wiki/Autonomously_replicating_sequence#:~:text=An%20autonomously%20replicating%20sequence%20(ARS,their%20effect%20on%20plasmid%20stability.)

# Using the hint given in the question (S. cerevisiae has an Autonomously Replicating Sequence with features you can search),
# the ARS motif that contains the origin of replication in the yeast genome is "WTTTAYRTTTW", found in Saccharomyces cerevisiae,
# where
# "W" represents either adenine (A) or thymine (T).
# "T" is thymine.
# "A" is adenine
# "Y" represents either cytosine (C) or thymine (T).
# "R" represents either adenine (A) or guanine (G)

# So in this case, the output from the yeast_genome.gb file comes out to be:
        # "ARS consensus sequence (WTTTAYRTTTW) found at positions: [56946, 65503, 68265]"
        # "Origin of Replication (ORI) position: 56946"

# This means that there are three occurrences matching the ARS consensus "WTTTAYRTTTW", starting at
# positions 56946, 65503, and 68265 of the sequence in that file. All three positions index into the same
# record, so they are three sites within one sequence rather than three different chromosomes.
# The first copy starts at 56946,
# which, after cross-checking against the yeast_genome.gb file, was found to be ctgtattttt
 
 
 
# Then I used the nt_search function from the Bio.SeqUtils module, as done in the tutorials, to search for occurrences of
# the chosen ARS consensus sequence in the DNA sequence.

# Identification of ORI: If matches are found, the positions of the ARS consensus sequence are extracted.
# I have taken the first position as the potential Origin of Replication (ORI).
 

<!-- Without Accession Number:

In the first version, the script searches the NCBI nucleotide database for the species name "Saccharomyces cerevisiae" using Entrez.esearch.
The search result provides a list of IDs, and the script fetches the complete genome record using the first ID with Entrez.efetch.

With Accession Number:

In the second version, the script directly uses the provided accession number to fetch the genome record from the NCBI nucleotide database using Entrez.efetch.
In both cases, the subsequent steps for finding the Origin of Replication (ORI) using Autonomously Replicating Sequence (ARS) features remain the same. The ARS consensus sequence is searched in the obtained genome sequence, and the ORI position is determined based on the identified positions of the ARS consensus sequence.

The primary advantage of using the accession number directly is that it skips the initial search step, making the process more efficient when you already have the unique identifier for the genome record. -->

In [26]:
# from Bio import SeqIO
# from collections import Counter

# def find_most_common_sequence(dna_sequence, window_size):
#     """
#     Find the most common DNA sequence or motif using a sliding window approach.

#     Parameters:
#     - dna_sequence: The DNA sequence in which to search for the most common motif.
#     - window_size: The size of the sliding window.

#     Returns:
#     - most_common_sequence: The most common DNA sequence or motif.
#     """
#     sequences = [dna_sequence[i:i + window_size] for i in range(len(dna_sequence) - window_size + 1)]
#     counter = Counter(sequences)
#     most_common_sequence, _ = counter.most_common(1)[0]

#     return most_common_sequence

# # Read the GenBank file and extract the DNA sequence
# genbank_file = "yeast_genome.gb"
# record = SeqIO.read(genbank_file, "genbank")
# dna_sequence = str(record.seq)

# # Define the size of the sliding window
# window_size = 10  # You can adjust this based on your analysis

# # Find the most common DNA sequence in the genome
# most_common_sequence = find_most_common_sequence(dna_sequence, window_size)

# # Print the results
# print(f"The most common DNA sequence or motif in the genome: {most_common_sequence}")


In [27]:
# import regex as re
# from Bio import SeqIO
# from Bio import Entrez

# def find_ori_using_ars(accession_number):
#     """
#     Find the Origin of Replication (ORI) in the yeast genome using Autonomously Replicating Sequence (ARS) features.

#     Parameters:
#     - accession_number: The accession number of the yeast genome sequence in the NCBI database.

#     Returns:
#     - matching_entries: A list of tuples containing entry ID, sequence, and starting index for each matching entry.
#     - first_ori_position: The 0-based index of the first origin of replication position.
#     """
#     genbank_handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="gb", retmode="text")
#     record = SeqIO.read(genbank_handle, "genbank")
#     genbank_handle.close()

#     dna_sequence = str(record.seq)

#     # ARS consensus sequence in S. cerevisiae
#     ars_seq = "WTTTAYRTTTW"

#     # Create a regular expression pattern for the ARS consensus sequence
#     ars_pattern = re.compile("[AT]TTTA[CT][AG]TTT[AT]")

#     # Search for the ARS consensus sequence using regex
#     ars_matches = [match.start() for match in ars_pattern.finditer(dna_sequence)]

#     matching_entries = []
#     first_ori_position = None

#     if ars_matches:
#         # Print the ARS consensus sequence and its positions
#         print(f"ARS consensus sequence ({ars_seq}) found at positions(1 based idxing as given in yeastgemone.gb file ): {ars_matches}")

#         # Check if positions were found
#         if ars_matches:
#             for ori_position in ars_matches:
#                 # Extract the matching entry
#                 matching_entry = record[ori_position : ori_position + len(ars_seq)]
#                 matching_entries.append((matching_entry.id, str(matching_entry.seq), ori_position))

#                 # Store the first ORI position
#                 if first_ori_position is None:
#                     first_ori_position = ori_position

#     else:
#         # Print a message if no ARS consensus sequence was found in the given sequence
#         print("No ARS consensus sequence found in the given sequence.")

#     return matching_entries, first_ori_position



In [28]:
# from Bio import SeqIO
# from Bio.SeqUtils import nt_search
# def find_ars(genome_file):
#     # Read the genome sequence from the GenBank file
#     with open(genome_file, "r") as f:
#         record = SeqIO.read(f, "genbank")

#     # Extract the DNA sequence and convert it to a string
#     dna_sequence = str(record.seq)

#     # ARS consensus sequence in S. cerevisiae
#     ars_consensus = "WTTTAYRTTTW"

#     # Additional features associated with ARS
#     additional_features = ["AATAAA", "TTTTT", "GCGC"]

#     # Search for the ARS consensus sequence in the DNA sequence
#     ars_matches = nt_search(dna_sequence, ars_consensus)

#     # Check if any matches were found for the ARS consensus sequence
#     if ars_matches:
#         # Extract positions from the list (excluding the ARS consensus sequence itself)
#         positions = [pos for pos in ars_matches if isinstance(pos, int)]

#         # Print the ARS consensus sequence and its positions
#         print(f"ARS consensus sequence ({ars_consensus}) found at positions: {positions}")

#         # Check if additional features are present near the ARS consensus sequence
#         for feature in additional_features:
#             feature_matches = nt_search(dna_sequence, feature)
#             if any(isinstance(pos, int) and pos in range(positions[0] - 20, positions[0] + 20) for pos in feature_matches):
#                 print(f"Additional feature ({feature}) found near the ARS consensus sequence.")

#     else:
#         # Print a message if no ARS consensus sequence was found in the given sequence
#         print("No ARS consensus sequence found in the given sequence.")

# # Example usage
# find_ars("yeast_genome.gb")
