### Downloading Fasta files
#### Taken help from the Tutorials

In [None]:
from Bio import Entrez, SeqIO

def download_fasta_sequence(email, accession_number, output_file):
    """Download one nucleotide record from NCBI Entrez as FASTA text and save it.

    Args:
        email: Contact address assigned to ``Entrez.email`` before the request.
        accession_number: Identifier passed to ``Entrez.efetch`` as ``id``.
        output_file: Path of the file the FASTA text is written to
            (overwritten if it already exists).
    """
    Entrez.email = email
    handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta", retmode="text")
    try:
        print(handle.url)
        sequence = handle.read()
    finally:
        # Release the network handle even if reading the response fails.
        handle.close()
    with open(output_file, "w") as f:
        f.write(sequence)

# Replace the email with your own; the list holds one accession number per
# sequence to download, with the mitochondrial genome as the last entry.
email = "sameer22439@iiitd.ac.in"
accession_number = [
    "NC_001133.9", "NC_001134.8", "NC_001135.5", "NC_001136.10",
    "NC_001137.3", "NC_001138.5", "NC_001139.9", "NC_001140.6",
    "NC_001141.2", "NC_001142.9", "NC_001143.9", "NC_001144.5",
    "NC_001145.3", "NC_001146.8", "NC_001147.6", "NC_001148.4",
    "NC_001224.1",
]  # List of Accession Number


# The first 16 entries are saved as output_1.fasta .. output_16.fasta; the
# 17th entry is saved as output_MT.fasta.
for i, accession in enumerate(accession_number):
    output_file = "output_MT.fasta" if i == 16 else f"output_{i+1}.fasta"
    download_fasta_sequence(email, accession, output_file)

### Method 1: Using ARS = 5'-WTTTATRTTTW-3', where W = A or T and R = A or G. The motif has 3 ambiguous positions with 2 possible nucleotides each, so there are 2^3 = 8 possible sequences.
#### Taken help from the documentation available on YeastGenome Site = "https://www.yeastgenome.org/locus/S000118318"
#### Taken help from tutorials

In [None]:
from Bio import Entrez, SeqIO
import re

def ori_in_sequences(ithChromosome):
    """Report every occurrence of the ARS motif 5'-WTTTATRTTTW-3' in one chromosome.

    Reads ``output_{ithChromosome}.fasta`` from the current directory, treating
    the first line as the FASTA header and the remaining lines as the sequence.
    For each hit it prints the 0-based start index, the 0-based inclusive end
    index, and the matched 11-nucleotide motif.

    Args:
        ithChromosome: Chromosome number used to build the file name and the
            printed messages.

    Returns:
        The number of motif occurrences found (overlapping ones included).
    """
    output_file = f"output_{ithChromosome}.fasta"

    with open(output_file, 'r') as file:  # Open file in read mode
        lines = file.readlines()

    # Everything after the header line, with line breaks removed.
    genome_sequence = ''.join(line.strip() for line in lines[1:])

    # The motif sits inside a lookahead so a hit does not consume characters:
    # two motifs that share their flanking W base(s) are then both reported.
    pattern_for_ARS = re.compile(r'(?=([AT]TTTAT[AG]TTT[AT]))')

    print(f"Chromosome {ithChromosome} of Baker's Yeast (Saccharomyces cerevisiae) : ")
    count = 0
    for count, hit in enumerate(pattern_for_ARS.finditer(genome_sequence), start=1):
        motif = hit.group(1)
        start = hit.start()
        # The end index is inclusive, so it is derived from the motif length.
        print(f"ORI - {count} : Start Index - {start}; End Index - {start + len(motif) - 1}")
        print(motif)

    if count == 0:
        print(f"There is no ORI in the Chromosome {ithChromosome} of Baker's Yeast (Saccharomyces cerevisiae)")

    return count

def ori_in_sequences_in_Mitochondria():
    """Report every occurrence of the ARS motif 5'-WTTTATRTTTW-3' in the mitochondrial genome.

    Reads ``output_MT.fasta`` from the current directory, treating the first
    line as the FASTA header and the remaining lines as the sequence. For each
    hit it prints the 0-based start index, the 0-based inclusive end index,
    and the matched 11-nucleotide motif.

    Returns:
        The number of motif occurrences found (overlapping ones included).
    """
    output_file = "output_MT.fasta"

    with open(output_file, 'r') as file:  # Open file in read mode
        lines = file.readlines()

    # Everything after the header line, with line breaks removed.
    genome_sequence = ''.join(line.strip() for line in lines[1:])

    # The motif sits inside a lookahead so a hit does not consume characters:
    # two motifs that share their flanking W base(s) are then both reported.
    pattern_for_ARS = re.compile(r'(?=([AT]TTTAT[AG]TTT[AT]))')

    print(f"Mitochondria of Baker's Yeast (Saccharomyces cerevisiae) : ")
    count = 0
    for count, hit in enumerate(pattern_for_ARS.finditer(genome_sequence), start=1):
        motif = hit.group(1)
        start = hit.start()
        # The end index is inclusive, so it is derived from the motif length.
        print(f"ORI - {count} : Start Index - {start}; End Index - {start + len(motif) - 1}")
        print(motif)

    if count == 0:
        print("There is no ORI in Mitochondria of Baker's Yeast (Saccharomyces cerevisiae)")

    return count

def main():
    """Run the motif scan on chromosomes 1-16 and the mitochondrial genome.

    Each per-sequence report is printed by the scan functions; a blank gap is
    printed after every chromosome, and the grand total of hits is printed last.
    """
    total_hits = 0
    for chromosome_number in range(1, 17):
        total_hits += ori_in_sequences(chromosome_number)
        print("\n\n")
    total_hits += ori_in_sequences_in_Mitochondria()
    print(f"\n\nThere are total {total_hits} ORI's in the total genome of Baker's Yeast (Saccharomyces cerevisiae)")


# Run the scan defined in this cell; it expects the output_*.fasta files to exist in the working directory.
main()
       

### Method 2: Using ARS = 5'-WTTTAYRTTTW-3', where W = A or T, Y = C or T and R = A or G. The motif has 4 ambiguous positions with 2 possible nucleotides each, so there are 2^4 = 16 possible sequences.
### More Precise than Method 1
#### Taken help from the documentation available on YeastGenome Site = "https://www.yeastgenome.org/locus/S000118318"
#### Taken help from tutorials

In [None]:
from Bio import Entrez, SeqIO
import re

def ori_in_sequences(ithChromosome):
    """Report every occurrence of the ARS motif 5'-WTTTAYRTTTW-3' in one chromosome.

    Reads ``output_{ithChromosome}.fasta`` from the current directory, treating
    the first line as the FASTA header and the remaining lines as the sequence.
    For each hit it prints the 0-based start index, the 0-based inclusive end
    index, and the matched 11-nucleotide motif.

    Args:
        ithChromosome: Chromosome number used to build the file name and the
            printed messages.

    Returns:
        The number of motif occurrences found (overlapping ones included).
    """
    output_file = f"output_{ithChromosome}.fasta"

    with open(output_file, 'r') as file:  # Open file in read mode
        lines = file.readlines()

    # Everything after the header line, with line breaks removed.
    genome_sequence = ''.join(line.strip() for line in lines[1:])

    # The motif sits inside a lookahead so a hit does not consume characters:
    # two motifs that share their flanking W base(s) are then both reported.
    pattern_for_ARS = re.compile(r'(?=([AT]TTTA[CT][AG]TTT[AT]))')

    print(f"Chromosome {ithChromosome} of Baker's Yeast (Saccharomyces cerevisiae) : ")
    count = 0
    for count, hit in enumerate(pattern_for_ARS.finditer(genome_sequence), start=1):
        motif = hit.group(1)
        start = hit.start()
        # The end index is inclusive, so it is derived from the motif length.
        print(f"ORI - {count} : Start Index - {start}; End Index - {start + len(motif) - 1}")
        print(motif)

    if count == 0:
        print(f"There is no ORI in the Chromosome {ithChromosome} of Baker's Yeast (Saccharomyces cerevisiae)")

    return count

def ori_in_sequences_in_Mitochondria():
    """Report every occurrence of the ARS motif 5'-WTTTAYRTTTW-3' in the mitochondrial genome.

    Reads ``output_MT.fasta`` from the current directory, treating the first
    line as the FASTA header and the remaining lines as the sequence. For each
    hit it prints the 0-based start index, the 0-based inclusive end index,
    and the matched 11-nucleotide motif.

    Returns:
        The number of motif occurrences found (overlapping ones included).
    """
    output_file = "output_MT.fasta"

    with open(output_file, 'r') as file:  # Open file in read mode
        lines = file.readlines()

    # Everything after the header line, with line breaks removed.
    genome_sequence = ''.join(line.strip() for line in lines[1:])

    # The motif sits inside a lookahead so a hit does not consume characters:
    # two motifs that share their flanking W base(s) are then both reported.
    pattern_for_ARS = re.compile(r'(?=([AT]TTTA[CT][AG]TTT[AT]))')

    print(f"Mitochondria of Baker's Yeast (Saccharomyces cerevisiae) : ")
    count = 0
    for count, hit in enumerate(pattern_for_ARS.finditer(genome_sequence), start=1):
        motif = hit.group(1)
        start = hit.start()
        # The end index is inclusive, so it is derived from the motif length.
        print(f"ORI - {count} : Start Index - {start}; End Index - {start + len(motif) - 1}")
        print(motif)

    if count == 0:
        print("There is no ORI in Mitochondria of Baker's Yeast (Saccharomyces cerevisiae)")

    return count

def main():
    """Run the motif scan on chromosomes 1-16 and the mitochondrial genome.

    Each per-sequence report is printed by the scan functions; a blank gap is
    printed after every chromosome, and the grand total of hits is printed last.
    """
    total_hits = 0
    for chromosome_number in range(1, 17):
        total_hits += ori_in_sequences(chromosome_number)
        print("\n\n")
    total_hits += ori_in_sequences_in_Mitochondria()
    print(f"\n\nThere are total {total_hits} ORI's in the total genome of Baker's Yeast (Saccharomyces cerevisiae)")


# Run the scan defined in this cell; it expects the output_*.fasta files to exist in the working directory.
main()
       

### Method 3: Using AT-rich sites for each chromosome. This code is user-driven: for each chromosome the user must choose the window size and the threshold percentage of AT content that a window has to reach.
#### For example, a window size of 100 and an AT threshold of 80% can be used. (*This only gives a general idea of where the ORIs may be and can give wrong answers; see the results of the other methods for more precise answers.)

In [None]:
def calculate_AT_percentage(sequence):
    """Return the percentage (0-100) of characters in ``sequence`` that are A or T.

    The comparison is case-insensitive. An empty sequence yields ``0.0``
    instead of raising ``ZeroDivisionError``.
    """
    total_length = len(sequence)
    if total_length == 0:
        return 0.0
    AT_count = sum(1 for nucleotide in sequence if nucleotide.upper() in ('A', 'T'))
    return (AT_count / total_length) * 100


def main(output_file, ithChromosome, band_size=None, threshold=None):
    """Print every fixed-size band of a FASTA sequence whose AT content reaches a threshold.

    The sequence is cut into consecutive, non-overlapping bands of
    ``band_size`` nucleotides (the last band may be shorter). For every band
    whose AT percentage is at least ``threshold``, one line is printed with
    the band number, its AT percentage, and its 0-based start and inclusive
    end positions.

    Args:
        output_file: Path of the FASTA file; the first line is treated as the
            header and skipped.
        ithChromosome: Chromosome number, used only in prompts and output.
        band_size: Nucleotides per band. When ``None`` the user is prompted.
        threshold: Minimum AT percentage for a band to be reported. When
            ``None`` the user is prompted.

    Raises:
        ValueError: If the band size is not a positive integer.
    """
    # Read the DNA sequence from the file
    with open(output_file, 'r') as file:
        lines = file.readlines()

    # Everything after the header line, with line breaks removed.
    sequence = ''.join(lines[1:]).replace('\n', '')

    # Ask interactively only for the values the caller did not supply.
    if band_size is None:
        band_size = int(input(f"Enter the band size (number of nucleotides per band) you want to take for Chromosome No. {ithChromosome}: "))
    if threshold is None:
        threshold = float(input(f"Enter the AT threshold percentage you want to take for Chromosome No. {ithChromosome}: "))

    if band_size <= 0:
        raise ValueError("band size must be a positive integer")

    # Check AT% in every band
    count = 1
    for i in range(0, len(sequence), band_size):
        band = sequence[i:i+band_size]
        AT_percentage = calculate_AT_percentage(band)
        if AT_percentage >= threshold:
            # Report the coordinates of the band that was actually measured;
            # len(band) keeps the end correct for a shorter final band.
            band_end = i + len(band) - 1
            print(f"ORI {count} of {ithChromosome} Chromosome: Band {i//band_size + 1}, Actual_AT%: {AT_percentage:.2f}%, Start: {i}, End: {band_end}")
            count += 1



# Scan output_1.fasta .. output_16.fasta, then output_MT.fasta, which is
# reported as number 17.
for i in range(1, 18):
    output_file = f"output_{i}.fasta"
    main("output_MT.fasta" if i == 17 else output_file, i)


### Method 4: Extracting the ORIs directly from GenBank using the accession number
#### This reports the replication origins annotated in the GenBank records themselves, so the results are as reliable as the GenBank annotation.

In [None]:
from Bio import Entrez
from Bio import SeqIO

# Contact address used for the Entrez requests made below (required by NCBI).
# Replace it with your own email address before running.
Entrez.email = "sameer22439@iiitd.ac.in"

def fetch_replication_origins(accession_number, output_file):
    """Print the replication origins annotated in a GenBank record.

    Fetches the GenBank flat file for ``accession_number`` from the NCBI
    nucleotide database, scans its feature table (everything before the
    ``ORIGIN`` line) for ``rep_origin`` features, and prints each feature's
    1-based start/end coordinates followed by the matching slice of the
    sequence stored in ``output_file``.

    Args:
        accession_number: Identifier passed to ``Entrez.efetch`` as ``id``.
        output_file: Path of a FASTA file holding the same sequence; its first
            line is treated as the header and skipped.
    """
    import re  # imported locally: this cell's import block does not provide ``re``

    handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="gb", retmode="text")
    try:
        record = handle.read()
    finally:
        # Release the network handle even if reading the response fails.
        handle.close()

    with open(output_file, 'r') as file:
        lines = file.readlines()

    # Everything after the header line, with line breaks removed.
    sequence = ''.join(lines[1:]).replace('\n', '')

    count = 0

    # Parse the GenBank record
    for line in record.split("\n"):
        if line.startswith("ORIGIN"):
            break
        fields = line.split()
        # Only accept lines whose feature key is rep_origin, so qualifier text
        # that merely mentions the word is not parsed as a location.
        if len(fields) < 2 or fields[0] != "rep_origin":
            continue
        location = "".join(fields[1:])
        # Taking the digits out of the location copes with plain "a..b",
        # "complement(a..b)", partial markers such as "<a..>b", and a single base.
        coordinates = [int(number) for number in re.findall(r"\d+", location)]
        if not coordinates:
            continue
        start, end = coordinates[0], coordinates[-1]
        count += 1
        print(f"ORI - {count} : Start Index - {start} ; End Index - {end}")
        print(sequence[start-1:end])
        if "complement" in location:
            print("Note: this origin is annotated on the complementary strand; the template-strand sequence is shown.")
    if count == 0:
        print("There is no ORI in this Chromosome")

def generate_complementary_strand(sequence):
    """Return the base-by-base complement of ``sequence`` (order is not reversed).

    A<->T and C<->G are swapped for both upper- and lower-case letters; any
    other character (for example ``N``) is kept unchanged.
    """
    # One translation pass instead of growing a string character by character.
    complement_table = str.maketrans("ATCGatcg", "TAGCtagc")
    return sequence.translate(complement_table)

def fetch_replication_origin_of_MT(accession_number, output_file):
    """Print the replication origins annotated in the mitochondrial GenBank record.

    Fetches the GenBank flat file for ``accession_number`` from the NCBI
    nucleotide database and scans its feature table (everything before the
    ``ORIGIN`` line) for ``rep_origin`` features. For each one it prints the
    1-based start/end coordinates; forward-strand features are followed by the
    matching slice of the sequence in ``output_file``, and ``complement(...)``
    features by the base-by-base complement of that slice.

    Args:
        accession_number: Identifier passed to ``Entrez.efetch`` as ``id``.
        output_file: Path of a FASTA file holding the same sequence; its first
            line is treated as the header and skipped.
    """
    import re  # imported locally: this cell's import block does not provide ``re``

    handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="gb", retmode="text")
    try:
        record = handle.read()
    finally:
        # Release the network handle even if reading the response fails.
        handle.close()

    with open(output_file, 'r') as file:  # Open file in read mode
        lines = file.readlines()

    # Everything after the header line, with line breaks removed.
    sequence = ''.join(lines[1:]).replace('\n', '')

    count = 0
    # Parse the GenBank record
    for line in record.split("\n"):
        if line.startswith("ORIGIN"):
            break
        fields = line.split()
        # Only accept lines whose feature key is rep_origin, so qualifier text
        # that merely mentions the word is not parsed as a location.
        if len(fields) < 2 or fields[0] != "rep_origin":
            continue
        location = "".join(fields[1:])
        # Taking the digits out of the location handles "a..b",
        # "complement(a..b)" and a single base alike; a token such as "a..b"
        # is not all digits, so it cannot be recognised with str.isdigit().
        coordinates = [int(number) for number in re.findall(r"\d+", location)]
        if not coordinates:
            continue
        start, end = coordinates[0], coordinates[-1]
        count += 1
        print(f"ORI - {count} : Start Index - {start} ; End Index - {end}")
        if 'complement' in location:
            complementary_strand = generate_complementary_strand(sequence[start-1:end])
            print("It is the complementary strand with the indexes as per the template strand: " + complementary_strand)
        else:
            print(sequence[start-1:end])
    if count == 0:
        print("There is no ORI in Mitochondria")
    
    
    

def main():
    """Print the annotated replication origins for chromosomes 1-16, then for the mitochondrial genome."""
    # Accession numbers for baker's yeast: 16 chromosomes followed by the
    # mitochondrial genome.
    accession_number = [
        "NC_001133.9", "NC_001134.8", "NC_001135.5", "NC_001136.10",
        "NC_001137.3", "NC_001138.5", "NC_001139.9", "NC_001140.6",
        "NC_001141.2", "NC_001142.9", "NC_001143.9", "NC_001144.5",
        "NC_001145.3", "NC_001146.8", "NC_001147.6", "NC_001148.4",
        "NC_001224.1",
    ]  # List of Accession Number
    *chromosome_accessions, mitochondrial_accession = accession_number

    for chromosome_number, accession in enumerate(chromosome_accessions, start=1):
        print(f"Replication Origins for Baker's Yeast Chromosome {chromosome_number}:")
        fetch_replication_origins(accession, f"output_{chromosome_number}.fasta")
        print("\n\n")

    print(f"Replication Origins for Baker's Yeast Mitochondria :")
    fetch_replication_origin_of_MT(mitochondrial_accession, "output_MT.fasta")


# Run the GenBank lookup defined in this cell; it contacts NCBI and reads the output_*.fasta files from the working directory.
main()
