# Data Collection for Bacterial Gene Prediction

Download and parse bacterial genomes from NCBI for gene prediction analysis.


## Goals:
- Download genomes from NCBI
- Parse the downloaded NCBI files (FASTA sequence, GFF annotation) with Biopython and pandas
- Extract basic genome information

In [None]:
import os
from Bio import Entrez, SeqIO
import pandas as pd

# Sanity check for the notebook session: reaching this line means the imports
# above succeeded. The working directory is shown because the download helper
# below reads and writes its cache files using relative paths.
print("✓ All imports successful!")
print(f"✓ Current working directory: {os.getcwd()}")

✓ All imports successful!
✓ Current working directory: c:\Users\User\Desktop\Bacterial Gene Prediction & Comparison Project\notebooks


In [None]:
def download_genome_and_reference(genome_id, email):
    """
    Download both genome FASTA and reference GFF from NCBI.
    Skips download if files already exist.

    Files are cached in the current working directory as
    ``<genome_id>.fasta`` and ``<genome_id>.gff``. Each download is written
    to a temporary ``.part`` file first and only renamed to its final name
    once complete, so an interrupted download is never mistaken for a valid
    cached file on the next call.

    Parameters:
        genome_id (str): NCBI accession ID
        email (str): Your email for NCBI Entrez

    Returns:
        tuple: (genome_sequence, gff_path)
            - genome_sequence: sequence (str) extracted from the Biopython
              SeqRecord object
            - gff_path: Path to the GFF file
    """
    Entrez.email = email

    # Check/download genome FASTA
    fasta_path = f"{genome_id}.fasta"
    if os.path.exists(fasta_path):
        print(f"Genome file {fasta_path} already exists, loading from disk...")
        genome_record = SeqIO.read(fasta_path, "fasta")
    else:
        print(f"Downloading genome {genome_id}...")
        handle = Entrez.efetch(db="nuccore", id=genome_id, rettype="fasta", retmode="text")
        try:
            genome_record = SeqIO.read(handle, "fasta")
        finally:
            # Release the connection even if parsing the response fails.
            handle.close()
        # Save for future use; publish only after the write has finished.
        fasta_tmp = f"{fasta_path}.part"
        SeqIO.write(genome_record, fasta_tmp, "fasta")
        os.replace(fasta_tmp, fasta_path)
        print(f"Saved to {fasta_path}")

    genome_sequence = str(genome_record.seq)
    print(f"Loaded: {genome_record.description}")
    print(f"Genome size: {len(genome_sequence):,} bp")

    # Check/download reference GFF
    gff_path = f"{genome_id}.gff"
    if os.path.exists(gff_path):
        print(f"Reference file {gff_path} already exists, skipping download")
    else:
        print("Downloading reference GFF...")
        handle = Entrez.efetch(db="nuccore", id=genome_id, rettype="gff3", retmode="text")
        try:
            # Read the whole response before touching the disk so a network
            # error cannot leave an empty file behind.
            gff_text = handle.read()
        finally:
            handle.close()
        gff_tmp = f"{gff_path}.part"
        with open(gff_tmp, "w", encoding="utf-8") as f:
            f.write(gff_text)
        os.replace(gff_tmp, gff_path)
        print(f"Saved to {gff_path}")

    return genome_sequence, gff_path

def load_reference_genes_from_gff(gff_path):
    """
    Load reference gene coordinates from a GFF file.

    Feature rows are chosen with a fallback order: CDS rows if the file has
    any, otherwise gene rows, otherwise every row in the file.

    Args:
        gff_path (str): Path to GFF file

    Returns:
        set: Set of (start, end) tuples for fast lookup. Empty when the
            file contains no feature rows (e.g. only comment lines).
    """
    print(f"Loading reference genes from {gff_path}...")

    # Read GFF file (tab-separated, skip comment lines starting with #).
    # GFF columns: seqid, source, type, start, end, score, strand, phase, attributes
    try:
        ref = pd.read_csv(gff_path, sep="\t", comment="#", header=None)
    except pd.errors.EmptyDataError:
        # Nothing but comments/blank lines: treat as "no annotations"
        # instead of crashing the caller.
        ref = pd.DataFrame()

    if ref.empty:
        print("  ✓ Loaded 0 reference genes")
        return set()

    # We want columns 2 (type), 3 (start), 4 (end)
    feature_type = ref[2]
    for wanted in ("CDS", "gene"):
        mask = feature_type == wanted
        if mask.any():
            selected = ref.loc[mask]
            print(f"  Using {wanted} features")
            break
    else:
        # Neither CDS nor gene rows exist; fall back to every feature.
        selected = ref
        print("  Using all features")

    # Building a set already collapses duplicate (start, end) pairs.
    ref_gene_set = set(zip(selected[3].tolist(), selected[4].tolist()))

    print(f"  ✓ Loaded {len(ref_gene_set):,} reference genes")

    return ref_gene_set

def print_dna_sequence(genome_seq, start, end, line_width=60, full=False):
    """
    Print DNA sequence at given positions.
    If full=True, prints as one continuous line for easy copying.

    Args:
        genome_seq (str): Sequence to slice from
        start (int): 1-based position of the first base to print
        end (int): 1-based position of the last base to print (inclusive)
        line_width (int): Bases per output line when full=False
        full (bool): Print the region as a single line instead of wrapping

    Raises:
        ValueError: If start is below 1, or if line_width is below 1 while
            wrapped output is requested.
    """
    # start - 1 would become a negative index for start < 1 and silently
    # slice from the END of the sequence, so reject it up front.
    if start < 1:
        raise ValueError(f"start must be >= 1 (1-based), got {start}")
    if not full and line_width < 1:
        raise ValueError(f"line_width must be >= 1, got {line_width}")

    # Convert the 1-based inclusive range to a 0-based half-open slice.
    sequence = genome_seq[start - 1:end]

    print(f"Position: {start:,} - {end:,}")
    print(f"Length: {len(sequence)} bp")
    print("Sequence:")

    if full:
        print(sequence)
    else:
        for offset in range(0, len(sequence), line_width):
            # Prefix each line with the genome position of its first base.
            print(f"{start + offset:>10,}  {sequence[offset:offset + line_width]}")
    print()

def get_reference_orfs_from_gff(genome_id, cached_candidates):
    """
    Build the list of reference ORFs for one genome from its GFF annotation.

    The GFF location is looked up as cached_candidates[genome_id]['gff_path'];
    only rows whose feature type is "CDS" are kept.

    Args:
        genome_id: Key identifying the genome inside cached_candidates
        cached_candidates (dict): Per-genome dicts holding a 'gff_path' entry

    Returns:
        list: One dict per CDS row with keys: start, end, strand

    Raises:
        ValueError: If the genome has no cached entry or no GFF path
    """
    entry = cached_candidates.get(genome_id)
    if entry is None:
        raise ValueError(f"No cached data for genome {genome_id}")

    annotation_file = entry.get('gff_path')
    if annotation_file is None:
        raise ValueError(f"No GFF path for genome {genome_id}")

    table = pd.read_csv(annotation_file, sep="\t", comment="#", header=None)
    # Column 2 is the feature type; swap in "gene" to use gene rows instead.
    coding_rows = table[table[2] == "CDS"]

    # Columns 3, 4 and 6 are start, end and strand respectively.
    return [
        {'start': int(rec[3]), 'end': int(rec[4]), 'strand': rec[6]}
        for rec in coding_rows.to_dict("records")
    ]
