## Question 1: DNA Data Structures

In [1]:
class DNA:
    """
    A validated DNA sequence together with its cached nucleotide tallies.

    The sequence is upper-cased on construction and kept as a string; the
    per-base counts are computed once and kept in a dictionary.
    """
    VALID_NUCLEOTIDES = {'A', 'T', 'G', 'C'}

    def __init__(self, sequence):
        """
        Normalize, validate and store the sequence.

        :param sequence: A string representing the DNA sequence (any case).
        :raises ValueError: If the sequence contains anything other than
            A, T, G or C after upper-casing. The message lists the offending
            characters in sorted order.
        """
        normalized = sequence.upper()
        if not self._is_valid(normalized):
            unexpected = sorted(set(normalized) - self.VALID_NUCLEOTIDES)
            raise ValueError(f"Invalid nucleotide(s) found: {', '.join(unexpected)}. Only A, T, G, C are allowed.")

        # The string is the canonical storage; the counts are derived from it
        # once here so the accessors below never have to rescan the sequence.
        self._sequence = normalized
        self._nucleotide_counts = self._count_nucleotides(normalized)

    def _is_valid(self, sequence):
        """
        Check that every character of the sequence is a valid nucleotide.

        :param sequence: The DNA sequence string (expected upper-case).
        :return: True if all characters are in VALID_NUCLEOTIDES (an empty
            sequence is valid), False otherwise.
        """
        for base in sequence:
            if base not in self.VALID_NUCLEOTIDES:
                return False
        return True

    def _count_nucleotides(self, sequence):
        """
        Tally the four standard nucleotides.

        :param sequence: The DNA sequence string.
        :return: A dictionary with keys 'A', 'T', 'G', 'C' (in that order)
            mapped to their number of occurrences.
        """
        return {base: sequence.count(base) for base in ('A', 'T', 'G', 'C')}

    def get_length(self):
        """
        Report the sequence length.

        :return: The number of nucleotides in the stored sequence.
        """
        stored = self._sequence
        return len(stored)

    def get_nucleotide_counts(self):
        """
        Report the nucleotide tallies.

        :return: A fresh dictionary of nucleotide counts; mutating it does
            not affect the object.
        """
        return dict(self._nucleotide_counts)

    def get_basic_statistics(self):
        """
        Compute the GC content of the sequence.

        :return: The GC content as a percentage (float); 0.0 for an empty
            sequence.
        """
        length = self.get_length()
        if not length:
            # Avoid dividing by zero for the empty sequence.
            return 0.0
        tally = self._nucleotide_counts
        gc_total = tally.get('G', 0) + tally.get('C', 0)
        return (gc_total / length) * 100.0

    def __str__(self):
        """Return the stored (upper-case) sequence itself."""
        return self._sequence

    def __repr__(self):
        """Return a constructor-like representation of the object."""
        return f"DNA('{self._sequence}')"

## Question 2: Nucleotide Counting and Analysis

In [2]:
def count_nucleotides(sequence: str) -> dict:
    """
    Tally the occurrences of A, T, G and C in a DNA sequence.

    Matching is case-insensitive; any other character is ignored.

    :param sequence: The input DNA sequence string.
    :return: A dictionary with keys 'A', 'T', 'G', 'C' (in that order) and
        their counts as values.
    """
    normalized = sequence.upper()
    return {base: normalized.count(base) for base in ('A', 'T', 'G', 'C')}

def calculate_nucleotide_frequencies(sequence: str) -> dict:
    """
    Express each nucleotide's share of the sequence as a percentage.

    The denominator is the number of recognized nucleotides (A, T, G, C),
    so ignored characters do not dilute the percentages.

    :param sequence: The input DNA sequence string.
    :return: A dictionary mapping each nucleotide to its frequency in percent;
        all zeros when the sequence contains no recognized nucleotide.
    """
    tally = count_nucleotides(sequence)
    counted = sum(tally.values())

    if not counted:
        # Nothing to divide by: report an all-zero composition.
        return {'A': 0.0, 'T': 0.0, 'G': 0.0, 'C': 0.0}

    percentages = {}
    for base, occurrences in tally.items():
        percentages[base] = (occurrences / counted) * 100.0
    return percentages

def generate_analysis_report(sequence: str) -> str:
    """
    Build a plain-text nucleotide analysis report.

    The report lists the number of recognized nucleotides, the per-base
    counts, the per-base percentages and the GC content, one item per line,
    with bases in alphabetical order.

    :param sequence: The input DNA sequence string.
    :return: The formatted, newline-terminated report.
    """
    tally = count_nucleotides(sequence)
    percentages = calculate_nucleotide_frequencies(sequence)
    gc_percentage = percentages['G'] + percentages['C']

    lines = [
        "--- Nucleotide Analysis Report ---",
        f"Sequence Length: {sum(tally.values())}",
        "Nucleotide Counts:",
    ]
    lines.extend(f"  {base}: {tally[base]}" for base in sorted(tally))
    lines.append("Nucleotide Frequencies (Percentages):")
    lines.extend(f"  {base}: {percentages[base]:.2f}%" for base in sorted(percentages))
    lines.append(f"GC Content: {gc_percentage:.2f}%")
    lines.append("---------------------------------")

    # Every line, including the last one, ends with a newline.
    return "\n".join(lines) + "\n"

def compare_nucleotide_composition(seq1: str, seq2: str) -> dict:
    """
    Compares nucleotide composition (frequencies) between two sequences.

    :param seq1: The first DNA sequence string.
    :param seq2: The second DNA sequence string.
    :return: A dictionary showing the frequency difference (seq1 - seq2), in
        percentage points, for each nucleotide. Keys always appear in the
        order A, T, G, C.
    """
    freq1 = calculate_nucleotide_frequencies(seq1)
    freq2 = calculate_nucleotide_frequencies(seq2)

    # Iterate a tuple rather than a set: the iteration order of a set of
    # strings can change between interpreter runs (hash randomization), which
    # would make the key order of the result non-reproducible.
    comparison = {
        nuc: freq1.get(nuc, 0.0) - freq2.get(nuc, 0.0)
        for nuc in ('A', 'T', 'G', 'C')
    }
    return comparison

## Question 3: String Manipulation for Genomics

In [3]:
def dna_to_case(sequence: str, case: str = 'upper') -> str:
    """
    Return the sequence converted to the requested letter case.

    :param sequence: The input DNA sequence string.
    :param case: 'lower' for lowercase; 'upper' (the default) or any
        unrecognized value yields uppercase.
    :return: The case-converted DNA sequence.
    """
    # Only an explicit 'lower' request lowercases; everything else,
    # including invalid values, falls back to uppercase.
    return sequence.lower() if case == 'lower' else sequence.upper()

def remove_non_nucleotides(sequence: str, valid_nucs: "set | frozenset" = frozenset({'A', 'T', 'G', 'C'})) -> str:
    """
    Removes non-nucleotide characters from sequences (keeps only A, T, G, C, case-insensitive).

    Both the sequence and the allowed alphabet are upper-cased before
    comparison, so a custom ``valid_nucs`` may be given in either case.

    :param sequence: The input DNA sequence string.
    :param valid_nucs: Collection of valid nucleotides (default A, T, G, C).
        The default is an immutable frozenset so it can never be altered
        between calls.
    :return: The cleaned DNA sequence string, in uppercase.
    """
    # Normalize the allowed alphabet the same way as the sequence; otherwise
    # a lowercase alphabet would silently match nothing.
    allowed = {nuc.upper() for nuc in valid_nucs}
    return "".join(char for char in sequence.upper() if char in allowed)

def split_into_codons(sequence: str) -> list:
    """
    Break a DNA sequence into consecutive codons (triplets).

    Non-nucleotide characters are stripped (and the sequence upper-cased)
    before splitting.

    :param sequence: The input DNA sequence string.
    :return: A list of codons in order; the final entry holds the 1-2
        leftover bases when the cleaned length is not a multiple of 3.
    """
    cleaned = remove_non_nucleotides(sequence)
    triplets = []
    for start in range(0, len(cleaned), 3):
        triplets.append(cleaned[start:start + 3])
    return triplets

def merge_dna_fragments(fragments: list) -> str:
    """
    Concatenate DNA fragments, in order, into one sequence.

    The fragments are joined verbatim: no cleaning, validation or case
    conversion is applied, so callers should pass already-sanitized strings.

    :param fragments: A list of DNA sequence strings.
    :return: A single merged DNA sequence string.
    """
    merged = "".join(fragments)
    return merged

# Part B: Essential DNA Algorithms

## Question 4: DNA Transcription

In [4]:
def transcribe_dna_to_rna(dna_sequence: str, strand_type: str = 'coding') -> str:
    """
    Transcribe a DNA strand into RNA.

    The input is upper-cased and spaces are removed before processing.

    * ``'coding'``: the transcript equals the coding strand with T replaced
      by U.
    * ``'template'``: each base is replaced by its RNA complement
      (A->U, T->A, G->C, C->G) position by position. The result is NOT
      reversed, so it reads in the opposite chemical direction to the input
      (a template written 3'->5' gives the transcript written 5'->3').

    :param dna_sequence: The input DNA sequence string.
    :param strand_type: 'coding' (default) or 'template'.
    :raises ValueError: If the sequence contains characters other than
        A, T, G, C (checked first), or if strand_type is not recognized.
    :return: The transcribed RNA sequence string.
    """
    bases = dna_sequence.upper().replace(' ', '')

    # Sequence validity is checked before the strand type, so an invalid
    # sequence is reported even when strand_type is also wrong.
    if set(bases) - {'A', 'T', 'G', 'C'}:
        raise ValueError("Invalid DNA characters found in sequence.")

    if strand_type == 'coding':
        return bases.translate(str.maketrans('T', 'U'))

    if strand_type == 'template':
        return bases.translate(str.maketrans('ATGC', 'UACG'))

    raise ValueError("Invalid strand_type. Must be 'coding' or 'template'.")

def batch_transcribe(dna_sequences: list, strand_type: str = 'coding') -> list:
    """
    Transcribe several DNA sequences in one call.

    A sequence that fails transcription with a ValueError does not abort the
    batch; its slot in the result holds an "ERROR: ..." message instead.

    :param dna_sequences: A list of DNA sequence strings.
    :param strand_type: 'coding' (default) or 'template'.
    :return: A list, parallel to the input, of RNA strings or error messages.
    """
    def _transcribe_or_report(seq):
        # Convert a per-sequence failure into a descriptive placeholder.
        try:
            return transcribe_dna_to_rna(seq, strand_type)
        except ValueError as e:
            return f"ERROR: {e} for sequence '{seq}'"

    return [_transcribe_or_report(seq) for seq in dna_sequences]

## Question 5: Reverse Complement Generation

In [5]:
def get_reverse_complement(sequence: str, 
                           orientation: str = "5'-3'", 
                           allow_degenerate: bool = True) -> str:
    """
    Generates the reverse complement of a DNA sequence.

    The input is upper-cased and spaces are removed before processing.

    :param sequence: The input DNA sequence string.
    :param orientation: The desired output orientation, "5'-3'" (default)
        returns the reversed complement; "3'-5'" returns the complement in
        input order (i.e. the reverse complement written backwards).
    :param allow_degenerate: If True, IUPAC degenerate codes
        (R, Y, S, W, K, M, B, V, D, H, N) are accepted; if False only
        A, T, G, C are accepted.
    :raises ValueError: For characters outside the accepted alphabet, or for
        an invalid orientation (sequence errors are reported first).
    :return: The reverse complement sequence.
    """
    # Nucleotide complement map (including IUPAC degenerate codes).
    complement_map = {
        'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
        # Degenerate bases
        'R': 'Y', 'Y': 'R',  # R: A/G (Pu), Y: C/T (Py)
        'S': 'S', 'W': 'W',  # S: G/C (Strong), W: A/T (Weak)
        'K': 'M', 'M': 'K',  # K: G/T (Keto), M: A/C (Amino)
        'B': 'V', 'V': 'B',  # B: C/G/T (not A), V: A/C/G (not T)
        'D': 'H', 'H': 'D',  # D: A/G/T (not C), H: A/C/T (not G)
        'N': 'N',  # N: Any base
    }

    upper_sequence = sequence.upper().replace(' ', '')

    # Strict mode: anything beyond the four standard bases is rejected.
    if not allow_degenerate:
        if not all(char in {'A', 'T', 'G', 'C'} for char in upper_sequence):
            raise ValueError("Invalid DNA characters found. Set allow_degenerate=True to use IUPAC codes.")

    # Report the first character (in sequence order) that has no complement.
    unknown = next((base for base in upper_sequence if base not in complement_map), None)
    if unknown is not None:
        raise ValueError(f"Unknown base '{unknown}' encountered in sequence.")

    # One C-level pass over the string instead of building a list per base.
    complement = upper_sequence.translate(str.maketrans(complement_map))

    if orientation == "5'-3'":
        # Reversing the complement gives the reverse complement read 5'->3'.
        return complement[::-1]
    if orientation == "3'-5'":
        # Writing the reverse complement 3'->5' undoes the reversal, which
        # leaves the plain complement in input order.
        return complement
    raise ValueError("Invalid orientation. Must be \"5'-3'\" or \"3'-5'\".")

# Optimization for large sequences (Question 5, Part 50):
# The dictionary-based lookup and list manipulation used above (join/reversed) 
# is generally O(N) time complexity and memory efficient in Python for typical genomic sequences.
# For truly massive sequences that exceed available RAM, one would use file-based
# processing or memory-mapped files (like NumPy arrays if applicable), 
# but the current implementation is the standard "efficient algorithm" for in-memory strings.

# Part C: Code Optimization and Testing

## Question 6: Algorithm Optimization

In [6]:
import collections
import time
import sys

def count_nucleotides_optimized(sequence: str) -> dict:
    """
    Optimized function to count individual nucleotides using collections.Counter.

    Matching is case-insensitive and any character other than A, T, G, C
    (spaces included) is ignored.

    :param sequence: The input DNA sequence string.
    :return: A dictionary with keys 'A', 'T', 'G', 'C' (in that order) and
        their counts as values; absent nucleotides are reported as 0.
    """
    # Hand the whole string to Counter so the tallying loop runs in C.
    # Filtering beforehand through a Python generator (or stripping spaces
    # with an extra replace pass) only adds work: unwanted characters are
    # simply never read from the tally below.
    tally = collections.Counter(sequence.upper())

    # Counter returns 0 for missing keys, so every standard nucleotide is
    # always present in the result.
    return {base: tally[base] for base in ('A', 'T', 'G', 'C')}

# --- Benchmarking (Profiling and Benchmarking Implementation) ---

def profile_and_benchmark(func_name, func, sequence, iterations=1000):
    """
    Profiles and benchmarks a given function.

    :param func_name: Name of the function (echoed back in the result).
    :param func: The function object to test; called as ``func(sequence)``.
    :param sequence: The sequence to test with.
    :param iterations: Number of times to run the function for benchmarking;
        must be positive.
    :raises ValueError: If iterations is not positive (the per-call average
        would be undefined).
    :return: A dictionary with the keys 'function', 'time_complexity',
        'total_time_s', 'average_time_ms' and 'output_memory_bytes'.
    """
    if iterations <= 0:
        raise ValueError("iterations must be a positive integer")

    # Benchmarking (time): wall-clock duration of `iterations` calls.
    start_time = time.perf_counter()
    for _ in range(iterations):
        func(sequence)
    total_time = time.perf_counter() - start_time

    # Simple measurement of the output object size (not true memory
    # profiling): sys.getsizeof is shallow, so for dictionaries the sizes of
    # the keys and values are added explicitly. Other output types are
    # measured shallowly instead of failing on a missing `.items()`.
    output = func(sequence)
    output_memory = sys.getsizeof(output)
    if isinstance(output, dict):
        output_memory += sum(sys.getsizeof(k) + sys.getsizeof(v) for k, v in output.items())

    return {
        'function': func_name,
        # NOTE(review): reported as a fixed label, not measured; it holds for
        # the linear counting functions this helper is used with.
        'time_complexity': 'O(N)',
        'total_time_s': total_time,
        'average_time_ms': (total_time / iterations) * 1000,
        'output_memory_bytes': output_memory
    }

# Example of comparison: 
# (Requires the manual iteration version from Q2 to be available as `count_nucleotides`)
# dna_sequence_test = "A" * 1000000 + "T" * 1000000
# results_opt = profile_and_benchmark('Optimized Counter', count_nucleotides_optimized, dna_sequence_test)
# print(results_opt) 

# Note on memory-efficient solutions for large sequences (Part 54): 
# For truly large sequences, a memory-efficient solution would involve using 
# Python's `iter()` over a file handle to process the sequence in chunks 
# (stream processing) rather than loading the entire sequence into memory.

## Question 7

In [7]:
import unittest
import random

# Assume all functions from Q2-Q5 (e.g., count_nucleotides, transcribe_dna_to_rna, 
# get_reverse_complement, DNA class) are imported or defined above.

# --- Test Data Generator (Part 60) ---

def generate_random_dna(length: int, include_non_nucs: bool = False, include_degenerate: bool = False) -> str:
    """
    Produce a random test sequence of the requested length.

    Characters are drawn independently and uniformly (with replacement) from
    an alphabet that always contains A, T, G, C.

    :param length: The desired length of the sequence.
    :param include_non_nucs: If True, the alphabet also contains a space, a
        dash, the letter 'Z' and lowercase a/t/g/c.
    :param include_degenerate: If True, the alphabet also contains the IUPAC
        degenerate codes R, Y, S, W, K, M, N.
    :return: The random DNA sequence string.
    """
    alphabet = list('ATGC')
    # Degenerate codes are appended before the non-nucleotide extras so the
    # alphabet order stays fixed for a given flag combination.
    if include_degenerate:
        alphabet += list('RYSWKMN')
    if include_non_nucs:
        alphabet += list(' -Zatgc')

    drawn = random.choices(alphabet, k=length)
    return "".join(drawn)

# --- Comprehensive Testing Suite (Part 57, 58, 59, 61) ---

class TestDNAFunctions(unittest.TestCase):
    """
    Unit tests for the DNA class and the DNA manipulation functions.

    Covers the edge cases (empty, single-base and very long sequences), the
    DNA class, transcription from both strand types, and reverse-complement
    generation including IUPAC degenerate codes.
    """
    
    # --- Edge Cases (Part 59) ---
    
    def test_empty_sequence(self):
        """Test with an empty sequence string."""
        empty_seq = ""
        self.assertEqual(count_nucleotides(empty_seq), {'A': 0, 'T': 0, 'G': 0, 'C': 0}, "Empty sequence count failed")
        self.assertEqual(get_reverse_complement(empty_seq), "", "Empty sequence reverse complement failed")
        self.assertEqual(split_into_codons(empty_seq), [], "Empty sequence split failed")
        
    def test_single_nucleotide(self):
        """Test with a single nucleotide string."""
        single_nuc = "A"
        self.assertEqual(count_nucleotides(single_nuc)['A'], 1, "Single nucleotide count failed")
        self.assertEqual(get_reverse_complement(single_nuc), "T", "Single nucleotide reverse complement failed")
        self.assertEqual(split_into_codons(single_nuc), ["A"], "Single nucleotide split failed")

    def test_very_long_sequences(self):
        """Test with a very long sequence (using a random generator)."""
        long_seq = generate_random_dna(length=100000, include_non_nucs=False)
        self.assertEqual(len(get_reverse_complement(long_seq)), 100000, "Long sequence length failed")

    # --- Unit Tests for Q1: DNA Class ---
    
    def test_dna_class_valid_sequence(self):
        """Test DNA class instantiation and length/stats for a valid sequence."""
        dna = DNA("ATGCATGC")
        self.assertEqual(dna.get_length(), 8)
        self.assertAlmostEqual(dna.get_basic_statistics(), 50.0) # 4/8 = 50% GC

    def test_dna_class_invalid_sequence_error(self):
        """Test DNA class handles invalid nucleotides with appropriate error."""
        with self.assertRaises(ValueError) as cm:
            DNA("ATGX")
        self.assertIn("Invalid nucleotide(s) found: X", str(cm.exception))

    # --- Unit Tests for Q4: Transcription ---
    
    def test_transcription_coding_strand(self):
        """Test DNA to RNA transcription from the coding strand."""
        dna = "ATGCGATT"
        rna = transcribe_dna_to_rna(dna, strand_type='coding')
        self.assertEqual(rna, "AUGCGAUU")
        
    def test_transcription_template_strand(self):
        """Test DNA to RNA transcription from the template strand."""
        dna = "TACGCTAA" # Template strand
        rna = transcribe_dna_to_rna(dna, strand_type='template') # Transcript should be AUGCGAUU
        self.assertEqual(rna, "AUGCGAUU")
        
    # --- Unit Tests for Q5: Reverse Complement ---
    
    def test_reverse_complement_5_3(self):
        """Test reverse complement generation (standard 5'-3' output)."""
        # 5'-ATGC-3' -> Complement: 3'-TACG-5' -> Reverse: 5'-GCAT-3'
        self.assertEqual(get_reverse_complement("ATGC"), "GCAT")

    def test_reverse_complement_3_5(self):
        """Test reverse complement generation (3'-5' output)."""
        # 5'-ATGC-3' -> Complement: 3'-TACG-5' -> Reverse: 5'-GCAT-3' -> 3'-TACG-5'
        self.assertEqual(get_reverse_complement("ATGC", orientation="3'-5'"), "TACG")

    def test_reverse_complement_degenerate(self):
        """Test reverse complement with IUPAC degenerate codes."""
        # Complement base by base (R->Y, Y->R, S->S, M->K, K->M):
        # 5'-RYSMK-3' -> 3'-YRSKM-5' -> reversed: 5'-MKSRY-3'
        self.assertEqual(get_reverse_complement("RYSMK"), "MKSRY")

# # To run the tests:
# # if __name__ == '__main__':
# #     unittest.main(argv=['first-arg-is-ignored'], exit=False)

# # --- Document Test Coverage and Results (Part 61) ---
# # The actual test coverage report would be generated by a tool like coverage.py.
# # Here is the textual documentation of the coverage:

# # Test Coverage Documentation Summary:
# # - **DNA Class (Q1)**: Covered valid/invalid sequence instantiation, length, and basic statistics.
# # - **Nucleotide Counting (Q2)**: Covered by the `DNA` class and tested against empty/single sequences.
# # - **String Manipulation (Q3)**: Covered empty/single sequence split (`split_into_codons`). Other functions (case, merge) can be trivially tested.
# # - **Transcription (Q4)**: Covered both coding and template strand types. The ValueError paths (non-ATGC bases, invalid `strand_type`) are not yet exercised by a dedicated test.
# # - **Reverse Complement (Q5)**: Covered 5'-3' and 3'-5' orientations, empty/single sequences, and degenerate codes.
# # - **Edge Cases**: Explicitly tested: Empty sequences, single nucleotides, and very long sequences.