In [5]:
from tqdm.notebook import tqdm

def nucleotide_to_boolean(nucleotide):
    """
    Return the 4-character one-hot bit string for a single nucleotide.

    Encoding table:
        A -> 0001
        T -> 0010
        C -> 0100
        G -> 1000

    Anything else (including lowercase letters, ambiguity codes such as
    'N', or an empty string) yields "0000", the same value used for padding.
    """
    encoding_table = (
        ('A', "0001"),
        ('T', "0010"),
        ('C', "0100"),
        ('G', "1000"),
    )

    # Linear scan over four entries; comparison is exact and case-sensitive.
    for base, bits in encoding_table:
        if nucleotide == base:
            return bits

    # No match: padding / unexpected character.
    return "0000"


def convert_sequence_to_boolean(sequence, required_length=9600):
    """
    Encode a DNA sequence as a zero-padded string of '0'/'1' characters.

    Each character of ``sequence`` is mapped to 4 characters by
    ``nucleotide_to_boolean``. The result is then right-padded with '0'
    until it is ``required_length`` characters long.

    Args:
        sequence: Iterable of single-character nucleotides (typically a str).
        required_length: Minimum length of the returned string. The default
            of 9600 corresponds to 2400 nucleotides at 4 characters each.

    Returns:
        The encoded string. It is exactly ``required_length`` characters long
        when the encoded sequence fits; a longer sequence is NOT truncated,
        so the result can exceed ``required_length``.
    """
    # Build the encoding in one pass with join instead of repeated `+=`,
    # which can degrade to quadratic time on long sequences.
    encoded = "".join(nucleotide_to_boolean(base) for base in sequence)

    # ljust pads on the right with '0' and returns the string unchanged
    # when it is already at least `required_length` characters long.
    return encoded.ljust(required_length, "0")


def process_fasta_file(input_file, output_file):
    """
    Convert every record of a FASTA file to one boolean-encoded line.

    The input is read line by line. A line starting with '>' is treated as a
    record header: its text is discarded and it closes the record collected
    so far. All other lines are stripped of surrounding whitespace and
    concatenated into the current record. Each non-empty record is passed to
    ``convert_sequence_to_boolean`` and written to ``output_file`` followed by
    a newline; records with no sequence characters produce no output line.

    Any exception raised while opening, reading, converting or writing is
    caught and reported with ``print``; nothing is re-raised and the function
    returns None in all cases.
    """
    try:
        with open(input_file, 'r') as fasta_in, open(output_file, 'w') as encoded_out:

            def write_record(fragments):
                # Join the collected sequence lines and emit them only if
                # the record actually contains sequence characters.
                record = ''.join(fragments)
                if record:
                    encoded_out.write(convert_sequence_to_boolean(record) + "\n")

            fragments = []
            progress = tqdm(
                fasta_in,
                desc="Converting lines to Boolean",
                unit="line",
                ncols="400",
            )

            for raw_line in progress:
                text = raw_line.strip()

                # Sequence data: keep collecting for the current record.
                if not text.startswith('>'):
                    fragments.append(text)
                    continue

                # Header: flush whatever was collected for the previous record.
                write_record(fragments)
                fragments = []

            # The final record has no trailing header to trigger a flush.
            write_record(fragments)

        print(f"FASTA file successfully converted to boolean format: {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # 18S dataset: merged FASTA input and its boolean-encoded output file.
    input_fasta, output_file = "18S_merged_sequences.fasta", "18S_boolean_sequences.txt"

    # 16S dataset: merged FASTA input and its boolean-encoded output file.
    input_fasta2, output_file2 = "16S_merged_sequences.fasta", "16S_boolean_sequences.txt"

    # Convert the datasets one after another, 18S first and then 16S.
    for fasta_path, encoded_path in (
        (input_fasta, output_file),
        (input_fasta2, output_file2),
    ):
        process_fasta_file(fasta_path, encoded_path)

Converting Sequences to Boolean: 0Record [00:00, ?Record/s]

FASTA file successfully converted to boolean format: 18S_boolean_sequences.txt


Converting Sequences to Boolean: 0Record [00:00, ?Record/s]

FASTA file successfully converted to boolean format: 16S_boolean_sequences.txt
