In [11]:
from Bio import SeqIO
from tqdm.notebook import tqdm

# FASTA files to de-duplicate. The two lists are walked in lockstep, so the
# n-th output path receives the cleaned copy of the n-th input path.
input_files = [
    "16S_rRNA_sequences.fasta",
    "18S_rRNA_sequences.fasta",
]
output_files = [
    "16S_rRNA_sequences_cleaned.fasta",
    "18S_rRNA_sequences_cleaned.fasta",
]

def remove_duplicates(input_file, output_file):
    """
    Write a de-duplicated copy of a FASTA file.

    Two records are treated as the same entry when both ``record.id`` and
    the sequence string are equal. The first occurrence of each entry is
    kept, and kept records stay in their original input order.

    Args:
        input_file: Path of the FASTA file to read.
        output_file: Path the surviving records are written to; the file is
            opened in "w" mode.

    Prints two summary lines (duplicates removed, records written) and
    returns nothing.
    """
    # Load every record up front so the progress bar can be given a total.
    with open(input_file, "r") as handle:
        all_records = list(SeqIO.parse(handle, "fasta"))

    # A dict keeps keys in insertion order (Python 3.7+), and setdefault only
    # stores the first record seen for each (id, sequence) key, so the values
    # are exactly the unique records in input order.
    first_by_key = {}
    progress = tqdm(all_records, desc=f"Processing {input_file}", unit="record", ncols=400)
    for rec in progress:
        first_by_key.setdefault((rec.id, str(rec.seq)), rec)

    kept = list(first_by_key.values())
    n_total = len(all_records)
    n_dropped = n_total - len(kept)  # every record that was not kept was a duplicate

    with open(output_file, "w") as handle:
        SeqIO.write(kept, handle, "fasta")

    print(f"{input_file}: {n_dropped} duplicates removed from {n_total} total records.")
    print(f"{input_file}: {len(kept)} unique records written to {output_file}.")

# Process each input file and remove duplicates.
# A missing file is reported and skipped so the remaining files still run.
for infile, outfile in zip(input_files, output_files):
    print(f"Processing {infile}...")
    try:
        # Only this call can raise FileNotFoundError; keep the try body minimal.
        remove_duplicates(infile, outfile)
    except FileNotFoundError as err:
        # remove_duplicates opens both the input and the output path, so
        # report the path carried by the exception rather than assuming the
        # input was the one that is missing. Fall back to the input path if
        # the exception carries no filename.
        missing = err.filename or infile
        print(f"File not found: {missing}. Skipping...")
    else:
        # Separator is printed only after a file was processed successfully.
        print("==================================================\n")

Processing 16S_rRNA_sequences.fasta...


Processing 16S_rRNA_sequences.fasta:   0%|                                                                    …

16S_rRNA_sequences.fasta: 0 duplicates removed from 14 total records.
16S_rRNA_sequences.fasta: 14 unique records written to 16S_rRNA_sequences_cleaned.fasta.

Processing 18S_rRNA_sequences.fasta...


Processing 18S_rRNA_sequences.fasta:   0%|                                                                    …

18S_rRNA_sequences.fasta: 0 duplicates removed from 18 total records.
18S_rRNA_sequences.fasta: 18 unique records written to 18S_rRNA_sequences_cleaned.fasta.

