# Setup

In [1]:
import os
from Bio import SeqIO, Align
from Bio.Seq import Seq

# Set up aligner, functions and parameters

In [2]:
# Shared pairwise aligner in local mode; gap opening and gap extension
# are both scored at -10.
aligner = Align.PairwiseAligner(
    mode='local',
    open_gap_score=-10,
    extend_gap_score=-10,
)

In [3]:
def filter_fastq(query_seq, signature_seq1, signature_seq2, min_score):
    '''Report whether a read hits either signature sequence in either orientation.

    Both signatures and their reverse complements are aligned against
    ``query_seq`` with the notebook-level ``aligner``. The read counts as a
    hit as soon as one alignment score reaches ``min_score``.

    Args:
        query_seq: Read sequence to test.
        signature_seq1: First signature; must provide ``reverse_complement()``.
        signature_seq2: Second signature; must provide ``reverse_complement()``.
        min_score: Lowest alignment score accepted as a hit (inclusive).

    Returns:
        bool: True if any of the four candidates scores at least ``min_score``.
    '''
    forward_signatures = [signature_seq1, signature_seq2]
    reverse_signatures = [sig.reverse_complement() for sig in forward_signatures]
    candidates = forward_signatures + reverse_signatures

    # any() stops at the first candidate whose score is high enough
    return any(
        aligner.align(query_seq, candidate).score >= min_score
        for candidate in candidates
    )

# Change new_filename below to customize the output filename
def map_filename(filename):
    """Derive the filtered-output filename from a raw FASTQ filename.

    The library label is the text between the ``NGS_raw_`` marker and the
    first ``.fq``. It is reused unchanged in the output name, e.g.
    ``example_IBM_NGS_raw_1_mCherry_1.fq`` becomes
    ``example_IBM_NGS_filtered_1_mCherry_1.fq``.

    Args:
        filename: Raw FASTQ filename containing ``NGS_raw_``.

    Returns:
        str: The filename to use for the filtered output.

    Raises:
        IndexError: If ``NGS_raw_`` does not occur before the first ``.fq``.
    """
    library_info = filename.split(".fq")[0].split("NGS_raw_")[1]
    # The label is not parsed further, so any number of "_"-separated
    # fields is accepted.
    new_filename = "example_IBM_NGS_filtered_" + library_info + ".fq"
    return new_filename

In [4]:
# Number of leading bases of each entered signature sequence that are used for matching.
# The same value is also passed to filter_fastq as the minimum alignment score.
# NOTE(review): presumably this demands a perfect match of the whole signature
# under the aligner's default match score — confirm.
signature_len = 12

# Example on an IBM library of mCherry split by the split M86 intein

In [5]:
# 30-base sequences from which the two M86 signatures are taken
M86_N_seq_30 = Seq("TGGAtgcatctcgggagatagtttgatcag")
M86_C_seq_30 = Seq("TGAgttatgtacaatgatgtcattggcgac")  # given as the reverse complement of the M86 CDS

# Keep only the first signature_len bases of each sequence as the search signature
M86_N_sig, M86_C_sig = (
    full_seq[:signature_len] for full_seq in (M86_N_seq_30, M86_C_seq_30)
)

The example "raw file" used below has been truncated to its first 10,000 records.

In [6]:
# Raw FASTQ files to filter; expand this list for actual looping between multiple files
raw_filenames = ["example_IBM_NGS_raw_1_mCherry_1.fq"]

In [7]:
# Filter every raw FASTQ file, keeping only the reads that hit a signature sequence.
# Create the output folder up front so opening the output file cannot fail on a fresh checkout.
os.makedirs("filtered_fastq_files", exist_ok=True)
for raw_filename in raw_filenames:
    new_filename = map_filename(raw_filename)
    # These two paths stay at notebook level; the inspection cells further down read them.
    new_file_path = os.path.join("filtered_fastq_files", new_filename)
    raw_file_path = os.path.join("raw_fastq_files", raw_filename)
    # The context manager closes both files even if parsing or writing raises.
    # The input is opened first, so a missing raw file never creates an empty output file.
    with open(raw_file_path, "r") as read_handle, open(new_file_path, "w") as write_handle:
        input_seq_iterator = SeqIO.parse(read_handle, "fastq")
        # Generator expression: records are filtered one at a time as SeqIO.write pulls them
        output_seq_iterator = (
            record for record in input_seq_iterator
            if filter_fastq(record.seq, M86_N_sig, M86_C_sig, signature_len)
        )
        SeqIO.write(output_seq_iterator, write_handle, "fastq")

# Inspect before and after filter 
(The section below is typically not used in the data analysis pipeline. It is only used here to show the effect of filtering.)

In [8]:
def count_total_reads(file_path):
    """Count the records in a FASTQ file.

    Args:
        file_path: Path of the FASTQ file to read.

    Returns:
        int: Number of records parsed from the file (0 for an empty file).
    """
    # The context manager closes the handle; the original left it open.
    with open(file_path, "r") as read_handle:
        return sum(1 for _ in SeqIO.parse(read_handle, "fastq"))

In [9]:
# Count total number of reads in the raw file.
# raw_file_path is the value left over from the last iteration of the filtering loop.
unfiltered_read_count = count_total_reads(raw_file_path)
print("Unfiltered read counts =", unfiltered_read_count)

Unfiltered read counts = 10000


In [10]:
# Count total number of reads in the filtered file.
# new_file_path is the value left over from the last iteration of the filtering loop.
filtered_read_count = count_total_reads(new_file_path)
print("Filtered read counts =", filtered_read_count)

Filtered read counts = 635


Therefore, approximately 6.4% of the reads in the example file (635 of 10,000) contain a signature sequence.