In [None]:
# |default_exp bifrost

In [None]:
# |hide
# See above? this hides these blocks, meaning these blocks aren't in the module and aren't in the documentation
import nbdev
from nbdev.showdoc import *  # ignore this Pylance warning in favor of following nbdev docs

In [None]:
# |export
# That export there, it makes sure this code goes into the module.

# standard libs
import os
import re

# Common to template
# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
import envyaml  # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
from fastcore import (
    test,
)
from fastcore.script import (
    call_parse,
)  # for @call_parse, https://fastcore.fast.ai/script
import json  # for nicely printing json and yaml
from fastcore import test

#!export
from bifrost_bridge import core

In [None]:
# |hide
# This cell must never be exported. It switches the working directory to the project root
# (core.PROJECT_DIR) instead of the nbs directory, so the relative paths used in later cells
# (e.g. 'test_data/...') resolve from the project root.
# NOTE(review): the original note also says this cell is there "to initiate the package using pip";
# nothing in this cell does that - confirm whether that step lives elsewhere.
os.chdir(core.PROJECT_DIR)

##################################################CODE_SEGMENT###########################################

In [None]:
# |export
from bifrost_bridge.mlst import process_mlst_data
from bifrost_bridge.fastp import process_fastp_data
from bifrost_bridge.quast import process_quast_data
from bifrost_bridge.plasmidfinder import process_plasmidfinder_data
from bifrost_bridge.amrfinderplus import process_amrfinderplus_data
from bifrost_bridge.bracken import process_bracken_data
from bifrost_bridge.pmlst import process_pmlst_data
from bifrost_bridge.rmlst import process_rmlst_data
import pandas as pd

def _require_file(path):
    """Raise FileNotFoundError when `path` does not exist on disk."""
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")


def _combine_parsed_outputs(parsed_files, output_path):
    """Join the non-empty parsed TSV files column-wise and write the result to `output_path`."""
    # A zero-byte file cannot be read by pd.read_csv (it raises EmptyDataError), so every
    # empty parsed file is skipped - not only the MLST one.
    usable_files = [path for path in parsed_files if os.path.getsize(path) > 0]

    # Make sure the destination directory exists before writing anything.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if not usable_files:
        # pd.concat([]) raises "ValueError: No objects to concatenate". With nothing to combine,
        # write an empty output file instead, mirroring how an empty parsed file means "no data".
        with open(output_path, 'w'):
            pass
        return

    combined_df = pd.concat([pd.read_csv(path, sep='\t') for path in usable_files], axis=1)
    combined_df.to_csv(output_path, sep='\t', index=False)


@call_parse
def process_qc_data(
    mlst_path:str = None,
    fastp_path:str = None,
    quast_path:str = None,
    plasmidfinder_path:str = None,
    bracken_path:str = None,
    amrfinder_path:str = None,
    pmlst_path:str = None,
    rmlst_path:str = None,
    combine_output:bool = True,
    output_path:str = './output.tsv'):

    """
    Command-line interface for processing QC data.

    Parses the output of every tool for which an input path is given (MLST, FASTP, QUAST,
    PlasmidFinder, Bracken, AMRFinderPlus, pMLST and rMLST). Each tool's result is written to a
    `parsed_<tool>.tsv` file in the current working directory. When `combine_output` is true, the
    non-empty parsed files are joined column-wise into a single TSV at `output_path`; if there is
    nothing to combine, an empty file is written there instead.

    Arguments:
        mlst_path (str): Path to the MLST input file.
        fastp_path (str): Path to the FASTP input file.
        quast_path (str): Path to the QUAST input file.
        plasmidfinder_path (str): Path to the PlasmidFinder input file.
        bracken_path (str): Path to the Bracken input file.
        amrfinder_path (str): Path to the AMRFinder input file.
        pmlst_path (str): Path to the PMLST input file.
        rmlst_path (str): Path to the rMLST input file.
        combine_output (bool): Whether to merge the parsed files into one table (default: True).
        output_path (str): Path to the output file (default: './output.tsv').

    Raises:
        FileNotFoundError: If a provided input path does not exist.
    """
    # One entry per tool, in processing order:
    # (input path, parser function, parsed output file, parser-specific keyword arguments)
    jobs = [
        (mlst_path, process_mlst_data, 'parsed_mlst.tsv', dict(
            replace_header=None,
            remove_sampleid=True,
            add_header="SampleID, MLST_Species, MLST_ST, MLST_Allele",
        )),
        (fastp_path, process_fastp_data, 'parsed_fastp.tsv', dict(
            filter_columns="summary£before_filtering£total_reads, summary£before_filtering£read1_mean_length, summary£before_filtering£read2_mean_length, summary£after_filtering£total_reads, summary£after_filtering£read1_mean_length, summary£after_filtering£read2_mean_length, filtering_result£low_quality_reads, filtering_result£too_many_N_reads, filtering_result£too_short_reads, filtering_result£too_long_reads, duplication£rate, adapter_cutting£adapter_trimmed_reads, adapter_cutting£adapter_trimmed_bases, read1_before_filtering£total_cycles, read1_after_filtering£total_cycles, read2_before_filtering£total_cycles, read2_after_filtering£total_cycles",
            replace_header="fastp_Before_Filtering_Total_Reads,fastp_Before_Filtering_Read1_Mean_Length,fastp_Before_Filtering_Read2_Mean_Length,fastp_After_Filtering_Total_Reads,fastp_After_Filtering_Read1_Mean_Length,fastp_After_Filtering_Read2_Mean_Length,fastp_Low_Quality_Reads,fastp_Too_Many_N_Reads,fastp_Too_Short_Reads,fastp_Too_Long_Reads,fastp_Duplication_Rate,fastp_Adapter_Trimmed_Reads,fastp_Adapter_Trimmed_Bases,fastp_Read1_Before_Filtering_Total_Cycles,fastp_Read1_After_Filtering_Total_Cycles,fastp_Read2_Before_Filtering_Total_Cycles,fastp_Read2_After_Filtering_Total_Cycles",
        )),
        (quast_path, process_quast_data, 'parsed_quast.tsv', dict(
            filter_columns='# contigs, Largest contig, Total length, GC (%), N50, N90, L50, L90',
            replace_header="Quast_Contigs,Quast_Largest_Contig,Quast_Total_Length,Quast_GC_Pct,Quast_N50,Quast_N90,Quast_L50,Quast_L90",
            transpose=True,
        )),
        # NOTE(review): "PFInder_Database" is capitalised differently from the other "PFinder_*"
        # headers. Left unchanged because downstream consumers may rely on the column name.
        (plasmidfinder_path, process_plasmidfinder_data, 'parsed_plasmidfinder.tsv', dict(
            filter_columns="Database,Plasmid,Identity,Query / Template length,Contig",
            replace_header="PFInder_Database,PFinder_Plasmid,PFinder_Identity,PFinder_Coverage,PFinder_Contig",
            convert_coverage=True,
            filter_contig=True,
        )),
        (bracken_path, process_bracken_data, 'parsed_bracken.tsv', dict(
            replace_header="Bracken_Species,Bracken_Species_Pct,Bracken_Species1,Bracken_Species1_Pct,Bracken_Species2,Bracken_Species2_Pct,Bracken_Unclassified,Bracken_Unclassified_Pct",
        )),
        (amrfinder_path, process_amrfinderplus_data, 'parsed_amrfinder.tsv', dict(
            filter_columns="Contig id,Start,Stop,Strand,Gene symbol,Sequence name,Subclass,% Coverage of reference sequence,% Identity to reference sequence",
            replace_header="AMR_ContigID,AMR_Start,AMR_Stop,AMR_Strand,AMR_ElementSymbol,AMR_ElementName,AMR_Subclass,AMR_Coverage,AMR_Identity",
        )),
        (pmlst_path, process_pmlst_data, 'parsed_pmlst.tsv', dict(
            replace_header="pMLST_plasmids,pMLST_IncF,pMLST_IncI1,pMLST_IncA/C,pMLST_IncHI1,pMLST_IncHI2,pMLST_IncN,pMLST_summary",
        )),
        (rmlst_path, process_rmlst_data, 'parsed_rmlst.tsv', dict(
            filter_columns="taxon,rank,support",
            replace_header="rMLST_match,rMLST_rank,rMLST_support",
        )),
    ]

    # Validate and parse each requested input, remembering which parsed files were requested.
    produced = set()
    for input_path, parser, parsed_path, options in jobs:
        if input_path is None:
            continue
        _require_file(input_path)
        parser(input_path=input_path, output_path=parsed_path, **options)
        produced.add(parsed_path)

    # NOTE(review): under fastcore's call_parse a `bool` annotation is usually exposed as a
    # store_true style flag - confirm that the command-line default of combine_output is True.
    if combine_output:
        # Column order of the combined table. It differs from the processing order above
        # (AMRFinder precedes Bracken here) and is kept so existing column layouts do not change.
        combine_order = (
            'parsed_mlst.tsv',
            'parsed_fastp.tsv',
            'parsed_quast.tsv',
            'parsed_plasmidfinder.tsv',
            'parsed_amrfinder.tsv',
            'parsed_bracken.tsv',
            'parsed_pmlst.tsv',
            'parsed_rmlst.tsv',
        )
        parsed_files = [path for path in combine_order if path in produced]
        _combine_parsed_outputs(parsed_files, output_path)

In [None]:
# |hide
# Example run of process_qc_data: one test_data input per supported tool, combined into one TSV.
# To try the alternative MLST input, set 'mlst_path' to 'test_data/mlst_empty.tabular'.
qc_example_inputs = {
    'mlst_path': 'test_data/mlst_report.tabular',
    'fastp_path': 'test_data/TestSample2.json',
    'quast_path': 'test_data/quast.tsv',
    'plasmidfinder_path': 'test_data/plasmidfinder.tsv',
    'bracken_path': 'test_data/bracken_krakenreport.txt',
    'amrfinder_path': 'test_data/amrfinderbug.tsv',
    'pmlst_path': 'test_data/simple_output.tsv',
    'rmlst_path': 'test_data/rmlst.json',
}

process_qc_data(
    **qc_example_inputs,
    combine_output=True,
    output_path='test_data/bifrost/output.tsv',
)




##################################################CODE_SEGMENT###########################################

In [None]:
#| hide
# Kept as the last cell so that running the notebook from top to bottom also exports its code
# to the associated Python package.

nbdev.nbdev_export()