In [None]:
# |default_exp sample_manager
# You need this at the top of every notebook you want turned into a module; the name you provide determines the module name

# Libraries


In [None]:
# |export
# That export there, it makes sure this code goes into the module.

# standard libs
import os
import re

# Common to template
# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
import envyaml  # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
from fastcore import (
    test,
)
from fastcore.script import (
    call_parse,
)  # for @call_parse, https://fastcore.fast.ai/script
import json  # for nicely printing json and yaml
from fastcore import test
from fastcore.script import call_parse
from ssi_analysis_utility import (
    core,
)
from pathlib import Path #to be able write :Path in cli function

# Project specific libraries


In [None]:
# Notebook-only cell, DO NOT EXPORT: switches the working directory to core.PROJECT_DIR.
# NOTE(review): presumably so the relative "examples/..." paths used by the cells further down resolve - confirm.

os.chdir(core.PROJECT_DIR)

# Functions


# Sample manager


## sample_data

The sample data class is simple. It initiates an object with the following attributes:

- sample_name: an identifier for the sample
- assembly_file: Path to a genome assembly in fasta format, i.e. "/path/to/genome_assembly.fasta"
- Illumina_read_files: Paths to R1 and R2 from Illumina paired end sequencing in a comma-separated string, i.e. "/path/to/R1.fastq.gz,/path/to/R2.fastq.gz"
- Nanopore_read_file: Path to a file containing long read sequencing data in fastq format, i.e. "/path/to/Nanopore_read_data.fastq.gz"
- metadata: A dictionary containing metadata read from a provided tsv-file. Will also contain the above values (sample_name, paths to data files)
- input_folder: Optional folder that relative file paths are resolved against. Needed when loading multiple samples at once using the input_manager class (see below)


## input_manager

The input_manager class can create sample_data class instances in three different ways:

1. Initiating just a single sample_data class instance by providing a sample name and paths to read/assembly files
2. Initiating class instances for all samples in a specific input folder. Assembly files, Illumina read files and Nanopore read files are auto detected and matched by sample name.
3. Initiating class instances for the samples listed in a provided samplesheet, using the file paths given in that sheet.

For all three options metadata can be provided in tsv-format.

## Usage



In [None]:
# |export


class sample_data:
    """Container for one sample's name, input file paths and metadata.

    Args:
        attributes: dict that may hold ``sample_name``, ``assembly_file``,
            ``Illumina_read_files`` (a comma-separated string, or a list/tuple
            with the R1 and R2 paths) and ``Nanopore_read_file``, plus any
            further metadata. The dict is copied, never modified.
        input_folder: folder that the file paths are joined onto. Values that
            are not path-like (``None``, ``False``) mean "no input folder".

    Attributes:
        input_folder: absolute input folder, or ``None``.
        sample_name: the given name, or ``"sample"`` when the key is absent or
            its value is ``None``/NaN.
        assembly_file: absolute path, or ``None`` when not provided.
        Illumina_read_files: ``[R1, R2]`` absolute paths, or ``None``.
        Nanopore_read_file: absolute path, or ``None`` when not provided.
        metadata: the copied ``attributes`` dict, unchanged.

    Raises:
        TypeError: if ``Illumina_read_files`` is neither a string nor a
            list/tuple.
        IndexError: if ``Illumina_read_files`` holds fewer than two paths.
    """

    species = "Unspecified"

    def __init__(self,attributes,input_folder):
        attributes = attributes.copy()
        try:
            self.input_folder = os.path.abspath(input_folder)
        except TypeError:
            # os.path.abspath rejects None/False; treat that as "no folder".
            self.input_folder = None

        sample_name = attributes.get("sample_name")
        self.sample_name = "sample" if self._is_missing(sample_name) else sample_name

        self.assembly_file = self._resolve(attributes.get("assembly_file"))
        self.Illumina_read_files = self._resolve_illumina_read_files(
            attributes.get("Illumina_read_files")
        )
        self.Nanopore_read_file = self._resolve(attributes.get("Nanopore_read_file"))
        self.metadata = attributes

    @staticmethod
    def _is_missing(value):
        """Return True for None and for float NaN.

        NaN shows up in samplesheet-derived metadata (e.g. ``'notes': nan``),
        so an empty path cell must not be handed on as if it were a path.
        """
        return value is None or (isinstance(value, float) and value != value)

    def _resolve(self, file_path):
        """Resolve one path against ``self.input_folder``; missing values give None."""
        if self._is_missing(file_path):
            return None
        return get_abs_file_path(self.input_folder, file_path)

    def _resolve_illumina_read_files(self, read_files):
        """Return ``[R1, R2]`` as absolute paths, or None when no reads were given."""
        if self._is_missing(read_files):
            return None
        if isinstance(read_files, str):
            # Tolerate "R1.fastq.gz, R2.fastq.gz" as well as "R1.fastq.gz,R2.fastq.gz".
            read_files = [entry.strip() for entry in read_files.split(",")]
        elif isinstance(read_files, (list, tuple)):
            read_files = list(read_files)
        else:
            raise TypeError(
                "Illumina_read_files must be a comma-separated string or a list of two paths, "
                f"got {type(read_files).__name__}"
            )
        if len(read_files) < 2:
            # Same exception type as plain indexing would raise, but with an
            # explanation of what is wrong.
            raise IndexError(
                f"Illumina_read_files needs an R1 and an R2 path, got {len(read_files)} path(s)"
            )
        return [self._resolve(read_files[0]), self._resolve(read_files[1])]



def get_abs_file_path(input_folder,file_path):
    """Turn ``file_path`` into an absolute path, optionally relative to ``input_folder``.

    ``None`` is passed straight through. When ``input_folder`` is truthy the
    path is joined onto it first. A warning is printed if the resulting path
    does not exist; the path is returned either way.
    """
    if file_path is None:
        return file_path
    candidate = os.path.join(input_folder, file_path) if input_folder else file_path
    file_path = os.path.abspath(candidate)
    if not os.path.exists(file_path):
        print(f"WARNING: {file_path} does not exist")
    return file_path

#def load_samplesheet_tsv(samplesheet):
#    df = core.get_samplesheet({"path": samplesheet})
#    df.set_index('sample_name', inplace=True)
#    df_dict = df.to_dict("index")
#    return(df_dict)


def _sample_name_from_file(pattern, file_path):
    """Return the ``sample_name`` group of ``pattern`` matched on the basename of ``file_path``.

    Prints a warning and returns None when the pattern does not match.
    """
    found = re.match(pattern, os.path.basename(file_path))
    if found is None:
        print(f"WARNING: could not derive a sample name from {file_path}")
        return None
    return found.group("sample_name")


def get_single_sample_attributes(input_config):
    """Build the attribute dict for one sample from ``input_config``.

    Keys read from ``input_config`` (all optional unless noted):
        sample_name, assembly_file, Illumina_read_files, Nanopore_read_file:
            copied into the result when present. ``Illumina_read_files`` may
            be a comma-separated string or a list.
        file_patterns: regexes with a ``sample_name`` group, keyed
            ``assembly_fasta_file``, ``Illumina_read_file`` and
            ``Nanopore_read_file``. Required only when no sample name is
            given and it has to be derived from a file name.
        samplesheet: path handed to ``core.get_samplesheet`` to load metadata.

    When no sample name is given it is derived from the first available file
    (assembly, then first Illumina read file, then Nanopore file). Metadata
    is taken from the samplesheet row whose ``sample_name`` equals the given
    or derived name; if there is no such row, or no ``sample_name`` column,
    the first row is used and a message is printed.

    Returns:
        dict: the samplesheet metadata (if any) overlaid with the sample name
        and file paths from ``input_config``.
    """
    def provided(key):
        return key in input_config and input_config[key] is not None

    path_keys = ["sample_name","assembly_file","Illumina_read_files","Nanopore_read_file"]
    file_paths = dict((k, input_config[k]) for k in path_keys if k in input_config)

    if not provided("sample_name"):
        derived = None
        if provided("assembly_file"):
            derived = _sample_name_from_file(
                input_config["file_patterns"]["assembly_fasta_file"],
                input_config["assembly_file"],
            )
        elif provided("Illumina_read_files"):
            reads = input_config["Illumina_read_files"]
            first_read = reads.split(",")[0] if isinstance(reads, str) else reads[0]
            derived = _sample_name_from_file(
                input_config["file_patterns"]["Illumina_read_file"], first_read
            )
        elif provided("Nanopore_read_file"):
            # Nanopore files have their own naming pattern; the assembly
            # pattern must not be used for them.
            derived = _sample_name_from_file(
                input_config["file_patterns"]["Nanopore_read_file"],
                input_config["Nanopore_read_file"],
            )
        if derived is not None:
            file_paths["sample_name"] = derived

    # Look metadata up with the name this sample will actually carry, whether
    # it was given explicitly or derived from a file name.
    sample_name = file_paths.get("sample_name")

    attributes = {}
    if provided("samplesheet"):
        df = core.get_samplesheet({"path": input_config["samplesheet"]})
        try:
            indexed = df.set_index('sample_name')
        except KeyError:
            # Only a missing sample_name column lands here.
            records = df.to_dict("records")
            attributes = records[0] if records else {}
            print("No sample_name column in metadata. Using first row as metadata input")
        else:
            df_dict = indexed.to_dict("index")
            if sample_name is not None and sample_name in df_dict:
                attributes = df_dict[sample_name]
            elif df_dict:
                attributes = df_dict[next(iter(df_dict))]
                print("Sample name not provided or not found in metadata. Using first row as metadata input")
    attributes.update(file_paths)
    return attributes



class input_manager:
    """Create ``sample_data`` objects for every sample in a folder or a samplesheet.

    ``input_config`` keys read here:
        input_folder: folder holding the input files. When the key is missing
            or the value is not path-like, the current working directory is
            used instead.
        samplesheet: optional path to a samplesheet, loaded through
            ``core.get_samplesheet`` and indexed by its ``sample_name`` column.
        load_from_samplesheet: truthy to create one sample per samplesheet
            row; otherwise (or when the key is missing) the input folder is
            scanned for files.
        file_patterns: regexes keyed ``assembly_fasta_file``,
            ``Illumina_read_file`` and ``Nanopore_read_file``. Each needs a
            ``sample_name`` group, the Illumina one also a
            ``paired_read_number`` group. Only used for folder scans.

    Iterating over the manager yields the created samples and ``len()``
    returns how many there are.
    """

    def __init__(self,input_config):
        input_folder = self._config_value(input_config, "input_folder")
        try:
            self.base_input_folder = os.path.abspath(input_folder)
        except TypeError:
            # No usable folder configured (missing key, None, False).
            self.base_input_folder = os.path.abspath("./")
        self.get_samplesheet(input_config)
        if self._config_value(input_config, "load_from_samplesheet", False):
            # Without an input folder, paths in the samplesheet are taken
            # relative to the samplesheet's own folder.
            if not input_folder and self.metadata_file is not None:
                self.base_input_folder = os.path.dirname(self.metadata_file)
            self.add_samples_from_samplesheet()
        else:
            self.add_samples_from_folder(input_config)

    ### TODO: figure out how to deal with duplicate sample names. Just a warning?

    @staticmethod
    def _config_value(input_config, key, default=None):
        """Return ``input_config[key]``, or ``default`` when the key is absent."""
        return input_config[key] if key in input_config else default

    def get_samplesheet(self,input_config):
        """Record the samplesheet path and load its rows into ``self.metadata`` (both None if unset)."""
        if "samplesheet" in input_config and input_config["samplesheet"] is not None:
            self.metadata_file = os.path.abspath(input_config["samplesheet"])
            self.metadata = self.get_metadata_from_samplesheet()
        else:
            self.metadata_file = None
            self.metadata = None

    def init_sample(self,attributes):
        """Create one ``sample_data`` whose paths are resolved against the base input folder."""
        return sample_data(attributes, self.base_input_folder)

    def add_samples(self,file_paths):
        """Replace ``self.samples`` with one sample per entry of ``file_paths``.

        ``file_paths`` maps sample name -> attribute dict; the name is added
        to a copy of each dict as ``sample_name``. Returns ``file_paths``.
        """
        self.samples = []
        for sample_name in file_paths:
            attributes = file_paths[sample_name].copy()
            attributes["sample_name"] = sample_name
            self.samples.append(self.init_sample(attributes))
        return file_paths

    def add_samples_from_samplesheet(self):
        """Create the samples listed in the samplesheet."""
        self.add_samples(self.get_metadata_from_samplesheet())

    def add_samples_from_folder(self,input_config):
        """Create the samples discovered in the base input folder."""
        self.add_samples(self.get_input_from_folder(input_config))

    @staticmethod
    def _add_illumina_read(file_paths, sample_name, read_number, file):
        """Register an R1/R2 file so the stored string reads ``"R1,R2"`` whatever the arrival order."""
        if read_number not in ("R1", "R2"):
            return
        entry = file_paths.setdefault(sample_name, {})
        existing = entry.get("Illumina_read_files")
        if existing is None:
            entry["Illumina_read_files"] = file
        elif read_number == "R1":
            entry["Illumina_read_files"] = file + "," + existing
        else:
            entry["Illumina_read_files"] = existing + "," + file

    def get_input_from_folder(self,input_config):
        """Scan the base input folder and group the recognised files per sample.

        Files ending in ``.fa``/``.fasta``/``.fna`` are matched against the
        assembly pattern; files ending in ``.fastq.gz`` against the Illumina
        pattern first and the Nanopore pattern second. Files matching no
        pattern are skipped. If a samplesheet was loaded, its row for a
        sample is used as the base and the discovered files are laid on top.

        Returns:
            dict: sample name -> attribute dict with file names relative to
            the input folder.
        """
        patterns = input_config["file_patterns"]
        illumina_regex = re.compile(patterns["Illumina_read_file"])
        nanopore_regex = re.compile(patterns["Nanopore_read_file"])
        assembly_regex = re.compile(patterns["assembly_fasta_file"])
        file_paths = {}
        # os.listdir order is arbitrary; sort so the sample order is reproducible.
        for file in sorted(os.listdir(self.base_input_folder)):
            if file.endswith((".fa",".fasta",".fna")):
                found = assembly_regex.match(file)
                if found:
                    file_paths.setdefault(found.group("sample_name"), {})["assembly_file"] = file
                else:
                    print(f"WARNING: {file} does not match the assembly file pattern, skipping")
            elif file.endswith(".fastq.gz"):
                found = illumina_regex.match(file)
                if found:
                    self._add_illumina_read(
                        file_paths,
                        found.group("sample_name"),
                        found.group("paired_read_number"),
                        file,
                    )
                    continue
                #### TODO: Logic for nanopore read data, need file naming conventions
                found = nanopore_regex.match(file)
                if found:
                    file_paths.setdefault(found.group("sample_name"), {})["Nanopore_read_file"] = file
        ### update attributes with metadata from samplesheet if provided
        if self.metadata_file is not None:
            for sample_name in file_paths:
                if sample_name in self.metadata:
                    merged = self.metadata[sample_name].copy()
                    merged.update(file_paths[sample_name])
                    file_paths[sample_name] = merged
        return file_paths

    def get_metadata_from_samplesheet(self):
        """Load the samplesheet and return its rows as ``{sample_name: {column: value}}``."""
        df = core.get_samplesheet({"path": self.metadata_file})
        return df.set_index('sample_name').to_dict("index")

    def __iter__(self):
        for sample in self.samples:
            yield sample

    def __len__(self):
        return len(self.samples)




### Testing



In [None]:
# |export

def single_test():
    """Smoke test: build a sample from an assembly plus an Illumina read pair, without an input folder.

    No sample_name is passed. Checks that two Illumina read files are stored
    and that no Nanopore file is set, then prints the sample's attributes.
    """
    attributes = {
        "assembly_file": "examples/GAS-2022-1029.fasta",
        "Illumina_read_files": [
            "examples/GAS-2022-1029_S42_L555_R1_001.fastq.gz",
            "examples/GAS-2022-1029_S42_L555_R2_001.fastq.gz",
        ],
    }
    example_sample = sample_data(attributes, input_folder=False)
    assert len(example_sample.Illumina_read_files) == 2
    assert not example_sample.Nanopore_read_file
    print(vars(example_sample))

def single_test_assembly():
    """Smoke test: a sample given only an assembly file has neither Illumina nor Nanopore reads.

    Prints the sample's attributes after the checks pass.
    """
    assembly_only = {"assembly_file": "examples/GAS-2022-1029.fasta"}
    example_sample = sample_data(assembly_only, input_folder=False)
    assert not example_sample.Illumina_read_files
    assert not example_sample.Nanopore_read_file
    print(vars(example_sample))

def single_test_input_folder():
    """Smoke test: build a named sample whose file names are relative to the ``examples`` folder.

    Checks the sample name, that two Illumina read files are stored and that
    no Nanopore file is set, then prints the sample's attributes. The project
    configuration is not loaded here because nothing in this check reads it.
    """
    example_sample = sample_data({"sample_name":"GAS-2024-0773",
                                       "assembly_file":"GAS-2024-0773.fasta",
                                       "Illumina_read_files":["GAS-2024-0773_S35_L555_R1_001.fastq.gz","GAS-2024-0773_S35_L555_R2_001.fastq.gz"]},
                                       input_folder = "examples",
                                       )
    assert(example_sample.sample_name == "GAS-2024-0773")
    assert(len(example_sample.Illumina_read_files) == 2)
    assert(not example_sample.Nanopore_read_file)
    print(example_sample.__dict__)


def test_folder():
    config = core.get_config()
    input_config =  config["input_manager"]
    input_config["load_from_folder"] = True
    input_config["input_folder"] = "examples/"
    #input_config["samplesheet"] = "samplesheet.tsv"
    test = input_manager(input_config)
    for x in test:
        print(x.__dict__)

def test_folder_with_metadata():
    config = core.get_config()
    input_config =  config["input_manager"]
    input_config["load_from_folder"] = True
    input_config["input_folder"] = "examples/"
    input_config["samplesheet"] = "examples/samplesheet.tsv"
    test = input_manager(input_config)
    for x in test:
        print(x.__dict__)

def samplesheet_test():
    """Smoke test: load the samples listed in ``examples/samplesheet.tsv``.

    For every sample this prints the raw ``Illumina_read_files``,
    ``assembly_file`` and ``sample_name`` metadata entries, followed by the
    sample's ``assembly_file`` and ``Illumina_read_files`` attributes.
    """
    input_config = core.get_config()["input_manager"]
    input_config.update({
        "load_from_samplesheet": True,
        "samplesheet": "examples/samplesheet.tsv",
    })
    for sample in input_manager(input_config):
        for key in ("Illumina_read_files", "assembly_file", "sample_name"):
            print(sample.metadata[key])
        print(sample.assembly_file)
        print(sample.Illumina_read_files)



In [None]:

# Run the three sample_data smoke tests; each one prints the attributes of the sample it built.
single_test()
single_test_assembly()
single_test_input_folder()


{'input_folder': None, 'sample_name': 'sample', 'assembly_file': '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2022-1029.fasta', 'Illumina_read_files': ['/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2022-1029_S42_L555_R1_001.fastq.gz', '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2022-1029_S42_L555_R2_001.fastq.gz'], 'Nanopore_read_file': None, 'metadata': {'assembly_file': 'examples/GAS-2022-1029.fasta', 'Illumina_read_files': ['examples/GAS-2022-1029_S42_L555_R1_001.fastq.gz', 'examples/GAS-2022-1029_S42_L555_R2_001.fastq.gz']}}
{'input_folder': None, 'sample_name': 'sample', 'assembly_file': '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2022-1029.fasta', 'Illumina_read_files': None, 'Nanopore_read_file': None, 'metadata': {'assembly_file': 'examples/GAS-2022-1029.fasta'}}
{'input_folder': '/Users/b246838/Documents/git.repositories/streptofile/examples', 'sample_name': 'GAS-2024-0773', 'assembly_

In [None]:
# Folder-mode smoke run: prints the attribute dict of every sample found in examples/.
test_folder()

{'input_folder': '/Users/b246838/Documents/git.repositories/streptofile/examples', 'sample_name': 'GAS-2023-0253', 'assembly_file': '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253.fasta', 'Illumina_read_files': ['/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253_S74_L555_R1_001.fastq.gz', '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253_S74_L555_R2_001.fastq.gz'], 'Nanopore_read_file': '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253_nanopore.fastq.gz', 'metadata': {'Illumina_read_files': 'GAS-2023-0253_S74_L555_R1_001.fastq.gz,GAS-2023-0253_S74_L555_R2_001.fastq.gz', 'Nanopore_read_file': 'GAS-2023-0253_nanopore.fastq.gz', 'assembly_file': 'GAS-2023-0253.fasta', 'sample_name': 'GAS-2023-0253'}}
{'input_folder': '/Users/b246838/Documents/git.repositories/streptofile/examples', 'sample_name': 'GAS-2022-1029', 'assembly_file': '/Users/b246838/Documents/git.repositories/strep

In [None]:

# Folder-mode smoke run with examples/samplesheet.tsv supplied as metadata; prints each sample's attribute dict.
test_folder_with_metadata()

{'input_folder': '/Users/b246838/Documents/git.repositories/streptofile/examples', 'sample_name': 'GAS-2023-0253', 'assembly_file': '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253.fasta', 'Illumina_read_files': ['/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253_S74_L555_R1_001.fastq.gz', '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253_S74_L555_R2_001.fastq.gz'], 'Nanopore_read_file': '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253_nanopore.fastq.gz', 'metadata': {'Illumina_read_files': 'GAS-2023-0253_S74_L555_R1_001.fastq.gz,GAS-2023-0253_S74_L555_R2_001.fastq.gz', 'Nanopore_read_file': 'GAS-2023-0253_nanopore.fastq.gz', 'assembly_file': 'GAS-2023-0253.fasta', 'organism': 'Streptococcus pyogenes', 'variant': 'M1UK', 'notes': nan, 'sample_name': 'GAS-2023-0253'}}
{'input_folder': '/Users/b246838/Documents/git.repositories/streptofile/examples', 'sample_name': 'GAS-2022-1

In [None]:

# Samplesheet-mode smoke run: prints each sample's raw metadata entries and its resolved file paths.
samplesheet_test()

GAS-2022-1029_S42_L555_R1_001.fastq.gz,GAS-2022-1029_S42_L555_R2_001.fastq.gz
GAS-2022-1029.fasta
GAS-2022-1029
/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2022-1029.fasta
['/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2022-1029_S42_L555_R1_001.fastq.gz', '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2022-1029_S42_L555_R2_001.fastq.gz']
GAS-2023-0253_S74_L555_R1_001.fastq.gz,GAS-2023-0253_S74_L555_R2_001.fastq.gz
GAS-2023-0253.fasta
GAS-2023-0253
/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253.fasta
['/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253_S74_L555_R1_001.fastq.gz', '/Users/b246838/Documents/git.repositories/streptofile/examples/GAS-2023-0253_S74_L555_R2_001.fastq.gz']


In [None]:
# | hide
# Kept in the notebook so that running it top to bottom also writes the exported cells
# (the ones marked `# |export`) into the Python module, instead of leaving the code only in the notebook.
import nbdev

nbdev.nbdev_export()

In [None]:

# Ad-hoc check of get_single_sample_attributes: only the Illumina read paths are set
# explicitly here (the assembly_file entry is commented out); everything else comes
# from the loaded "input_manager" configuration.
config = core.get_config()
input_config = config["input_manager"]
input_config.update({#"assembly_file":"examples/GAS-2022-1029.fasta",
                     "Illumina_read_files": "examples/GAS-2022-1029_S42_L555_R1_001.fastq.gz,examples/GAS-2022-1029_S42_L555_R2_001.fastq.gz"})
get_single_sample_attributes(input_config)


{'sample_name': 'GAS-2022-1029',
 'assembly_file': None,
 'Illumina_read_files': 'examples/GAS-2022-1029_S42_L555_R1_001.fastq.gz,examples/GAS-2022-1029_S42_L555_R2_001.fastq.gz',
 'Nanopore_read_file': None}