In [1]:
from pathlib import Path
from typing import Dict
import yaml

In [56]:
# Read the easy_search.yaml defaults into `defaults`; the loop in a later cell
# iterates it as a mapping of parameter name -> settings.
# NOTE(review): this is an absolute path on one machine, so the cell only runs
# there. Consider resolving it relative to the repository instead.
easy_search_yaml_path = Path("/Users/heispv/Documents/pymseqs/pymmseqs/defaults/easy_search.yaml")

with easy_search_yaml_path.open("r") as yaml_stream:
    defaults = yaml.safe_load(yaml_stream)


In [57]:
import ast

In [125]:
# Seed the argument list with the sub-command name; the loop in a later cell appends to it.
args = ["easy-search"]

In [126]:
def add_arg(
    args,
    flag,
    value,
    default,
):
    """
    Append ``flag`` and its value to ``args`` when the value differs from the default.

    Args:
        args: List that is extended in place.
        flag: Command-line flag text, appended as given.
        value: Value chosen for the flag.
        default: Value to compare against; nothing is appended when it equals ``value``.

    Returns:
        None. ``args`` is modified in place.
    """
    # Nothing to emit when the caller kept the default.
    if value == default:
        return

    # Booleans are rendered as "1"/"0"; every other value goes through str().
    if isinstance(value, bool):
        rendered = "1" if value else "0"
    else:
        rendered = str(value)

    args.extend([flag, rendered])

In [127]:
# Sample user-supplied values used to exercise each parameter category below.
# NOTE(review): "nucleotide.ut" differs from the ".out" spelling seen elsewhere in
# this notebook; presumably deliberate so the value is non-default. Confirm.
twin_argument = "aa:VTML80.out,nucl:nucleotide.ut"
comma_separated_str = "query, target, fident,alnlen,mismatch,gapopen,qend,qstart,tstart,tend,evalue,bits"
split_memory_limit = "2M"
required_args = ["input1.fasta", "input2.fasta", "output.fasta"]
my_v = 1

for param_name, param_info in defaults.items():
    # Positional arguments are emitted once, when the "query_fasta" entry is reached.
    if param_info['required'] and param_name == "query_fasta":
        if isinstance(required_args, list):
            args.extend(required_args)
        else:
            args.append(str(required_args))

    # Equality with False (not truthiness) is kept on purpose so entries whose
    # 'required' is neither truthy nor equal to False are still skipped entirely.
    if param_info['required'] == False:

        # Single-character names become short flags, everything else a long flag.
        cmd_param = (
            f"--{param_name.replace('_', '-')}"
            if len(param_name) > 1
            else f"-{param_name}"
        )
        default_value = param_info['default']

        # Twin arguments
        if param_info['twin']:
            if param_name == "seed_sub_mat" and twin_argument != default_value:
                add_arg(args, cmd_param, twin_argument, default_value)

        # Comma-separated lists are compared item by item after trimming whitespace.
        elif param_info['type'] == "comma_separated_str":
            if param_name == "format_output":
                default_list = [item.strip() for item in default_value.split(",")]
                argument_list = [item.strip() for item in comma_separated_str.split(",")]
                if default_list != argument_list:
                    add_arg(args, cmd_param, ','.join(argument_list), default_value)

        # Plain strings
        elif param_info['type'] == "str":
            if param_name == "split_memory_limit" and split_memory_limit != default_value:
                add_arg(args, cmd_param, split_memory_limit, default_value)

        # Every remaining type
        elif param_name == "v" and my_v != default_value:
            add_arg(args, cmd_param, my_v, default_value)


           



seed_sub_mat: aa:VTML80.out,nucl:nucleotide.out
Not default
split_memory_limit: 0
is default: False

Not default
Checking comma_separated_str
Not default
Not default


In [128]:
# Display the argument list assembled by the loop above.
args

['easy-search',
 'input1.fasta',
 'input2.fasta',
 'output.fasta',
 '--seed-sub-mat',
 'aa:VTML80.out,nucl:nucleotide.ut',
 '--split-memory-limit',
 '2M',
 '--format-output',
 'query,target,fident,alnlen,mismatch,gapopen,qend,qstart,tstart,tend,evalue,bits',
 '-v',
 '1']

In [141]:
# Stand-alone inputs for trying the comma-separated comparison in isolation.
# NOTE: this rebinds `args`, `default_value` and `cmd_param`, which earlier cells also use.
current_value = "1,2,3,     5"
default_value = "1,2,3"
args = []
cmd_param = "-f"

In [142]:
# Split both values on commas and trim each item, so that whitespace differences
# alone do not count as a change from the default.
current_list, default_list = (
    [item.strip() for item in str(raw_value).split(",")]
    for raw_value in (current_value, default_value)
)

# Emit the flag with the whitespace-free list only when the items actually differ.
if current_list != default_list:
    cleaned_value = ",".join(current_list)
    args += [cmd_param, cleaned_value]

In [143]:
# Display the list produced by the comparison cell above.
args

['-f', '1,2,3,5']

In [146]:
import yaml
import re


def parse_argument(argument_line):
    """
    Parse one option line of the help text and return a dictionary describing it.

    The expected layout is the one used by the help text embedded in this notebook::

        --option-name TYPE    free-text description [default]

    Args:
        argument_line: A single help line; surrounding whitespace is ignored.

    Returns:
        None when the line does not start with an option flag, otherwise a dict
        with the keys ``name``, ``required``, ``type``, ``description``,
        ``default``, ``choices``, ``twin`` and ``should_exist``:

        * ``name`` is the flag without its leading dashes and with the remaining
          dashes turned into underscores (``--seed-sub-mat`` -> ``seed_sub_mat``).
        * ``default`` is the content of the trailing ``[...]`` without the
          brackets, or None when there is none or it is empty.
        * ``description`` is the text between the type token and that bracket.
        * ``choices`` is the list of integers that appear as ``N:`` labels in the
          description, or None when there are none.
    """
    line = argument_line.strip()

    # Flag, optional alphabetic type token, then everything else on the line.
    # The flag must have a letter or digit right after the dash(es), so bullet
    # lines such as "- some text" are rejected.
    match = re.match(
        r'(--?[a-zA-Z0-9][a-zA-Z0-9_-]*)(?:\s+([A-Za-z]+))?\s*(.*)',
        line,
    )
    if not match:
        return None

    option_name = match.group(1)
    data_type = match.group(2)
    remainder = match.group(3).strip()

    # The default is the last bracketed token on the line; whatever precedes it
    # is the description.
    default_value = None
    description = remainder
    default_match = re.search(r'\[([^\[\]]*)\]$', remainder)
    if default_match:
        default_value = default_match.group(1).strip()
        description = remainder[:default_match.start()].strip()

    # Determine if required. The flag pattern above cannot contain '<' or '>',
    # so both values are False for every line this function accepts.
    required = "<" in option_name and ">" in option_name
    should_exist = required and "i:" in option_name

    # Extract possible choices: integers used as "N:" labels. The lookbehind keeps
    # digits that are the tail of a word or of a decimal number from matching.
    choices = [int(label) for label in re.findall(r'(?<![\w.])(\d+):', description)]

    return {
        # Strip the leading dashes so the name matches how flags are rebuilt from
        # parameter names elsewhere in this notebook ("--" + name with '_' -> '-').
        "name": option_name.lstrip('-').replace('-', '_'),
        "required": required,
        "type": "path" if data_type and "path" in data_type.lower() else "str",  # Default to 'str' unless it's a path
        "description": description,
        "default": default_value if default_value else None,
        "choices": choices if choices else None,
        "twin": "TWIN" in argument_line,
        "should_exist": should_exist
    }


def parse_documentation(doc_str):
    """
    Parses the documentation string to extract command-line arguments.

    Collection starts after the first line that begins with ``options:``
    (case-insensitive). From there on:

    * a line whose stripped text starts with ``-`` begins a new option;
    * a line indented deeper than the current option line is treated as a
      wrapped continuation and is joined to that option with a single space;
    * an unindented line ending in ``:`` is a section header and is skipped,
      so options in later sections are still collected;
    * any other unindented, non-empty line ends the options block;
    * remaining indented lines are ignored.

    Args:
        doc_str: The full help text.

    Returns:
        A list with the result of ``parse_argument`` for every collected option
        line for which it returned a truthy value.
    """
    argument_lines = []
    in_options = False
    option_indent = None  # indentation of the option line currently being collected

    for raw_line in doc_str.splitlines():
        line = raw_line.strip()

        if line.lower().startswith('options:'):
            in_options = True
            option_indent = None
            continue
        if not in_options or not line:
            continue

        # Indentation has to be measured on the raw line: `line` is already stripped.
        indent = len(raw_line) - len(raw_line.lstrip())

        if line.startswith('-'):
            argument_lines.append(line)
            option_indent = indent
        elif option_indent is not None and indent > option_indent:
            # Wrapped description: keep it with the option it belongs to.
            argument_lines[-1] += ' ' + line
        elif indent == 0 and line.endswith(':'):
            # Section header between option groups.
            option_indent = None
        elif indent == 0:
            break  # End of options section
        else:
            # Indented text that is neither an option nor a continuation.
            option_indent = None

    # Parse the arguments
    parsed_arguments = []
    for arg_line in argument_lines:
        parsed_arg = parse_argument(arg_line)
        if parsed_arg:
            parsed_arguments.append(parsed_arg)

    return parsed_arguments


def generate_yaml(parsed_arguments, output_file):
    """
    Write the parsed arguments to ``output_file`` as a YAML mapping.

    Each argument becomes one top-level entry keyed by its ``name``; when two
    arguments share a name the later one replaces the earlier one.

    Args:
        parsed_arguments: Iterable of dicts as returned by ``parse_argument``.
        output_file: Path of the YAML file to create or overwrite.
    """
    def _as_entry(parsed):
        # A falsy default is stored as "" and falsy choices as None.
        return {
            "required": parsed["required"],
            "type": parsed["type"],
            "default": parsed["default"] or "",
            "choices": parsed["choices"] or None,
            "description": parsed["description"],
            "twin": parsed["twin"],
            "should_exist": parsed["should_exist"],
        }

    yaml_data = {}
    for parsed in parsed_arguments:
        yaml_data[parsed["name"]] = _as_entry(parsed)

    with open(output_file, "w") as handle:
        yaml.dump(yaml_data, handle, default_flow_style=False)



# Example usage

# Load the documentation (replace this with your actual documentation)
documentation = """
usage: mmseqs easy-cluster <i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <o:clusterPrefix> <tmpDir> [options]
 By Martin Steinegger <martin.steinegger@snu.ac.kr>
options: prefilter:                      
 --seed-sub-mat TWIN              Substitution matrix file for k-mer generation [aa:VTML80.out,nucl:nucleotide.out]
 -s FLOAT                         Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive [4.000]
 -k INT                           k-mer length (0: automatically set to optimum) [0]
 --target-search-mode INT         target search mode (0: regular k-mer, 1: similar k-mer) [0]
 --k-score TWIN                   k-mer threshold for generating similar k-mer lists [seq:2147483647,prof:2147483647]
 --alph-size TWIN                 Alphabet size (range 2-21) [aa:21,nucl:5]
 --max-seqs INT                   Maximum results per query sequence allowed to pass the prefilter (affects sensitivity) [20]
 --split INT                      Split input into N equally distributed chunks. 0: set the best split automatically [0]
 --split-mode INT                 0: split target db; 1: split query db; 2: auto, depending on main memory [2]
 --split-memory-limit BYTE        Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0]
 --comp-bias-corr INT             Correct for locally biased amino acid composition (range 0-1) [1]
 --comp-bias-corr-scale FLOAT     Correct for locally biased amino acid composition (range 0-1) [1.000]
 --diag-score BOOL                Use ungapped diagonal scoring during prefilter [1]
 --exact-kmer-matching INT        Extract only exact k-mers for matching (range 0-1) [0]
 --mask INT                       Mask sequences in prefilter stage with tantan: 0: w/o low complexity masking, 1: with low complexity masking [1]
 --mask-prob FLOAT                Mask sequences if probability is above threshold [0.900]
 --mask-lower-case INT            Lowercase letters will be excluded from k-mer search 0: include region, 1: exclude region [0]
 --mask-n-repeat INT              Repeat letters that occur > threshold in a row [0]
 --min-ungapped-score INT         Accept only matches with ungapped alignment score above threshold [15]
 --add-self-matches BOOL          Artificially add entries of queries with themselves (for clustering) [0]
 --spaced-kmer-mode INT           0: use consecutive positions in k-mers; 1: use spaced k-mers [1]
 --spaced-kmer-pattern STR        User-specified spaced k-mer pattern []
 --local-tmp STR                  Path where some of the temporary files will be created []
align:
 -c FLOAT                         List matches above this fraction of aligned (covered) residues (see --cov-mode) [0.800]
 --cov-mode INT                   0: coverage of query and target
                                  1: coverage of target
                                  2: coverage of query
                                  3: target seq. length has to be at least x% of query length
                                  4: query seq. length has to be at least x% of target length
                                  5: short seq. needs to be at least x% of the other seq. length [0]
 --alignment-mode INT             How to compute the alignment: 0 automatic, 1 only score and end_pos, 2 also start_pos and cov, 3 also seq.id, 4 only ungapped alignment [3]
 --alignment-output-mode INT      How to compute the alignment: 0 automatic, 1 only score and end_pos, 2 also start_pos and cov, 3 also seq.id, 4 only ungapped alignment [0]
 --wrapped-scoring BOOL           Double the (nucleotide) query sequence during the scoring process to allow wrapped diagonal scoring around end and start [0]
"""

# Parse the documentation and generate YAML
# (writes 'testing.yaml' relative to the current working directory)
parsed_args = parse_documentation(documentation)
generate_yaml(parsed_args, 'testing.yaml')

# NOTE(review): printed unconditionally; it does not confirm that any option was parsed.
print("YAML file has been generated successfully!")

YAML file has been generated successfully!
