From 40c012c05f23f4e1b1c57557471e969290bcb215 Mon Sep 17 00:00:00 2001 From: Zachary Sethna Date: Fri, 13 Jul 2018 11:42:23 -0400 Subject: [PATCH] Update to 1.0.0 Updated README, and restructured the console scripts --- README.md | 269 ++++-- olga/compute_pgen.py | 782 ++++++++++++++++ olga/compute_single_sequence_pgen.py | 358 -------- ...tic_sequences.py => generate_sequences.py} | 278 +++--- olga/generation_probability.py | 2 +- olga/run_pgen.py | 841 ------------------ olga/utils.py | 34 + setup.py | 7 +- 8 files changed, 1122 insertions(+), 1449 deletions(-) create mode 100755 olga/compute_pgen.py delete mode 100755 olga/compute_single_sequence_pgen.py rename olga/{generate_synthetic_sequences.py => generate_sequences.py} (57%) delete mode 100755 olga/run_pgen.py diff --git a/README.md b/README.md index b1f1abd..81348fc 100644 --- a/README.md +++ b/README.md @@ -4,21 +4,21 @@ OLGA is a python 2.7 software developed to compute the generation probability of ## Motivation -The recent ubiquity in adaptive immune system repertoire sequencing has led to the need for a variety of probabilistic and bioinformatic tools to analyze CDR3 sequence. Of particular interest are the generative models of V(D)J recombination and an inference procedure first introduced in [Murugan 2012](http://www.pnas.org/content/early/2012/09/10/1212755109.short). Recently, a more complete and efficient software package IGoR (Inference and Generation Of Repertoires) was published [(Marcou 2018)](http://www.nature.com/articles/s41467-018-02832-w) and is available on GitHub [(IGoR)](https://github.com/qmarcou/IGoR) and should be used for any generative model inference. +The recent ubiquity in adaptive immune system repertoire sequencing has led to the need for a variety of probabilistic and bioinformatic tools to analyze CDR3 sequence. Of particular interest are the generative models of V(D)J recombination and an inference procedure first introduced in [Murugan 2012](http://www.pnas.org/content/early/2012/09/10/1212755109.short). Recently, an efficient software package [IGoR](https://github.com/qmarcou/IGoR) (Inference and Generation Of Repertoires) was published [(Marcou 2018)](http://www.nature.com/articles/s41467-018-02832-w) and is available on [GitHub](https://github.com/qmarcou/IGoR) and should be used for any generative model inference. This software implements the dynamic programming algorithm discussed *CITE PAPER* to compute the generation probability of amino acid and in-frame nucleotide CDR3 sequences. As adaptive repertoire researchers span the full gamut of technical coding skills, the goal of this README and software is to be as usable and useful as possible for researchers with any programming background. To this end, the core algorithm provided as Python modules with a few command line console scripts provided. The technical user can thus import the Python modules and incorporate them into their own analysis pipeline while the researcher who only requires a cursory analysis can use the provided console scripts. (Furthermore, there is a functional/procedural implementation of the algorithm -- please contact via email if you prefer the modules structured in this manner). -Documentation and examples provided below, and assumes only minimal familiarity with Python/Bash. Ideally, even the interested party who is very uncomfortable with coding should be able to manage from the examples. +Documentation and examples provided below. Ideally, even the interested party who is uncomfortable with coding should be able to manage from the examples. ## Version -Latest released version: 0.1.0 +Latest released version: 1.0.0 ## Installation OLGA is a python 2.7 software which only uses standard python libraries and requires no additional dependencies. It is available on PyPI and can be downloaded and installed through pip: ```pip install olga```. -OLGA is also available on [GitHub](https://github.com/zsethna/OLGA). The command line entry points can be installed by using the setup.py script: ```$ python setup.py install```. If the command line console scripts are not wanted, no installation is necessary and the scripts ```run_pgen.py```, ```compute_single_sequence_pgen.py```, and ```generate_synthetic_sequences.py``` can all be run as executables. +OLGA is also available on [GitHub](https://github.com/zsethna/OLGA). The command line entry points can be installed by using the setup.py script: ```$ python setup.py install```. If the command line console scripts are not wanted, no installation is necessary and the scripts ```compute_pgen.py``` and ```generate_sequences.py``` can be run as executables. @@ -32,9 +32,8 @@ olga/ │ └───olga/ │ __init__.py - │ run_pgen.py - │ compute_single_sequence_pgen.py - │ generate_synthetic_sequences.py + │ compute_pgen.py + │ generate_sequences.py │ generation_probability.py │ preprocess_generative_model_and_data.py │ load_model.py @@ -68,156 +67,232 @@ olga/ ## Command line console scripts and Examples -There are three command line console scripts (the scripts can still be called as executables if OLGA is not installed): -1. olga-compute_single_sequence_pgen - * computes the generation probability of a single sequence. -2. olga-run_pgen - * computes the generation probability of sequences read from a file and outputs the results to another file. -3. olga-generate_synthetic_sequences - * generates sequences and writes them to a file. +There are two command line console scripts (the scripts can still be called as executables if OLGA is not installed): +1. olga-compute_pgen + * computes the generation probability CDR3 sequences according to a generative V(D)J model +2. olga-generate_sequences + * generates CDR3 sequences from a generative V(D)J model For any of them you can execute with the -h or --help flags to get the options. ### Quick Demo After installing OLGA, we offer a quick demonstration of the console scripts. This will demonstrate generating sequences and computing pgens from the default model for human TCR beta chains that ships with OLGA. This demo will generate two files, example_seqs.tsv and example_pgens.tsv, in the directory that these command line calls are made. -1. ```$ olga-compute_single_sequence_pgen CASSLGRDGGHEQYF --humanTCRB``` - * This computes the pgen of the amino acid sequence CASSLGRDGGHEQYF (you should get 7.25342176315e-10) +1. ```$ olga-compute_pgen --humanTRB CASSLGRDGGHEQYF``` + * This computes the pgen of the amino acid sequence CASSLGRDGGHEQYF (you should get ~7.25342176315e-10) -2. ```$ olga-generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3``` - * This generates a file example_seqs.tsv with 1000 human TCRB CDR3 sequences (both the nucleotide sequence and the amino acid sequence) along with the V and J genes used to generate them. +2. ```$ olga-generate_sequences --humanTRB -n 5``` + * Generate 5 human TRB CDR3 sequences (both nucleotide and amino acid sequences) and print to stdout along with the V and J genes used to generate them. -3. ```$ olga-run_pgen example_seqs.tsv example_pgens.tsv --humanTCRB``` - * This reads in the file we just generated, example_seqs.tsv, and computes both the nucleotide sequence and the amino acid sequence pgen for each of the 1000 sequences we generated and writes it to the file example_pgens.tsv. +3. ```$ olga-generate_sequences --humanTRB -o example_seqs.tsv -n 1e2``` + * This generates a file example_seqs.tsv and writes 100 generated human TRB CDR3 sequences. -Did it work? If not, check these issues: -* Do you have Python 2.7? (Should be built into the OS for Unix/Mac) -* Is OLGA installed? - * If OLGA is not 'installed,' the scripts can still be called as executables (or through python): - 1. ```$ ./compute_single_sequence_pgen.py CASSLGRDGGHEQYF --humanTCRB``` - 2. ```$ ./generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3``` - 3. ```$ ./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB``` +4. ```$ olga-compute_pgen --humanTRB -i example_seqs.tsv -m 5``` + * This reads in the first 5 sequences from the file we just generated, example_seqs.tsv, and and computes both the nucleotide sequence and the amino acid sequence pgens for the 5 sequences and prints it to stdout. + +5. ```$ olga-compute_pgen --humanTRB -i example_seqs.tsv -o example_pgens.tsv``` + * This reads in the full file example_seqs.tsv, and computes both the nucleotide sequence and the amino acid sequence pgen for each of the 100 sequences and writes them to the file example_pgens.tsv. A running display will be printed to stdout with the last few lines computed along with time/rate updates. ### Specifying a default V(D)J model (or a custom model folder) All of the console scripts require specifying a V(D)J generative model and genomic data. OLGA ships with 4 default models that can be indicated by flags, or a custom model folder can be indicated. | Options | Description | |------------------------------------------------|--------------------------------------------------| -| **--humanTCRA**, **--human_T_alpha** | Default human T cell alpha chain model (VJ) | -| **--humanTCRB**, **--human_T_beta** | Default human T cell beta chain model (VDJ) | -| **--mouseTCRB**, **--mouse_T_beta** | Default mouse T cell beta chain model (VDJ) | -| **--humanIGH**, **--human_B_heavy** | Default human B cell heavy chain model (VDJ) | -| **-VDJ_model_folder** PATH/TO/MODEL_FOLDER/ | Specifies a custom VDJ model directory | -| **-VJ_model_folder** PATH/TO/MODEL_FOLDER/ | Specifies a custom VDJ model directory | - -Note, if specifying a custom model folder for either a VJ recombination model -(e.g. an alpha or light chain model) or a VDJ recombination model +| **--humanTRA** | Default human T cell alpha chain model (VJ) | +| **--humanTRB** | Default human T cell beta chain model (VDJ) | +| **--mouseTRB** | Default mouse T cell beta chain model (VDJ) | +| **--humanIGH** | Default human B cell heavy chain model (VDJ) | +| **--set_custom_model_VJ** PATH/TO/MODEL_FOLDER/ | Specifies the directory PATH/TO/MODEL_FOLDER/ of a custom VJ generative model | +| **--set_custom_model_VDJ** PATH/TO/MODEL_FOLDER/| Specifies the directory PATH/TO/MODEL_FOLDER/ of a custom VDJ generative model | + +Note, if specifying a folder for a custom VJ recombination model +(e.g. an alpha or light chain model) or a custom VDJ recombination model (e.g. a beta or heavy chain model), the folder must contain the following files with the exact naming convention: -* model_params.txt (IGoR inference param file) -* model_marginals.txt (IGoR inference marginal file) -* V_gene_CDR3_anchors.csv (V residue anchor and functionality file) -* J_gene_CDR3_anchors.csv (J residue anchor and functionality file) +* model_params.txt ([IGoR](https://github.com/qmarcou/IGoR) inference param file) +* model_marginals.txt ([IGoR](https://github.com/qmarcou/IGoR) inference marginal file) +* V_gene_CDR3_anchors.csv (V anchor residue position and functionality file) +* J_gene_CDR3_anchors.csv (J anchor residue position and functionality file) -The console scripts can only read files of the assumed IGoR/anchor.csv syntaxes. In order to read in models from files of other formats, please read the discussion in the Python module section and the documentation of load_model.py. +The console scripts can only read files of the assumed anchor.csv/[IGoR](https://github.com/qmarcou/IGoR) syntaxes. If the advanced user wants to read in models from files of other formats please read the discussion in the Python module section and the documentation of load_model.py. -### compute_single_sequence_pgen.py +### compute_pgen.py (olga-compute_pgen) -syntax: ```olga-compute_single_sequence_pgen CDR3_SEQ **kwarg ``` +This script will compute the generation probabilities (pgens) of sequences, as defined by a specified generative model. The sequences must be TRIMMED TO ONLY THE CDR3 region as defined by the V and J anchor files (default is to INCLUDE the conserved residues of the C in the V region and the F/W in the J region). -This console script is used to compute the generation probability (Pgen) of a single CDR3 sequence as defined by a provided generative V(D)J model. The sequence itself can be an 'amino acid' sequence, an in-frame nucleotide sequence, or an 'amino acid' regular expression. +Each sequence will be determined to be one of: +* 'Amino acid' sequence including any ambiguous amino acid symbols (see +Options for alphabet_file to specify custom symbols) +* Nucleotide sequence (in-frame) +* Regular expression template for 'amino acid sequences' -The CDR3 sequence is the first argument. The program has a minimal sequence parser that will guess if the inputted sequence is an 'amino acid' sequence, a nucleotide sequence, or a regular expression and the printed output will indicate what the parser interpreted the sequence to be. +This script can read in sequences and output pgens in one of three modes (determined automatically based on the input arguments): -It is also possible to restrict the Pgen computation to specified V and/or J genes or alleles (to reflect any alignment outside of the CDR3 region) by using the options -v or -j (see example below). You can specify multiple V or J genes/alleles by using a comma as a delimiter. +1. Pass CDR3 sequences as arguments, output is printed to stdout. +2. Read in CDR3 sequences from a file, output is printed to stdout. +3. Read in CDR3 sequences from a file, output to a file, dynamic display with time updates printed to stdout. -The only required inputs are the sequence and specifying the generative V(D)J model. Additional options can be found by using -h. +Full options can be printed by: ```$ olga-compute_pgen -h``` -Examples: -``` -$ olga-compute_single_sequence_pgen CASSLGRDGGHEQYF --humanTCRB +**Mode 1):** ------------------------------------------------------------------------------------------- -Pgen of the amino acid sequence CASSLGRDGGHEQYF: 7.25342176315e-10 ------------------------------------------------------------------------------------------- +This mode is provided as a way to quickly compute pgen of just a single or couple of sequences. It is not advisable to use this method to set up a loop over a lot of sequences as each call of the script demands the overhead of processing a model. To compute the pgens of many sequences, it is suggested to read the sequences in from a file and use either mode 2 or 3. +It is also possible to condition the Pgen computation on V and J identity by specifying the V or J usages as a mask. However, note that these V/J masks will be applied to ALL of the sequences provided as arguments. Read Options on v_mask and j_mask for more info. -$ olga-compute_single_sequence_pgen TGTGCCAGCAGCTTAGGTAGGGATGGAGGTCACGAGCAGTACTTC --humanTCRB ------------------------------------------------------------------------------------------- -Pgen of the nucleotide sequence TGTGCCAGCAGCTTAGGTAGGGATGGAGGTCACGAGCAGTACTTC: 8.50807296092e-14 -Pgen of the amino acid sequence nt2aa(TGTGCCAGCAGCTTAGGTAGGGATGGAGGTCACGAGCAGTACTTC) = CASSLGRDGGHEQYF: 7.25342176315e-10 ------------------------------------------------------------------------------------------- +Example calls: +* Compute the pgen of an amino acid CDR3 sequence +``` +$ olga-compute_pgen --humanTRB CASSTGQANYGYTF +Pgen of the amino acid sequence CASSTGQANYGYTF: 5.26507446955e-08 +``` +* Compute the pgen of an in-frame nucleotide sequence *and* the amino acid sequence it translates to. +``` +$ olga-compute_pgen --humanTRB TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT +Pgen of the nucleotide sequence TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT: 1.31873701121e-17 +Pgen of the amino acid sequence nt2aa(TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT) = CASSDAQGRNRGTEAFF: 4.70599549953e-13 +``` +* Compute the pgen of a regular expression template of CDR3 amino acid sequences. Note, for a regular expression sequence, provided as an argument, backslashes may be needed to specify the characters {} for the sequence to be read in properly. +``` +$ olga-compute_pgen --humanTRB CASSTGX\{1,5\}QAN[YA]GYTF +Pgen of the regular expression sequence CASSTGX{1,5}QAN[YA]GYTF: 7.588241802e-08 +``` +* Compute the pgens of all three sequences. +``` +$ olga-compute_pgen --humanTRB CASSTGQANYGYTF CASSTGX\{1,5\}QAN[YA]GYTF TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT +Pgen of the amino acid sequence CASSTGQANYGYTF: 5.26507446955e-08 +Pgen of the regular expression sequence CASSTGX{1,5}QAN[YA]GYTF: 7.588241802e-08 +Pgen of the nucleotide sequence TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT: 1.31873701121e-17 +Pgen of the amino acid sequence nt2aa(TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT) = CASSDAQGRNRGTEAFF: 4.70599549953e-13 +``` +* Specify a comma delimited V or J mask to condition the pgen computation on V and/or J gene usage. +``` +$ olga-compute_pgen --humanTRB CASSTGQANYGYTF --v_mask TRBV2,TRBV14 --j_mask TRBJ1-2 +Pgen of the amino acid sequence CASSTGQANYGYTF: 1.39165562898e-09 +``` +It is also possible to restrict the Pgen computation to specified V and/or J genes or alleles (to reflect any alignment outside of the CDR3 region) by using the options -v or -j (see example below). You can specify multiple V or J genes/alleles by using a comma as a delimiter. -$ olga-compute_single_sequence_pgen CASSLX\{0,5\}DG[GAR]HEQYF --humanTCRB +The only required inputs are the sequence and specifying the generative V(D)J model. Additional options can be found by using -h. ------------------------------------------------------------------------------------------- -Pgen of the regular expression sequence CASSLX{0,5}DG[GAR]HEQYF: 2.42813803819e-07 ------------------------------------------------------------------------------------------- +Modes 2/3): +These read in sequences from a file. The script has only minimal file parsing built in, so reading in sequences from a file requires the file to be structured with delimiter spaced values (i.e. the data is organized in columns separated by delimiter like a .tsv or .csv file). Read Options on delimiter for more info. -$ olga-compute_single_sequence_pgen.py CASSLGRDGGHEQYF --humanTCRB -v TRBV11-1 -j TRBJ2-7 +To read in sequences, the index of column of CDR3 sequences is needed. The default is to assume that the sequences to be read in are in the first column (index 0), meaning that a text file with only a sequence on each line will be read in okay by default. Read Options on seq_in for more info. ------------------------------------------------------------------------------------------- -Pgen of the amino acid sequence CASSLGRDGGHEQYF: 6.66613706172e-12 +It is not recommended to read in regular expression sequences from a file. These sequences require enumerating out the amino acid sequences which correspond to them and computing pgen for each of them individually -- this can require a large time cost. Instead consider defining a custom 'amino acid' alphabet to define the symbols used in the regular expressions if possible. Furthermore, BE CAREFUL if reading in from a .csv file -- if commas are used in a regex sequence and comma is used as the delimiter of the .csv file, the sequence will not be read in properly. -(Conditioned on the V and J gene/allele usages: ['TRBV11-1'], ['TRBJ2-7']) ------------------------------------------------------------------------------------------- +If nucleotide sequences are to be read in it is possible to specify if the +output should be the nucleotide sequence Pgen and/or the translated amino acid sequence Pgen (the default is to compute and output both). See Options. -$ olga-compute_single_sequence_pgen CASSLGRDGGHEQYF --humanTCRB -v TRBV2,TRBV11-1 -j TRBJ2-7,not_a_J_gene +It is also possible to condition the Pgen computation on V and J identity by specifying what index the column that V and J masks are stored for each line. -Unfamiliar J gene/allele: not_a_J_gene ------------------------------------------------------------------------------------------- -Pgen of the amino acid sequence CASSLGRDGGHEQYF: 7.93830612446e-12 +Mode 2 does not have a specified output file and so will print the sequences and their pgens to stdout. -(Conditioned on the V and J gene/allele usages: ['TRBV2', 'TRBV11-1'], ['TRBJ2-7']) ------------------------------------------------------------------------------------------- +Mode 3 does have a specified output file. By default in this mode there is a running display of the last few sequences/pgens written to the output file as well as time elapsed, current rate of computation, and estimated time remaining. This display can be disabled (see Options). -``` +As it is rare for datasets to have >> 1e4 unique sequences, parallelization is not built in to compute_pgen. However, there are options to skip N lines of the file and to load at most M sequences so, if wanted, one could build a parallelized wrapper around this script (though it would be recommended to instead just import the modules and build from there). +Example calls (assuming a file example_seqs.tsv with the line structure +ntseq aaseq V_mask J_mask): -### run_pgen.py -syntax: ```olga-run_pgen PATH/TO/INFILE PATH/TO/OUTFILE **kwarg ``` +* Read in the ntseqs and print the ntseq, aaseq = nt2aa(ntseq) and their pgens to stdout +``` +$ olga-compute_pgen -i example_seqs.tsv --humanTRB +``` -This console script is used to compute the generation probabilities (Pgens), as defined by a specified generative V(D)J model, of CDR3 sequences from a file and writes the output to another file. The infile is assumed to be a DELIMITER SPACED VALUE file (e.g. a tab separated file .tsv or a comma separated file .csv, though other delimiters can be specified. For more info read the options/docstring). +* Only read in the first 10 sequences +``` +$ olga-compute_pgen -i example_seqs.tsv --humanTRB -m 10 +``` -The CDR3 sequences can be either 'amino acid' sequences or in-frame nucleotide sequences and are read in as a specific column of each line (as defined by the delimiter). The default index for the sequence column is 0 (the first column) to ensure that a file composed of only a single sequence on each line will be correctly read in by default. +* Read in the ntseqs, write the ntseq, aaseq = nt2aa(ntseq), and their pgens to example_pgens.tsv +``` +$ olga-compute_pgen -i example_seqs.tsv --humanTRB -o example_pgens.tsv +``` -It is also possible to restrict the Pgen computation to specified V and/or J genes or alleles to reflect any alignment outside of the CDR3 region by specifying the index of columns that contain the V/J mask to be read in (syntax of the mask string is similar to what was used for compute_single_sequence_pgen.py - additional information about this syntax in the options or docstring). +* Specify the V/J mask indices +``` +$ olga-compute_pgen -i example_seqs.tsv --humanTRB -o example_pgens.tsv --v_in 2 --j_in 3 +``` -Required inputs are the PATH/TO/INFILE and PATH/TO/OUTFILE (which are the first two arguments), and specifying the generative V(D)J model. +* Read in the aaseq column instead of the ntseq column +``` +$ olga-compute_pgen -i example_seqs.tsv --humanTRB -o example_pgens.tsv --seq_in 1 +``` +| Selected Options | Description | +|------------------------------------------------|--------------------------------------------------| +| **-h**, **--help** | show full Options list and exit | +| **-i** PATH/TO/FILE | read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE | +| **-o** PATH/TO/FILE | write CDR3 sequences and pgens to PATH/TO/FILE | +| **--seq_in** INDEX | specifies sequences to be read in are in column INDEX. Default is index 0 (the first column). | +| **--v_in** INDEX | specifies V_masks are found in column INDEX in the input file. Default is no V mask. | +| **--j_in** INDEX | specifies J_masks are found in column INDEX in the input file. Default is no J mask. | +| **--v_mask** V_MASK | specify V usage to condition Pgen on for seqs read in as arguments. | +| **--j_mask** J_MASK | specify J usage to condition Pgen on for seqs read in as arguments. | +| **-m** N | compute Pgens for at most N sequences. | +| **--lines_to_skip** N | skip the first N lines of the file. Default is 0. | +| **-a** PATH/TO/FILE | specify PATH/TO/FILE defining a custom 'amino acid' alphabet. Default is no custom alphabet. | +| **--seq_type_out** SEQ_TYPE | if read in sequences are ntseqs, declare what type of sequence to compute pgen for. Default is all. Choices: 'all', 'ntseq', 'aaseq' | +| **--display_off** | turn the sequence display off (only applies in write-to-file mode). Default is on. | +| **-d** DELIMITER | declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':' | +| **--raw_delimiter** DELIMITER | declare infile delimiter as a raw string. | +| **--delimiter_out**, **--raw_delimiter_out**, **--gene_mask_delimiter**, **--raw_gene_mask_delimiter** | declare delimiters for the outfile and for gene masks (read in from the columns of v_mask_index and j_mask_index). Same syntax as the infile delimiter. | +| **--comment_delimiter** COMMENT_DELIMITER | character or string to indicate comment or header lines to skip. | -Further documentation of additional options along with more examples can be found in the options/docstring. +### generate_sequences.py (olga-generate_sequences) +This program will generate a file of Monte Carlo sampling from a specified generative V(D)J model. The sequences generated will have NO ERRORS. -Examples: -``` -$ olga-run_pgen example_seqs.tsv example_pgen.tsv --humanTCRB +It is required to specify the number of sequences to be generated. This is done with -n (see Options). -$ olga-run_pgen example_seqs.tsv example_pgen.tsv --humanTCRB -seq_in_index 1 +If a file is specified to write to (using -o, see Options), the generated sequences are written to the file, otherwise they are printed to stdout. -``` +The default is to record both the nucleotide CDR3 sequence and the amino acid CDR3 sequence. This can be specified (see Options). -### generate_synthetic_sequences.py +The V/J genes used to generate each sequence can be recorded or not. Default is to record them, but this can be toggled off with --record_genes_off (see Options). -syntax: ```olga-generate_synthetic_sequences PATH/TO/OUTFILE **kwarg``` +Full options can be printed by: ```$ olga-generate_sequences -h``` -This console script generates a file of CDR3 sequences generated from Monte Carlo sampling of a specified generative V(D)J model. +Example calls: -Required inputs are the PATH/TO/OUTFILE (assumed to be the first argument), the number of sequences to generate, and specifying the generative V(D)J model. +* Print 20 generated sequences to stdout +``` +$ olga-generate_sequences --humanTRB -n 20 +``` -Further documentation of all options along with more examples can be found in the options/docstring. +* Write 200 generated sequences to example_seqs.tsv +``` +$ olga-generate_sequences --humanTRB -o example_seqs.tsv -n 200 +``` -Example: +* Write 20,000 generated sequences to example_seqs.tsv +``` +$ olga-generate_sequences --humanTRB -o example_seqs.tsv -n 2e4 ``` -$ olga-generate_synthetic_sequences data/example_seqs.tsv --humanTCRB -n 1e3 +* Write only the amino acid sequences +``` +$ olga-generate_sequences --humanTRB -o example_seqs.tsv -n 200 --seq_type amino_acid ``` +| Selected Options | Description | +|------------------------------------------------|--------------------------------------------------| +| **-h**, **--help** | show full Options list and exit | +| **-o** PATH/TO/FILE | write CDR3 sequences to PATH/TO/FILE | +| **-n** N | specify the number of sequences to generate. | +| **--seed** SEED | set seed for pseudorandom number generator. Default is to not set a seed. | +| **--seq_type** SEQ_TYPE | declare sequence type for output sequences. Choices: 'all' [default], 'ntseq', 'aaseq' | +| **--time_updates_off** | turn time updates off. +| **--record_genes_off** | turn off recording V and J gene info. +| **-d** DELIMITER | declare delimiter choice. Default is tab for .tsv output files, comma for .csv files, and tab for all others. Choices: 'tab', 'space', ',', ';', ':' | +| **--raw_delimiter** DELIMITER | declare delimiter choice as a raw string. | + ## Using the OLGA modules in a Python script (advanced users) In order to incorporate the core algorithm into an analysis pipeline (or to write your own script wrappers) all that is needed is to import the modules and load up a generative model. Each module defines some classes that only a few methods get called on. @@ -235,13 +310,13 @@ The modules are: The classes with methods that are of interest will be GenerationProbabilityV(D)J (to compute Pgens) and SequenceGenerationV(D)J (to generate sequences). -There is a fair amount of parameter processing that must go on to call these methods, however this is generally all done by instantiating a particular class. An exception to this rule are the classes GenerativeModelV(D)J and GenomicDataV(D)J. Normally the genomic data and model parameters are read in from IGoR inference files (and prepared V and J anchor files that have been prepared), however this is not mandated in order to make it easier for people to adapt the code to read in models/genomic data from other sources. +There is a fair amount of parameter processing that must go on to call these methods, however this is generally all done by instantiating a particular class. An exception to this rule are the classes GenerativeModelV(D)J and GenomicDataV(D)J. Normally the genomic data and model parameters are read in from [IGoR](https://github.com/qmarcou/IGoR) inference files (and prepared V and J anchor files that have been prepared), however this is not mandated in order to make it easier for people to adapt the code to read in models/genomic data from other sources. -Instantiating GenerativeModelV(D)J and GenomicDataV(D)J leaves the attributes as dummies, and calling the methods load_and_process_igor_model and load_igor_genomic_data will load up IGoR files. +Instantiating GenerativeModelV(D)J and GenomicDataV(D)J leaves the attributes as dummies, and calling the methods load_and_process_igor_model and load_igor_genomic_data will load up [IGoR](https://github.com/qmarcou/IGoR) files. If you want to load models/data from other sources, you will need to write your own methods to set the attributes in GenerativeModelV(D)J and GenomicDataV(D)J. Please see the documentation of load_model.py for more details. -Here is an example of loading the default human TCRB model to compute some sequence Pgens and to generate some random CDR3 sequences: +Here is an example of loading the default human TRB model to compute some sequence Pgens and to generate some random CDR3 sequences: ``` >>> import olga.load_model as load_model @@ -306,7 +381,7 @@ symbolB: another,comma,delimited,list ``` etc, where symbolA and symbolB are single characters and aren't one of the protected characters. Please see utils.py for more documentation, and the options of the console scripts to see how to call such alphabet files. -In addition to allowing customization of the 'amino acid' alphabet, we include functions that can parse regular expressions with a limited vocabulary. In particular only [] and {} are supported (the symbol for a Kleene star, \*, must be reserved for stop codons). The sequences corresponding to the regular expression can then be enumerated and their Pgens summed. Note, this can be slow as Pgen must be computed for each of the enumerated sequences independently. If a particular combination of amino acids is being used in a [] frequently you may consider defining a symbol for that combination and adding it to the alphabet. For example the regular expression 'CASS**[ACDEFGHIKLMNPQRSTVWY]**SARPEQFF' will list out 20 sequences, but its Pgen could be computed by considering the single CDR3 sequence 'CASS**X**SARPEQFF' Please see documentation in pgen.py for more info and examples. +In addition to allowing customization of the 'amino acid' alphabet, we include functions that can parse regular expressions with a limited vocabulary. In particular only [] and {} are supported (the symbol for a Kleene star, \*, must be reserved for stop codons). The sequences corresponding to the regular expression can then be enumerated and their Pgens summed. Note, this can be slow as Pgen must be computed for each of the enumerated sequences independently. If a particular combination of amino acids is being used in a [] frequently you may consider defining a symbol for that combination and adding it to the alphabet. For example the regular expression 'CASS**[ACDEFGHIKLMNPQRSTVWY]**SARPEQFF' will list out 20 sequences, but its Pgen could be computed by considering the single CDR3 sequence 'CASS**X**SARPEQFF' Please see documentation in pgen.py for more info and examples. To prevent overflow, the arbitrary length repetition {x,} is cutoff at a maximum of 40 (if you want a higher number {x,y} will go higher than 40). Furthermore, if more than 10,000 sequences correspond to the template, the regular expression will be rejected. ## Contact diff --git a/olga/compute_pgen.py b/olga/compute_pgen.py new file mode 100755 index 0000000..93489b1 --- /dev/null +++ b/olga/compute_pgen.py @@ -0,0 +1,782 @@ +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- +"""Command line script to compute Pgens of CDR3 sequences. + + Copyright (C) 2018 Zachary Sethna + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +This script will compute the generation probabilities (Pgens) of sequences, as +defined by a specified generative model. The sequences must be TRIMMED TO ONLY +THE CDR3 region as defined by the V and J anchor files (default is to INCLUDE +the conserved residues of the C in the V region and the F/W in the J region). + +Each sequence will be determined to be one of: +1) 'Amino acid' sequence including any ambiguous amino acid symbols (see +Options for alphabet_file to specify custom symbols) +2) Nucleotide sequence (in-frame) +3) Regular expression template for 'amino acid sequences' + +There are four default generative models that ship with OLGA and can be +specified with a flag: +--humanTRA (Human T cell alpha chain VJ model) +--humanTRB (Human T cell beta chain VDJ model) +--mouseTRB (Mouse T cell beta chain VDJ model) +--humanIGH (Human B cell heavy chain VDJ model) + +A custom model can also be specified (see below for details). + +This script can read in sequences and output pgens in one of three modes: + +1) Pass CDR3 sequences as arguments, output is printed to stdout. +2) Read in CDR3 sequences from a file, output is printed to stdout. +3) Read in CDR3 sequences from a file, output to a file, dynamic display with +time updates printed to stdout. + + +Mode 1): + +This mode is provided as a way to quickly compute pgen of just a single or +couple of sequences. It is not advisable to use this mode to set up a loop +over a lot of sequences as each call of the script demands the overhead of +processing a model. To compute the pgens of many sequences, it is suggested to +read the sequences in from a file and use either mode 2 or 3. + +It is also possible to condition the Pgen computation on V and J identity by +specifying the V or J usages as a mask. However, note that these V/J masks will +be applied to ALL of the sequences provided as arguments. Read Options on +v_mask and j_mask for more info. + +-------------------------------------------------------------------------------- +Example calls: + +$ olga-compute_pgen --humanTRB CASSTGQANYGYTF +Pgen of the amino acid sequence CASSTGQANYGYTF: 5.26507446955e-08 + +$ olga-compute_pgen --humanTRB TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT +Pgen of the nucleotide sequence TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT: 1.31873701121e-17 +Pgen of the amino acid sequence nt2aa(TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT) = CASSDAQGRNRGTEAFF: 4.70599549953e-13 + +#For a regular expression sequence, backslashes may be needed to specify the +#characters {} if seq read in as args +$ olga-compute_pgen --humanTRB CASSTGX\{1,5\}QAN[YA]GYTF +Pgen of the regular expression sequence CASSTGX{1,5}QAN[YA]GYTF: 7.588241802e-08 + +$ olga-compute_pgen --humanTRB CASSTGQANYGYTF CASSTGX\{1,5\}QAN[YA]GYTF TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT +Pgen of the amino acid sequence CASSTGQANYGYTF: 5.26507446955e-08 +Pgen of the regular expression sequence CASSTGX{1,5}QAN[YA]GYTF: 7.588241802e-08 +Pgen of the nucleotide sequence TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT: 1.31873701121e-17 +Pgen of the amino acid sequence nt2aa(TGTGCCAGCAGTGACGCACAGGGGCGTAATCGTGGGACTGAAGCTTTCTTT) = CASSDAQGRNRGTEAFF: 4.70599549953e-13 + +$ olga-compute_pgen --humanTRB CASSTGQANYGYTF --v_mask TRBV14 --j_mask TRBJ1-2 +Pgen of the amino acid sequence CASSTGQANYGYTF: 5.5513032863e-10 + +-------------------------------------------------------------------------------- + +Modes 2/3): + +These read in sequences from a file. The script has only minimal file parsing +built in, so reading in sequences from a file requires the file to be structured +with delimiter spaced values (i.e. the data is organized in columns separated +by delimiter like a .tsv or .csv file). Read Options on delimiter for more info. + +It is not recommended to read in regular expression sequences from a file. These +sequences require enumerating out the amino acid sequences which correspond to +them and computing pgen for each of them individually -- this can require a +large time cost. Instead consider defining a custom 'amino acid' alphabet to +define the symbols used in the regular expressions if possible. Furthermore, +BE CAREFUL if reading in from a .csv file -- if commas are used in a regex +sequence and comma is used as the delimiter of the .csv file, the sequence will +not be read in properly. + +If nucleotide sequences are to be read in it is possible to specify if the +output should be the nucleotide sequence Pgen and/or the translated amino acid +sequence Pgen (the default is to compute and output both). See Options. + +To read in sequences, the index of column of CDR3 sequences is needed. The +default is to assume that the sequences to be read in are in the first column +(index 0), meaning that a text file with only a sequence on each line will be +read in okay by default. Read Options on seq_in for more info. + +It is also possible to condition the Pgen computation on V and J identity by +specifying what index the column that V and J masks are stored for each line. + +Mode 2 does not have a specified output file and so will print the sequences +and their pgens to stdout. + +Mode 3 does have a specified output file. By default in this mode there is a +running display of the last few sequences/pgens written to the output file as +well as time elapsed, current rate of computation, and estimated time remaining. +This display can be disabled (see Options). + +As it is rare for datasets to be >> 1e4, parallelization is not built in. +However, there are options to skip N lines of the file and to load at most M +sequences so, if wanted, one could build a parallelized wrapper around this +script (though it would be recommended to instead just import the modules and +build from there). + +-------------------------------------------------------------------------------- +Example calls (assumes a file example_seqs.tsv with the line structure +ntseq aaseq V_mask J_mask): + +#Reads in the ntseqs, prints the ntseq, aaseq and their pgens to stdout +$ olga-compute_pgen -i example_seqs.tsv --humanTRB + +#Reads in the ntseqs, writes the ntseq, aaseq and their pgens to example_pgens.tsv +$ olga-compute_pgen -i example_seqs.tsv --humanTRB -o example_pgens.tsv + +#Specifies the V/J mask indices +$ olga-compute_pgen -i example_seqs.tsv --humanTRB -o example_pgens.tsv --v_in 2 --j_in 3 + +#Reads in the aaseq column +$ olga-compute_pgen -i example_seqs.tsv --humanTRB -o example_pgens.tsv --seq_in 1 + +#Only runs the first 100 sequences +$ olga-compute_pgen -i example_seqs.tsv --humanTRB -o example_pgens.tsv --seq_in 1 -m 100 +-------------------------------------------------------------------------------- + +To specify a custom model folder use: +--set_custom_model_VJ (generative model of VJ recombination, e.g. T alpha chain) +--set_custom_model_VDJ (generative model of VDJ recombination, e.g. T beta chain) + +Note, if specifying a custom model folder for either a VJ recombination model +(e.g. an alpha or light chain model) or a VDJ recombination model +(e.g. a beta or heavy chain model), the folder must contain the following files +with the exact naming convention: + +model_params.txt (IGoR inference param file) +model_marginals.txt (IGoR inference marginal file) +V_gene_CDR3_anchors.csv (V residue anchor and functionality file) +J_gene_CDR3_anchors.csv (J residue anchor and functionality file) + +-------------------------------------------------------------------------------- +Options: + -h, --help show this help message and exit + --humanTRA, --human_T_alpha + use default human TRA model (T cell alpha chain) + --humanTRB, --human_T_beta + use default human TRB model (T cell beta chain) + --mouseTRB, --mouse_T_beta + use default mouse TRB model (T cell beta chain) + --humanIGH, --human_B_heavy + use default human IGH model (B cell heavy chain) + --set_custom_model_VDJ=PATH/TO/FOLDER/ + specify PATH/TO/FOLDER/ for a custom VDJ generative + model + --set_custom_model_VJ=PATH/TO/FOLDER/ + specify PATH/TO/FOLDER/ for a custom VJ generative + model + -i PATH/TO/FILE, --infile=PATH/TO/FILE + read in CDR3 sequences (and optionally V/J masks) from + PATH/TO/FILE + -o PATH/TO/FILE, --outfile=PATH/TO/FILE + write CDR3 sequences and pgens to PATH/TO/FILE + --seq_in=INDEX, --seq_index=INDEX + specifies sequences to be read in are in column INDEX. + Default is index 0 (the first column). + --v_in=INDEX, --v_mask_index=INDEX + specifies V_masks are found in column INDEX in the + input file. Default is no V mask. + --j_in=INDEX, --j_mask_index=INDEX + specifies J_masks are found in column INDEX in the + input file. Default is no J mask. + --v_mask=V_MASK specify V usage to condition Pgen on for seqs read in + as arguments. + --j_mask=J_MASK specify J usage to condition Pgen on for seqs read in + as arguments. + -m N, --max_number_of_seqs=N + compute Pgens for at most N sequences. + --lines_to_skip=N skip the first N lines of the file. Default is 0. + -a PATH/TO/FILE, --alphabet_filename=PATH/TO/FILE + specify PATH/TO/FILE defining a custom 'amino acid' + alphabet. Default is no custom alphabet. + --seq_type_out=SEQ_TYPE + if read in sequences are ntseqs, declare what type of + sequence to compute pgen for. Default is all. Choices: + 'all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid' + --skip_off, --skip_empty_off + stop skipping empty or blank sequences/lines (if for + example you want to keep line index fidelity between + the infile and outfile). + --display_off turn the sequence display off (only applies in safe + mode). Default is on. + --num_lines_for_display=N + N lines of the output file are displayed when sequence + display is on. Also used to determine the number of + sequences to average over for speed and time + estimates. + --time_updates_off turn time updates off (only applies when sequence + display is disabled). + --seqs_per_time_update=N + specify the number of sequences between time updates. + Default is 1e5. + -d DELIMITER, --delimiter=DELIMITER + declare infile delimiter. Default is tab for .tsv + input files, comma for .csv files, and any whitespace + for all others. Choices: 'tab', 'space', ',', ';', ':' + --raw_delimiter=DELIMITER + declare infile delimiter as a raw string. + --delimiter_out=DELIMITER_OUT + declare outfile delimiter. Default is tab for .tsv + output files, comma for .csv files, and the infile + delimiter for all others. Choices: 'tab', 'space', + ',', ';', ':' + --raw_delimiter_out=DELIMITER_OUT + declare for the delimiter outfile as a raw string. + --gene_mask_delimiter=GENE_MASK_DELIMITER + declare gene mask delimiter. Default comma unless + infile delimiter is comma, then default is a + semicolon. Choices: 'tab', 'space', ',', ';', ':' + --raw_gene_mask_delimiter=GENE_MASK_DELIMITER + declare delimiter of gene masks as a raw string. + --comment_delimiter=COMMENT_DELIMITER + character or string to indicate comment or header + lines to skip. +-------------------------------------------------------------------------------- + + +@author: zacharysethna +""" + +#Function assumes that it is in the same directory that the folder app/ is +#in (which should contain all the modules imported). + +import os +import sys +import time + +import olga.load_model as load_model +import olga.generation_probability as generation_probability +from olga.utils import nt2aa, determine_seq_type +from optparse import OptionParser + +#Need to determine what mode to run in. +#1) Sequences as args +#2) 'Safe mode', i.e. reading from file, writing to file +#3) not 'safe mode'. i.e. reading from file, printing to stdout + + +def main(): + """Compute Pgens from a file and output to another file.""" + + parser = OptionParser(conflict_handler="resolve") + + parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)') + parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)') + parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)') + parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)') + parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model') + parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model') + + parser.add_option('-i', '--infile', dest = 'infile_name',metavar='PATH/TO/FILE', help='read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE') + parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences and pgens to PATH/TO/FILE') + parser.add_option('--seq_in', '--seq_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. Default is index 0 (the first column).') + + parser.add_option('--v_in', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', help='specifies V_masks are found in column INDEX in the input file. Default is no V mask.') + parser.add_option('--j_in', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', help='specifies J_masks are found in column INDEX in the input file. Default is no J mask.') + + parser.add_option('--v_mask', type='string', dest='V_mask', help='specify V usage to condition Pgen on for seqs read in as arguments.') + parser.add_option('--j_mask', type='string', dest='J_mask', help='specify J usage to condition Pgen on for seqs read in as arguments.') + + parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='compute Pgens for at most N sequences.') + parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.') + parser.add_option('-a', '--alphabet_filename', dest='alphabet_filename', metavar='PATH/TO/FILE', help="specify PATH/TO/FILE defining a custom 'amino acid' alphabet. Default is no custom alphabet.") + parser.add_option('--seq_type_out', type='choice',metavar='SEQ_TYPE', dest='seq_type_out', choices=['all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'], help="if read in sequences are ntseqs, declare what type of sequence to compute pgen for. Default is all. Choices: 'all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'") + parser.add_option('--skip_off','--skip_empty_off', action='store_true', dest = 'skip_empty', default=True, help='stop skipping empty or blank sequences/lines (if for example you want to keep line index fidelity between the infile and outfile).') + + parser.add_option('--display_off', action='store_false', dest='display_seqs', default=True, help='turn the sequence display off (only applies in write-to-file mode). Default is on.') + parser.add_option('--num_lines_for_display', type='int', metavar='N', default = 50, dest='num_lines_for_display', help='N lines of the output file are displayed when sequence display is on. Also used to determine the number of sequences to average over for speed and time estimates.') + parser.add_option('--time_updates_off', action='store_false', dest='time_updates', default=True, help='turn time updates off (only applies when sequence display is disabled).') + parser.add_option('--seqs_per_time_update', type='float', metavar='N', default = 100, dest='seqs_per_time_update', help='specify the number of sequences between time updates. Default is 1e5.') + + parser.add_option('-d', '--delimiter', type='choice', dest='delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'") + parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.") + parser.add_option('--delimiter_out', type='choice', dest='delimiter_out', choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'") + parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare for the delimiter outfile as a raw string.") + parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default comma unless infile delimiter is comma, then default is a semicolon. Choices: 'tab', 'space', ',', ';', ':'") + parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.") + parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.") + + + (options, args) = parser.parse_args() + + #Check that the model is specified properly + main_folder = os.path.dirname(__file__) + + default_models = {} + default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'] + default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'] + default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'] + default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'] + + num_models_specified = sum([1 for x in default_models.keys() + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)]) + + if num_models_specified == 1: #exactly one model specified + try: + d_model = [x for x in default_models.keys() if getattr(options, x)][0] + model_folder = default_models[d_model][0] + recomb_type = default_models[d_model][1] + except IndexError: + if options.vdj_model_folder: #custom VDJ model specified + model_folder = options.vdj_model_folder + recomb_type = 'VDJ' + elif options.vj_model_folder: #custom VJ model specified + model_folder = options.vj_model_folder + recomb_type = 'VJ' + elif num_models_specified == 0: + print 'Need to indicate generative model.' + print 'Exiting...' + return -1 + elif num_models_specified > 1: + print 'Only specify one model' + print 'Exiting...' + return -1 + + #Check that all model and genomic files exist in the indicated model folder + if not os.path.isdir(model_folder): + print 'Check pathing... cannot find the model folder: ' + model_folder + print 'Exiting...' + return -1 + + params_file_name = os.path.join(model_folder,'model_params.txt') + marginals_file_name = os.path.join(model_folder,'model_marginals.txt') + V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv') + J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv') + + for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]: + if not os.path.isfile(x): + print 'Cannot find: ' + x + print 'Please check the files (and naming conventions) in the model folder ' + model_folder + print 'Exiting...' + return -1 + + alphabet_filename = options.alphabet_filename #used if a custom alphabet is to be specified + if alphabet_filename is not None: + if not os.path.isfile(alphabet_filename): + print 'Cannot find custom alphabet file: ' + infile_name + print 'Exiting...' + return -1 + + #Load up model based on recomb_type + #VDJ recomb case --- used for TCRB and IGH + if recomb_type == 'VDJ': + genomic_data = load_model.GenomicDataVDJ() + genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) + generative_model = load_model.GenerativeModelVDJ() + generative_model.load_and_process_igor_model(marginals_file_name) + pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data, alphabet_filename) + #VJ recomb case --- used for TCRA and light chain + elif recomb_type == 'VJ': + genomic_data = load_model.GenomicDataVJ() + genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) + generative_model = load_model.GenerativeModelVJ() + generative_model.load_and_process_igor_model(marginals_file_name) + pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data, alphabet_filename) + + aa_alphabet = ''.join(pgen_model.codons_dict.keys()) + + if options.infile_name is not None: + infile_name = options.infile_name + + if not os.path.isfile(infile_name): + print 'Cannot find input file: ' + infile_name + print 'Exiting...' + return -1 + + if options.outfile_name is not None: + outfile_name = options.outfile_name + if os.path.isfile(outfile_name): + if not raw_input(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']: + print 'Exiting...' + return -1 + + #Parse delimiter + delimiter = options.delimiter + if delimiter is None: #Default case + if options.infile_name is None: + delimiter = '\t' + elif infile_name.endswith('.tsv'): #parse TAB separated value file + delimiter = '\t' + elif infile_name.endswith('.csv'): #parse COMMA separated value file + delimiter = ',' + else: + try: + delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter] + except KeyError: + pass #Other string passed as the delimiter. + + #Parse delimiter_out + delimiter_out = options.delimiter_out + if delimiter_out is None: #Default case + if delimiter is None: + delimiter_out = '\t' + else: + delimiter_out = delimiter + if options.outfile_name is None: + pass + elif outfile_name.endswith('.tsv'): #output TAB separated value file + delimiter_out = '\t' + elif outfile_name.endswith('.csv'): #output COMMA separated value file + delimiter_out = ',' + else: + try: + delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out] + except KeyError: + pass #Other string passed as the delimiter. + + #Parse gene_delimiter + gene_mask_delimiter = options.gene_mask_delimiter + if gene_mask_delimiter is None: #Default case + gene_mask_delimiter = ',' + if delimiter == ',': + gene_mask_delimiter = ';' + else: + try: + gene_mask_delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[gene_mask_delimiter] + except KeyError: + pass #Other string passed as the delimiter. + + + #More options + time_updates = options.time_updates + display_seqs = options.display_seqs + num_lines_for_display = options.num_lines_for_display + seq_in_index = options.seq_in_index #where in the line the sequence is after line.split(delimiter) + lines_to_skip = options.lines_to_skip #one method of skipping header + comment_delimiter = options.comment_delimiter #another method of skipping header + seqs_per_time_update = options.seqs_per_time_update + max_number_of_seqs = options.max_number_of_seqs + V_mask_index = options.V_mask_index #Default is not conditioning on V identity + J_mask_index = options.J_mask_index #Default is not conditioning on J identity + skip_empty = options.skip_empty + + seq_type_out = options.seq_type_out #type of pgens to be computed. Can be ntseq, aaseq, or both + if seq_type_out is not None: + seq_type_out = {'all': None, 'ntseq': 'ntseq', 'nucleotide': 'ntseq', 'aaseq': 'aaseq', 'amino_acid': 'aaseq'}[seq_type_out] + + if options.infile_name is None: #No infile specified -- args should be the input seqs + print_warnings = True + seqs = args + seq_types = [determine_seq_type(seq, aa_alphabet) for seq in seqs] + unrecognized_seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is None] + if len(unrecognized_seqs) > 0 and print_warnings: + print 'The following sequences/arguments were not recognized: ' + ', '.join(unrecognized_seqs) + seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is not None] + seq_types = [seq_type for seq_type in seq_types if seq_type is not None] + + + #Format V and J masks -- uniform for all argument input sequences + try: + V_mask = options.V_mask.split(',') + unrecognized_v_genes = [v for v in V_mask if v not in pgen_model.V_mask_mapping.keys()] + V_mask = [v for v in V_mask if v in pgen_model.V_mask_mapping.keys()] + if len(unrecognized_v_genes) > 0: + print 'These V genes/alleles are not recognized: ' + ', '.join(unrecognized_v_genes) + if len(V_mask) == 0: + print 'No recognized V genes/alleles in the provided V_mask. Continuing without conditioning on V usage.' + V_mask = None + except AttributeError: + V_mask = options.V_mask #Default is None, i.e. not conditioning on V identity + + try: + J_mask = options.J_mask.split(',') + unrecognized_j_genes = [j for j in J_mask if j not in pgen_model.J_mask_mapping.keys()] + J_mask = [j for j in J_mask if j in pgen_model.J_mask_mapping.keys()] + if len(unrecognized_j_genes) > 0: + print 'These J genes/alleles are not recognized: ' + ', '.join(unrecognized_j_genes) + if len(J_mask) == 0: + print 'No recognized J genes/alleles in the provided J_mask. Continuing without conditioning on J usage.' + J_mask = None + except AttributeError: + J_mask = options.J_mask #Default is None, i.e. not conditioning on J identity + + print '' + start_time = time.time() + for seq, seq_type in zip(seqs, seq_types): + if seq_type == 'aaseq': + c_pgen = pgen_model.compute_aa_CDR3_pgen(seq, V_mask, J_mask, print_warnings) + print 'Pgen of the amino acid sequence ' + seq + ': ' + str(c_pgen) + print '' + elif seq_type == 'regex': + c_pgen = pgen_model.compute_regex_CDR3_template_pgen(seq, V_mask, J_mask, print_warnings) + print 'Pgen of the regular expression sequence ' + seq + ': ' + str(c_pgen) + print '' + elif seq_type == 'ntseq': + if seq_type_out is None or seq_type_out == 'ntseq': + c_pgen_nt = pgen_model.compute_nt_CDR3_pgen(seq, V_mask, J_mask, print_warnings) + print 'Pgen of the nucleotide sequence ' + seq + ': ' + str(c_pgen_nt) + if seq_type_out is None or seq_type_out == 'aaseq': + c_pgen_aa = pgen_model.compute_aa_CDR3_pgen(nt2aa(seq), V_mask, J_mask, print_warnings) + print 'Pgen of the amino acid sequence nt2aa(' + seq + ') = ' + nt2aa(seq) + ': ' + str(c_pgen_aa) + print '' + + c_time = time.time() - start_time + if c_time > 86400: #more than a day + c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 3600: #more than an hr + c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 60: #more than a min + c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) + else: + c_time_str = '%.2f seconds.'%(c_time) + + print 'Completed pgen computation in: ' + c_time_str + + else: #Read sequences in from file + print_warnings = False #Most cases of reading in from file should have warnings disabled + seqs = [] + seq_types = [] + V_usage_masks = [] + J_usage_masks = [] + + infile = open(infile_name, 'r') + + for i, line in enumerate(infile): + if comment_delimiter is not None: #Default case -- no comments/header delimiter + if line.startswith(comment_delimiter): #allow comments + continue + if i < lines_to_skip: + continue + + if delimiter is None: #Default delimiter is any whitespace + split_line = line.split() + else: + split_line = line.split(delimiter) + + #Find the seq + try: + seq = split_line[seq_in_index].strip() + if len(seq.strip()) == 0: + if skip_empty: + continue + else: + seqs.append(seq) #keep the blank seq as a placeholder + seq_types.append('aaseq') + else: + seqs.append(seq) + seq_types.append(determine_seq_type(seq, aa_alphabet)) + except IndexError: #no index match for seq + if skip_empty and len(line.strip()) == 0: + continue + print 'seq_in_index is out of range' + print 'Exiting...' + infile.close() + return -1 + + #Find and format V_usage_mask + if V_mask_index is None: + V_usage_masks.append(None) #default mask + else: + try: + V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter) + #check that all V gene/allele names are recognized + if all([v in pgen_model.V_mask_mapping for v in V_usage_mask]): + V_usage_masks.append(V_usage_mask) + else: + print str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names" + print 'Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if not v in pgen_model.V_mask_mapping.keys()]) + print 'Exiting...' + infile.close() + return -1 + except IndexError: #no index match for V_mask_index + print 'V_mask_index is out of range' + print 'Exiting...' + infile.close() + return -1 + + #Find and format J_usage_mask + if J_mask_index is None: + J_usage_masks.append(None) #default mask + else: + try: + J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter) + #check that all V gene/allele names are recognized + if all([j in pgen_model.J_mask_mapping for j in J_usage_mask]): + J_usage_masks.append(J_usage_mask) + else: + print str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names" + print 'Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if not j in pgen_model.J_mask_mapping.keys()]) + print 'Exiting...' + infile.close() + return -1 + except IndexError: #no index match for J_mask_index + print 'J_mask_index is out of range' + print 'Exiting...' + infile.close() + return -1 + + if max_number_of_seqs is not None: + if len(seqs) >= max_number_of_seqs: + break + + + unrecognized_seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is None] + if len(unrecognized_seqs) > 0 and len(unrecognized_seqs) < len(seqs): + if print_warnings or options.outfile_name is not None: + print 'Some strings read in were not parsed as sequences -- they will be omitted.' + print 'Examples of improperly read strings: ' + for unrecognized_seq in unrecognized_seqs[:10]: + print unrecognized_seq + seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is not None] + V_usage_masks = [V_usage_mask for i, V_usage_mask in enumerate(V_usage_masks) if seq_types[i] is not None] + seq_types = [seq_type for seq_type in seq_types if seq_type is not None] + elif len(unrecognized_seqs) > 0 and len(unrecognized_seqs) == len(seqs): + print 'None of the read in strings were parsed as sequences. Check input file.' + print 'Examples of improperly read strings:' + for unrecognized_seq in unrecognized_seqs[:10]: + print unrecognized_seq + print 'Exiting...' + return -1 + + infile.close() + + + if options.outfile_name is not None: #OUTFILE SPECIFIED, allow printed info/display + + print 'Successfully read in and formatted ' + str(len(seqs)) + ' sequences and any V or J usages.' + if display_seqs: + sys.stdout.write('\r'+'Continuing to Pgen computation in 3... ') + sys.stdout.flush() + time.sleep(0.4) + sys.stdout.write('\r'+'Continuing to Pgen computation in 2... ') + sys.stdout.flush() + time.sleep(0.4) + sys.stdout.write('\r'+'Continuing to Pgen computation in 1... ') + sys.stdout.flush() + time.sleep(0.4) + else: + print 'Continuing to Pgen computation.' + print_warnings = True #Display is off, can print warnings + + if display_seqs: + lines_for_display = [] + times_for_speed_calc = [time.time()] + + outfile = open(outfile_name, 'w') + start_time = time.time() + for i, seq in enumerate(seqs): + if seq_types[i] == 'aaseq': + #Compute Pgen and print out + c_pgen_line = seq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + if seq_types[i] == 'regex': + #Compute Pgen and print out + c_pgen_line = seq + delimiter_out + str(pgen_model.compute_regex_CDR3_template_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + elif seq_types[i] == 'ntseq': + ntseq = seq + if len(ntseq) % 3 == 0: #inframe sequence + aaseq = nt2aa(ntseq) + #Compute Pgen and print out based on recomb_type and seq_type_out + if seq_type_out is None: + c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + delimiter_out + aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + elif seq_type_out == 'ntseq': + c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + elif seq_type_out == 'aaseq': + c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + else: #out of frame sequence -- Pgens are 0 and use 'out_of_frame' for aaseq + if seq_type_out is None: + c_pgen_line = ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0' + elif seq_type_out == 'ntseq': + c_pgen_line = ntseq + delimiter_out + '0' + elif seq_type_out == 'aaseq': + c_pgen_line = 'out_of_frame' + delimiter_out + '0' + + outfile.write(c_pgen_line + '\n') + + #Print time update + if display_seqs: + cc_time = time.time() + c_time = cc_time - start_time + times_for_speed_calc = [cc_time] + times_for_speed_calc[:num_lines_for_display] + c_avg_speed = (len(times_for_speed_calc)-1)/float(times_for_speed_calc[0] - times_for_speed_calc[-1]) + + #eta = ((len(seqs) - (i+1))/float(i+1))*c_time + + eta = (len(seqs) - (i+1))/c_avg_speed + + lines_for_display = [c_pgen_line] + lines_for_display[:num_lines_for_display] + + + c_time_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(c_time)/3600).rjust(3), repr((int(c_time)/60)%60).rjust(2), repr(int(c_time)%60).rjust(2)) + eta_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(eta)/3600).rjust(3), repr((int(eta)/60)%60).rjust(2), repr(int(eta)%60).rjust(2)) + time_str = 'Time to compute Pgen on %s seqs: %s \nEst. time for remaining %s seqs: %s'%(repr(i+1).rjust(9), c_time_str, repr(len(seqs) - (i + 1)).rjust(9), eta_str) + speed_str = 'Current Pgen computation speed: %s seqs/min'%(repr(round((len(times_for_speed_calc)-1)*60/float(times_for_speed_calc[0] - times_for_speed_calc[-1]), 2)).rjust(8)) + display_str = '\n'.join(lines_for_display[::-1]) + '\n' + '-'*80 + '\n' + time_str + '\n' + speed_str + '\n' + '-'*80 + print '\033[2J' + display_str + elif (i+1)%seqs_per_time_update == 0 and time_updates: + c_time = time.time() - start_time + eta = ((len(seqs) - (i+1))/float(i+1))*c_time + if c_time > 86400: #more than a day + c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 3600: #more than an hr + c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 60: #more than a min + c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) + else: + c_time_str = '%.2f seconds.'%(c_time) + + if eta > 86400: #more than a day + eta_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(eta)/86400, (int(eta)/3600)%24, (int(eta)/60)%60, eta%60) + elif eta > 3600: #more than an hr + eta_str = '%d hours, %d minutes, and %.2f seconds.'%((int(eta)/3600)%24, (int(eta)/60)%60, eta%60) + elif eta > 60: #more than a min + eta_str = '%d minutes and %.2f seconds.'%((int(eta)/60)%60, eta%60) + else: + eta_str = '%.2f seconds.'%(eta) + + print 'Pgen computed for %d sequences in: %s Estimated time remaining: %s'%(i+1, c_time_str, eta_str) + + c_time = time.time() - start_time + if c_time > 86400: #more than a day + c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 3600: #more than an hr + c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 60: #more than a min + c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) + else: + c_time_str = '%.2f seconds.'%(c_time) + print 'Completed Pgen computation for %d sequences: in %s'%(len(seqs), c_time_str) + + outfile.close() + + else: #NO OUTFILE -- print directly to stdout + start_time = time.time() + for i, seq in enumerate(seqs): + if seq_types[i] == 'aaseq': + #Compute Pgen and print out + c_pgen_line = seq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + if seq_types[i] == 'regex': + #Compute Pgen and print out + c_pgen_line = seq + delimiter_out + str(pgen_model.compute_regex_CDR3_template_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + elif seq_types[i] == 'ntseq': + ntseq = seq + if len(ntseq) % 3 == 0: #inframe sequence + aaseq = nt2aa(ntseq) + #Compute Pgen and print out based on recomb_type and seq_type_out + if seq_type_out is None: + c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + delimiter_out + aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + elif seq_type_out == 'ntseq': + c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + elif seq_type_out == 'aaseq': + c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + else: #out of frame sequence -- Pgens are 0 and use 'out_of_frame' for aaseq + if seq_type_out is None: + c_pgen_line = ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0' + elif seq_type_out == 'ntseq': + c_pgen_line = ntseq + delimiter_out + '0' + elif seq_type_out == 'aaseq': + c_pgen_line = 'out_of_frame' + delimiter_out + '0' + + print c_pgen_line + +if __name__ == '__main__': main() diff --git a/olga/compute_single_sequence_pgen.py b/olga/compute_single_sequence_pgen.py deleted file mode 100755 index 127fc8b..0000000 --- a/olga/compute_single_sequence_pgen.py +++ /dev/null @@ -1,358 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- -""" -Command line script for Pgen computation of a single sequence. - - Copyright (C) 2018 Zachary Sethna - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - - -DON'T USE THIS FUNCTION TO LOOP OVER A LOT OF SEQUENCES. This function loads -up and processes a full model to compute Pgen of a single sequence, and thus -pays a nontrivial overhead time cost. If many sequences need to be run it is -better to use run_pgen.py or to write your own wrapper. - -The sequence which is read in must be TRIMMED TO ONLY THE CDR3 region as -defined by the V and J anchor files (default is to INCLUDE the conserved -residues of the C in the V region and the F/W in the J region). - -It is also possible to condition the Pgen computation on V and J identity by -specifying the V or J usages. See documentation below for an explanation of the -syntax and examples of calls. - -Required input: - -1) Sequence -THIS IS ASSUMED TO BE THE FIRST ARGUMENT (see example calls) - -2) Generative model used to define the generation probability of a sequence. -Flags for default models: - ---humanTCRA or --human_T_alpha (default Human T cell alpha chain model) ---humanTCRB or --human_T_beta (default Human T cell beta chain model) ---mouseTCRB or --mouse_T_beta (default Mouse T cell beta chain model) ---humanIGH or --human_B_heavy (default Human B cell heavy chain model) - -In order to use these default model flags, the program must be executed from -the same directory where models/ is. For example, models/human_T_beta/ should -be an okay pathing to the humanTCRB model folder. - -To specify a custom model folder use: - ---VJ_model_folder (a generative model of VJ recombination, e.g. T alpha chain) ---VDJ_model_folder (a generative model of VDJ recombination, e.g. T beta chain) - -Note, if specifying a custom model folder for either a VJ recombination model -(e.g. an alpha or light chain model) or a VDJ recombination model -(e.g. a beta or heavy chain model), the folder must contain the following files -with the exact naming convention: - -model_params.txt (IGoR inference param file) -model_marginals.txt (IGoR inference marginal file) -V_gene_CDR3_anchors.csv (V residue anchor and functionality file) -J_gene_CDR3_anchors.csv (J residue anchor and functionality file) -------------------------------------------------------------------------------- -Example call (example calls are formatted as executed functions instead of the -console script entry point. The arguments are identical in either case.) - -``` -$ ./compute_single_sequence_pgen.py CARQGALYEQYF --humanTCRB - ------------------------------------------------------------------------------------------- -Pgen of the amino acid sequence CARQGALYEQYF: 1.73865590343e-08 ------------------------------------------------------------------------------------------- -``` - - -------------------------------------------------------------------------------- -Options: - -h, --help show this help message and exit - --humanTCRA, --human_T_alpha - use default human TCRA model (T cell alpha chain) - --humanTCRB, --human_T_beta - use default human TCRB model (T cell beta chain) - --mouseTCRB, --mouse_T_beta - use default mouse TCRB model (T cell beta chain) - --humanIGH, --human_B_heavy - use default human IGH model (B cell heavy chain) - --VDJ_model_folder=PATH/TO/FOLDER/ - specify PATH/TO/FOLDER/ for a custom VDJ generative - model - --VJ_model_folder=PATH/TO/FOLDER/ - specify PATH/TO/FOLDER/ for a custom VJ generative - model - -v V_MASK, --v_mask=V_MASK - specify V usage to condition Pgen on - -j J_MASK, --j_mask=J_MASK - specify J usage to condition Pgen on - -a PATH/TO/FILE, --alphabet_filename=PATH/TO/FILE - specify PATH/TO/FILE defining a custom 'amino acid' - alphabet - -t, --time print time to compute Pgen. - --seq_type=SEQ_TYPE declare sequence type. Infers seq_type by default. - Choices: 'ntseq', 'nucleotide', 'aaseq', 'amino_acid', - 'regex', 'regular_expression' - --print_warnings_off turn Pgen print warnings off - -Example call with optional flags: - -``` -$ ./compute_single_sequence_pgen.py C[AI]LXXGSNY[KQ]L[TI][FW] --seq_type regex --humanTCRA -v 23/DV6,21 -j 53,33 -t -output: ------------------------------------------------------------------------------------------- -Pgen of the regular expression sequence C[AI]LXXGSNY[KQ]L[TI][FW]: 1.38162611621e-06 - -(Conditioned on the V and J gene/allele usages: ['23/DV6', '21'], ['53', '33']) ------------------------------------------------------------------------------------------- -Completed pgen computation in: 0.01 seconds. - -``` - -@author: zacharysethna - -""" - -#Function assumes that it is in the same directory that the folder app/ is -#in (which should contain all the modules imported). - -import os -import sys -import time - -import olga.load_model as load_model -import olga.generation_probability as generation_probability -from olga.utils import nt2aa -from optparse import OptionParser - -def main(): - """Compute Pgen of a single sequence.""" - - - parser = OptionParser(conflict_handler="resolve") - - parser.add_option('--humanTCRA', '--human_T_alpha', action='store_true', dest='humanTCRA', default=False, help='use default human TCRA model (T cell alpha chain)') - parser.add_option('--humanTCRB', '--human_T_beta', action='store_true', dest='humanTCRB', default=False, help='use default human TCRB model (T cell beta chain)') - parser.add_option('--mouseTCRB', '--mouse_T_beta', action='store_true', dest='mouseTCRB', default=False, help='use default mouse TCRB model (T cell beta chain)') - parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)') - parser.add_option('--VDJ_model_folder', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model') - parser.add_option('--VJ_model_folder', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model') - parser.add_option('-v', '--v_mask', type='string', dest='V_mask', help='specify V usage to condition Pgen on') - parser.add_option('-j', '--j_mask', type='string', dest='J_mask', help='specify J usage to condition Pgen on') - parser.add_option('-a', '--alphabet_filename', dest='alphabet_filename', metavar='PATH/TO/FILE', help="specify PATH/TO/FILE defining a custom 'amino acid' alphabet") - parser.add_option('-t', '--time', action='store_true', dest='print_c_time', default=False, help='print time to compute Pgen.') - parser.add_option('--seq_type', type='choice', dest='seq_type', choices=['ntseq', 'nucleotide', 'aaseq', 'amino_acid', 'regex', 'regular_expression'], help="declare sequence type. Infers seq_type by default. Choices: 'ntseq', 'nucleotide', 'aaseq', 'amino_acid', 'regex', 'regular_expression'") - parser.add_option('--print_warnings_off', action='store_false', dest="print_warnings", default=True, help='turn Pgen print warnings off') - - - - (options, args) = parser.parse_args() - - #SEQUENCE IS THE FIRST ARGUMENT - try: - seq = args[0] - except IndexError: - print 'Need to specify the CDR3 sequence as the first argument!' - print 'Exiting...' - return -1 - - if len(seq.strip()) == 0: - print 'Need to specify the CDR3 sequence as the first argument!' - print 'Exiting...' - return -1 - - main_folder = os.path.dirname(__file__) - - default_models = {} - default_models['humanTCRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'] - default_models['humanTCRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'] - default_models['mouseTCRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'] - default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'] - - num_models_specified = sum([1 for x in default_models.keys() + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)]) - - if num_models_specified == 1: #exactly one model specified - try: - d_model = [x for x in default_models.keys() if getattr(options, x)][0] - model_folder = default_models[d_model][0] - recomb_type = default_models[d_model][1] - except IndexError: - if options.vdj_model_folder: #custom VDJ model specified - model_folder = options.vdj_model_folder - recomb_type = 'VDJ' - elif options.vj_model_folder: #custom VJ model specified - model_folder = options.vj_model_folder - recomb_type = 'VJ' - elif num_models_specified == 0: - print 'Need to indicate generative model.' - print 'Exiting...' - return -1 - elif num_models_specified > 1: - print 'Only specify one model' - print 'Exiting...' - return -1 - - #Check that all model and genomic files exist in the indicated model folder - if not os.path.isdir(model_folder): - print 'Check pathing... cannot find the model folder: ' + model_folder - print 'Exiting...' - return -1 - - params_file_name = os.path.join(model_folder,'model_params.txt') - marginals_file_name = os.path.join(model_folder,'model_marginals.txt') - V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv') - J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv') - - for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]: - if not os.path.isfile(x): - print 'Cannot find: ' + x - print 'Please check the files (and naming conventions) in the model folder ' + model_folder - print 'Exiting...' - return -1 - - - #Optional flags - alphabet_filename = options.alphabet_filename #used if a custom alphabet is to be specified - seq_type = options.seq_type - print_warnings = options.print_warnings - print_c_time = options.print_c_time - - #Format V and J masks - try: - V_mask = options.V_mask.split(',') - except AttributeError: - V_mask = options.V_mask #Default is None, i.e. not conditioning on V identity - - try: - J_mask = options.J_mask.split(',') - except AttributeError: - J_mask = options.J_mask #Default is None, i.e. not conditioning on J identity - - if alphabet_filename is not None: - if not os.path.isfile(alphabet_filename): - print 'Cannot find extended alphabet file: ' + alphabet_filename - print 'Exiting...' - return -1 - - #Default --- infer type of sequence. - if seq_type is None: - if all([x in 'ACGTacgt' for x in seq]): - seq_type = 'ntseq' - elif any([x in '[]{}0123456789,' for x in seq]): - seq_type = 'regex' - else: - seq_type = 'aaseq' - else: - #Must be one of these keys - seq_type = {'ntseq': 'ntseq', 'nucleotide': 'ntseq', 'aaseq': 'aaseq', 'amino_acid': 'aaseq', 'regex': 'regex', 'regular_expression': 'regex'}[seq_type] - - #Vet seq_type - if seq_type in ['aaseq', 'regex']: - if all([x in 'ACGTacgt' for x in seq]): - print 'It looks like the sequence read in is a nucleotide sequence.' - if raw_input('Change read in seq_type to nucleotide? (y/n)? ') in ['y', 'yes']: - seq_type = 'ntseq' - else: - print 'Okay... you are the boss... chances are the Pgen will be 0.' - - if seq_type in ['ntseq', 'aaseq']: - if any([x in '[]{}0123456789,' for x in seq]): - print 'It looks like sequence read in includes symbols of a regular expression sequence.' - if raw_input('Change read in seq_type to regular expression? (y/n)? ') in ['y', 'yes']: - seq_type = 'regex' - else: - print 'Okay... you are the boss... cannot read the sequence in that case.' - - - #Load up model based on recomb_type - #VDJ recomb case --- used for TCRB and IGH - if recomb_type == 'VDJ': - genomic_data = load_model.GenomicDataVDJ() - genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) - generative_model = load_model.GenerativeModelVDJ() - generative_model.load_and_process_igor_model(marginals_file_name) - pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data, alphabet_filename) - #VJ recomb case --- used for TCRA and light chain - elif recomb_type == 'VJ': - genomic_data = load_model.GenomicDataVJ() - genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) - generative_model = load_model.GenerativeModelVJ() - generative_model.load_and_process_igor_model(marginals_file_name) - pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data, alphabet_filename) - - if seq_type == 'ntseq' and not all([x in 'ACGTacgt' for x in seq]): - if all([x in pgen_model.codons_dict.keys() for x in seq]): - print 'It looks like sequence read in is an amino acid sequence.' - if raw_input('Change read in seq_type to amino acid? (y/n)? ') in ['y', 'yes']: - seq_type = 'aaseq' - else: - print 'Okay... you are the boss... cannot read the sequence in that case.' - - print '' - if seq_type == 'regex' and all([x in pgen_model.codons_dict.keys() for x in seq]): - seq_type = 'aaseq' #amino acid sequence provided... just change the seq_type for a better printed output - - start_time = time.time() - if seq_type == 'aaseq': - pgen = pgen_model.compute_aa_CDR3_pgen(seq, V_mask, J_mask, print_warnings) - elif seq_type == 'regex': - pgen = pgen_model.compute_regex_CDR3_template_pgen(seq, V_mask, J_mask, print_warnings) - elif seq_type == 'ntseq': - pgen_nt = pgen_model.compute_nt_CDR3_pgen(seq, V_mask, J_mask, print_warnings) - pgen_aa = pgen_model.compute_aa_CDR3_pgen(nt2aa(seq), V_mask, J_mask, print_warnings) - - c_time = time.time() - start_time - if c_time > 86400: #more than a day - c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 3600: #more than an hr - c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 60: #more than a min - c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) - else: - c_time_str = '%.2f seconds.'%(c_time) - - print '-'*90 - if seq_type == 'aaseq': - print 'Pgen of the amino acid sequence ' + seq + ': ' + str(pgen) - elif seq_type == 'regex': - print 'Pgen of the regular expression sequence ' + seq + ': ' + str(pgen) - elif seq_type == 'ntseq': - print 'Pgen of the nucleotide sequence ' + seq + ': ' + str(pgen_nt) - print 'Pgen of the amino acid sequence nt2aa(' + seq + ') = ' + nt2aa(seq) + ': ' + str(pgen_aa) - - if V_mask is None: - V_mask = [] - if J_mask is None: - J_mask = [] - - V_mask = [v for v in V_mask if v in pgen_model.V_mask_mapping.keys()] - J_mask = [j for j in J_mask if j in pgen_model.J_mask_mapping.keys()] - - if not len(V_mask) == 0 and not len(J_mask) == 0: - print '' - print '(Conditioned on the V and J gene/allele usages: ' + str(V_mask) + ', ' + str(J_mask) + ')' - elif not len(V_mask) == 0: - print '' - print '(Conditioned on the V gene/allele usages: ' + str(V_mask) + ')' - elif not len(J_mask) == 0: - print '' - print '(Conditioned on the J gene/allele usages: ' + str(J_mask) + ')' - - print '-'*90 - if print_c_time: - print 'Completed pgen computation in: ' + c_time_str - - print '' - -if __name__ == '__main__': main() diff --git a/olga/generate_synthetic_sequences.py b/olga/generate_sequences.py similarity index 57% rename from olga/generate_synthetic_sequences.py rename to olga/generate_sequences.py index 025088d..2579366 100755 --- a/olga/generate_synthetic_sequences.py +++ b/olga/generate_sequences.py @@ -18,39 +18,18 @@ along with this program. If not, see . This program will generate a file of Monte Carlo sampling from a specified -genomic model (either the default models that ship with the scripts, or a -custom IGoR model inference). The sequences generated will have NO ERRORS. -Input arguments are specified by an immediately preceding flag. Capitalization -is irrelevant for the flags and limited option string arguments (but can matter -for arbitrary string arguments). +generative V(D)J model. The sequences generated will have NO ERRORS. -The following are required inputs: - -1) File name for output file: PATH/TO/OUTFILE -THIS IS ASSUMED TO BE THE FIRST ARGUMENT (see example calls) - -2) Number of sequences to generate. --num_seqs or -n : int or float - If a float number is given, it will be rounded. However, it is convenient - to be able to specify numbers in scientific notation, e.g. 1e7 is easier than - 10000000. - -3) Generative model used to define the generation probability of a sequence. -Flags for default models: - ---humanTCRA or --human_T_alpha (default Human T cell alpha chain model) ---humanTCRB or --human_T_beta (default Human T cell beta chain model) ---mouseTCRB or --mouse_T_beta (default Mouse T cell beta chain model) ---humanIGH or --human_B_heavy (default Human B cell heavy chain model) - -In order to use these default model flags, the program must be executed from -the same directory where models/ is. For example, models/human_T_beta/ should -be an okay pathing to the humanTCRB model folder. +There are four default generative models that ship with OLGA and can be +specified with a flag: +--humanTRA (Human T cell alpha chain VJ model) +--humanTRB (Human T cell beta chain VDJ model) +--mouseTRB (Mouse T cell beta chain VDJ model) +--humanIGH (Human B cell heavy chain VDJ model) To specify a custom model folder use: - ---VJ_model_folder (a generative model of VJ recombination, e.g. T alpha chain) ---VDJ_model_folder (a generative model of VDJ recombination, e.g. T beta chain) +--set_custom_model_VJ (generative model of VJ recombination, e.g. T alpha chain) +--set_custom_model_VDJ (generative model of VDJ recombination, e.g. T beta chain) Note, if specifying a custom model folder for either a VJ recombination model (e.g. an alpha or light chain model) or a VDJ recombination model @@ -61,22 +40,44 @@ model_marginals.txt (IGoR inference marginal file) V_gene_CDR3_anchors.csv (V residue anchor and functionality file) J_gene_CDR3_anchors.csv (J residue anchor and functionality file) -------------------------------------------------------------------------------- -Example call (example calls are formatted as executed functions instead of the -console script entry point. The arguments are identical in either case.) -#Generate 1,000 seqs from the human TCRB default model, delimiter is tab -./generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3 + +It is required to specify the number of sequences to be generated. This is done +with -n (see Options). + +If a file is specified to write to (using -o, see Options), the generated +sequences are written to the file, otherwise they are printed to stdout. + +The default is to record both the nucleotide CDR3 sequence and the amino acid +CDR3 sequence. This can be specified (see Options). + +The V/J genes used to generate each sequence can be recorded or not. Default is +to record them, but this can be toggled off with --record_genes_off (see Options) ------------------------------------------------------------------------------- +Example calls: + +#Print 20 generated sequences to stdout +$ olga-generate_sequences --humanTRB -n 20 + +#Write the 200 generated sequences to example_seqs.tsv +$ olga-generate_sequences --humanTRB -o example_seqs.tsv -n 200 + +#Write 20,000 generated sequences to example_seqs.tsv +$ olga-generate_sequences --humanTRB -o example_seqs.tsv -n 2e4 + +#Write only the amino acid sequences +$ olga-generate_sequences --humanTRB -o example_seqs.tsv -n 200 --seq_type amino_acid + +-------------------------------------------------------------------------------- Options: -h, --help show this help message and exit - --humanTCRA, --human_T_alpha - use default human TCRA model (T cell alpha chain) - --humanTCRB, --human_T_beta - use default human TCRB model (T cell beta chain) - --mouseTCRB, --mouse_T_beta - use default mouse TCRB model (T cell beta chain) + --humanTRA, --human_T_alpha + use default human TRA model (T cell alpha chain) + --humanTRB, --human_T_beta + use default human TRB model (T cell beta chain) + --mouseTRB, --mouse_T_beta + use default mouse TRB model (T cell beta chain) --humanIGH, --human_B_heavy use default human IGH model (B cell heavy chain) --VDJ_model_folder=PATH/TO/FOLDER/ @@ -85,6 +86,8 @@ --VJ_model_folder=PATH/TO/FOLDER/ specify PATH/TO/FOLDER/ for a custom VJ generative model + -o PATH/TO/FILE, --outfile=PATH/TO/FILE + write CDR3 sequences to PATH/TO/FILE -n N, --num_seqs=N specify the number of sequences to generate. --seed=SEED set seed for pseudorandom number generator. Default is to not set a seed. @@ -119,32 +122,7 @@ Unless the anchor positions are changed, LEAVE THE DEFAULT. The default string is 'FW'. -------------------------------------------------------------------------------- -Example calls with options - -#Output a .csv file with delimiter = comma. -./generate_synthetic_sequences.py example_seqs.csv --VDJ_model_folder models/human_T_beta/ -n 1e3 - -#Output the amino acid CDR3 seqs and not the nucleotide sequences -./generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3 --seq_type amino_acid - -#Don't record the V and J genes/alleles used to generate the sequences -./generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3 --record_genes_off - -#Observe the time updates --- default is 1e5: -./generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3 --seqs_per_time_update 100 - -#Don't give time updates -./generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3 --seqs_per_time_update 100 --time_updates_off - -#Set the pseudo-random number generator seed to 100 -- should be repeatable. -./generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3 --seed 100 - -#Specify that the conserved J residue is an F -./generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3 --conserved_J_residues 'F' - -#Change the delimiter of the output file to space ' '. -./generate_synthetic_sequences.py example_seqs.tsv --humanTCRB -n 1e3 --delimiter space +-------------------------------------------------------------------------------- @author: zacharysethna @@ -168,12 +146,14 @@ def main(): parser = OptionParser(conflict_handler="resolve") - parser.add_option('--humanTCRA', '--human_T_alpha', action='store_true', dest='humanTCRA', default=False, help='use default human TCRA model (T cell alpha chain)') - parser.add_option('--humanTCRB', '--human_T_beta', action='store_true', dest='humanTCRB', default=False, help='use default human TCRB model (T cell beta chain)') - parser.add_option('--mouseTCRB', '--mouse_T_beta', action='store_true', dest='mouseTCRB', default=False, help='use default mouse TCRB model (T cell beta chain)') + parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)') + parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)') + parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)') parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)') parser.add_option('--VDJ_model_folder', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model') parser.add_option('--VJ_model_folder', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model') + parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences to PATH/TO/FILE') + parser.add_option('-n', '--num_seqs', type='float', metavar='N', default = 0, dest='num_seqs_to_generate', help='specify the number of sequences to generate.') parser.add_option('--seed', type='int', dest='seed', help='set seed for pseudorandom number generator. Default is to not set a seed.') parser.add_option('--seqs_per_time_update', type='float', default = 100000, dest='seqs_per_time_update', help='specify the number of sequences between time updates. Default is 1e5') @@ -190,9 +170,9 @@ def main(): main_folder = os.path.dirname(__file__) default_models = {} - default_models['humanTCRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'] - default_models['humanTCRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'] - default_models['mouseTCRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'] + default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'] + default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'] + default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'] default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'] num_models_specified = sum([1 for x in default_models.keys() + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)]) @@ -236,23 +216,13 @@ def main(): print 'Exiting...' return -1 - #OUTFILE IS THE FIRST ARGUMENT - try: - outfile_name = args[0] - except IndexError: - print 'Need to specify outfile as the first argument!' - print 'Exiting...' - return -1 - if len(outfile_name.strip()) == 0: - print 'Need to specify outfile as the first argument!' - print 'Exiting...' - return -1 - - if os.path.isfile(outfile_name): - if not raw_input(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']: - print 'Exiting...' - return -1 + if options.outfile_name is not None: + outfile_name = options.outfile_name + if os.path.isfile(outfile_name): + if not raw_input(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']: + print 'Exiting...' + return -1 #Parse arguments @@ -263,15 +233,15 @@ def main(): print 'Exiting...' return -1 - #Parse default delimiter delimiter = options.delimiter if delimiter is None: delimiter = '\t' - if outfile_name.endswith('.tsv'): - delimiter = '\t' - elif outfile_name.endswith('.csv'): - delimiter = ',' + if options.outfile_name is not None: + if outfile_name.endswith('.tsv'): + delimiter = '\t' + elif outfile_name.endswith('.csv'): + delimiter = ',' else: try: delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter] @@ -285,11 +255,9 @@ def main(): time_updates = options.time_updates conserved_J_residues = options.conserved_J_residues - if options.seed is not None: np.random.seed(options.seed) - #VDJ recomb case --- used for TCRB and IGH if recomb_type == 'VDJ': genomic_data = load_model.GenomicDataVDJ() @@ -309,57 +277,71 @@ def main(): V_gene_names = [V[0].split('*')[0] for V in genomic_data.genV] J_gene_names = [J[0].split('*')[0] for J in genomic_data.genJ] - - outfile = open(outfile_name, 'w') - - print 'Starting sequence generation... ' - start_time = time.time() - for i in range(num_seqs_to_generate): - ntseq, aaseq, V_in, J_in = seq_gen.gen_rnd_prod_CDR3(conserved_J_residues) - if seq_type == 'all': #default, include both ntseq and aaseq - current_line_out = ntseq + delimiter + aaseq - elif seq_type == 'ntseq': #only record ntseq - current_line_out = ntseq - elif seq_type == 'aaseq': #only record aaseq - current_line_out = aaseq - - if record_genes: - current_line_out += delimiter + V_gene_names[V_in] + delimiter + J_gene_names[J_in] - outfile.write(current_line_out + '\n') - - if (i+1)%seqs_per_time_update == 0 and time_updates: - c_time = time.time() - start_time - eta = ((num_seqs_to_generate - (i+1))/float(i+1))*c_time - if c_time > 86400: #more than a day - c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 3600: #more than an hr - c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 60: #more than a min - c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) - else: - c_time_str = '%.2f seconds.'%(c_time) - - if eta > 86400: #more than a day - eta_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(eta)/86400, (int(eta)/3600)%24, (int(eta)/60)%60, eta%60) - elif eta > 3600: #more than an hr - eta_str = '%d hours, %d minutes, and %.2f seconds.'%((int(eta)/3600)%24, (int(eta)/60)%60, eta%60) - elif eta > 60: #more than a min - eta_str = '%d minutes and %.2f seconds.'%((int(eta)/60)%60, eta%60) - else: - eta_str = '%.2f seconds.'%(eta) - - print '%d sequences generated in %s Estimated time remaining: %s'%(i+1, c_time_str, eta_str) - - c_time = time.time() - start_time - if c_time > 86400: #more than a day - c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 3600: #more than an hr - c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 60: #more than a min - c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) - else: - c_time_str = '%.2f seconds.'%(c_time) - print 'Completed generated all %d sequences in %s'%(num_seqs_to_generate, c_time_str) - outfile.close() + if options.outfile_name is not None: + outfile = open(outfile_name, 'w') + + print 'Starting sequence generation... ' + start_time = time.time() + for i in range(num_seqs_to_generate): + ntseq, aaseq, V_in, J_in = seq_gen.gen_rnd_prod_CDR3(conserved_J_residues) + if seq_type == 'all': #default, include both ntseq and aaseq + current_line_out = ntseq + delimiter + aaseq + elif seq_type == 'ntseq': #only record ntseq + current_line_out = ntseq + elif seq_type == 'aaseq': #only record aaseq + current_line_out = aaseq + + if record_genes: + current_line_out += delimiter + V_gene_names[V_in] + delimiter + J_gene_names[J_in] + outfile.write(current_line_out + '\n') + + if (i+1)%seqs_per_time_update == 0 and time_updates: + c_time = time.time() - start_time + eta = ((num_seqs_to_generate - (i+1))/float(i+1))*c_time + if c_time > 86400: #more than a day + c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 3600: #more than an hr + c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 60: #more than a min + c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) + else: + c_time_str = '%.2f seconds.'%(c_time) + + if eta > 86400: #more than a day + eta_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(eta)/86400, (int(eta)/3600)%24, (int(eta)/60)%60, eta%60) + elif eta > 3600: #more than an hr + eta_str = '%d hours, %d minutes, and %.2f seconds.'%((int(eta)/3600)%24, (int(eta)/60)%60, eta%60) + elif eta > 60: #more than a min + eta_str = '%d minutes and %.2f seconds.'%((int(eta)/60)%60, eta%60) + else: + eta_str = '%.2f seconds.'%(eta) + + print '%d sequences generated in %s Estimated time remaining: %s'%(i+1, c_time_str, eta_str) + + c_time = time.time() - start_time + if c_time > 86400: #more than a day + c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 3600: #more than an hr + c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) + elif c_time > 60: #more than a min + c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) + else: + c_time_str = '%.2f seconds.'%(c_time) + print 'Completed generating all %d sequences in %s'%(num_seqs_to_generate, c_time_str) + outfile.close() + + else: #print to stdout + for i in range(num_seqs_to_generate): + ntseq, aaseq, V_in, J_in = seq_gen.gen_rnd_prod_CDR3(conserved_J_residues) + if seq_type == 'all': #default, include both ntseq and aaseq + current_line_out = ntseq + delimiter + aaseq + elif seq_type == 'ntseq': #only record ntseq + current_line_out = ntseq + elif seq_type == 'aaseq': #only record aaseq + current_line_out = aaseq + + if record_genes: + current_line_out += delimiter + V_gene_names[V_in] + delimiter + J_gene_names[J_in] + print current_line_out if __name__ == '__main__': main() diff --git a/olga/generation_probability.py b/olga/generation_probability.py index 600ac06..5a7a6ac 100644 --- a/olga/generation_probability.py +++ b/olga/generation_probability.py @@ -564,7 +564,7 @@ def list_seqs_from_regex(self, regex_seq, print_warnings = True, raise_overload_ new_expression = [int(ex.strip('{}').split(',')[0]), default_max_reps, syms] if new_expression[0] > new_expression[1]: if print_warnings: - print 'Check regex syntax --- should be {min, max}' + print 'Check regex syntax --- should be {min,max}' return [] max_num_seqs *= sum([len(syms)**n for n in range(new_expression[0], new_expression[1]+1)])/len(syms) #print new_expression diff --git a/olga/run_pgen.py b/olga/run_pgen.py deleted file mode 100755 index 8c4ec82..0000000 --- a/olga/run_pgen.py +++ /dev/null @@ -1,841 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- -"""Command line script to compute Pgens of sequences from file. - - Copyright (C) 2018 Zachary Sethna - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -This program will compute the generation probabilities (Pgens) of sequences -read in from a file as defined by a specified generative model. -This program has only minimal parsing built in, so the file read in must be -structured with delimiter spaced values (i.e. the data is organized in columns -separated by delimiter like a .tsv or .csv file). Read the Options on delimiter -for more info. - -The default is to assume that the sequences to be read in are in the first -column (index 0), meaning that a text file with only a sequence on each line -will be read in okay by default. - -The sequences which are read in must be TRIMMED TO ONLY THE CDR3 region as -defined by the V and J anchor files (default is to INCLUDE the conserved -residues of the C in the V region and the F/W in the J region). - -It is also possible to condition the Pgen computation on V and J identity by -specifying what index column the V and J gene/allele information is stored. -Look at the example files/calls to see some examples of this syntax. - -The default is for the program to run in 'safe mode' where all of the sequences -and V/J masks are loaded up and checked before running. This mode also has a -display (default on) that will show some number of lines of the outfile as a -visual check that the program is running properly (it also includes time, speed -and progress). These can be disabled. - -This program runs on only one type of sequence (either nucleotide or amino -acid), which will be inferred by default in safe mode. Regular expression -sequences are not accepted in this program as they can incur large time costs -(instead consider defining a custom 'amino acid' alphabet to define the symbols -used in the regular expressions if possible, or use the single sequence -function). If nucleotide sequences are read in it is possible to specify if the -output should be the nucleotide sequence Pgen and/or the translated amino acid -sequence Pgen (the default is to compute and output both). - -If not running in safe mode, sequences will be read in one by one and Pgen -computed/written to the outfile as they are read in (so only one sequence is -held in memory at a time). In this mode, it is required to specify whether the -sequences to be read in are nucleotide sequences or 'amino acid' sequences. - -As it is rare for datasets to be >> 1e4, parallelization is not built in. -However, there are options to skip N lines of the file and to load at most M -sequences so, if wanted, one could build a parallelized wrapper around this -script (though it would be recommended to instead just import the modules and -build from there). - -Required inputs: - -1) File name for input file: PATH/TO/INFILE -THIS IS ASSUMED TO BE THE FIRST ARGUMENT (see example calls) - -2) File name for output file: PATH/TO/OUTFILE -THIS IS ASSUMED TO BE THE SECOND ARGUMENT (see example calls) - -3) Generative model used to define the generation probability of a sequence. -Flags for default models: - ---humanTCRA or --human_T_alpha (default Human T cell alpha chain model) ---humanTCRB or --human_T_beta (default Human T cell beta chain model) ---mouseTCRB or --mouse_T_beta (default Mouse T cell beta chain model) ---humanIGH or --human_B_heavy (default Human B cell heavy chain model) - -In order to use these default model flags, the program must be executed from -the same directory where models/ is. For example, models/human_T_beta/ should -be an okay pathing to the humanTCRB model folder. - -To specify a custom model folder use: - ---VJ_model_folder (a generative model of VJ recombination, e.g. T alpha chain) ---VDJ_model_folder (a generative model of VDJ recombination, e.g. T beta chain) - -Note, if specifying a custom model folder for either a VJ recombination model -(e.g. an alpha or light chain model) or a VDJ recombination model -(e.g. a beta or heavy chain model), the folder must contain the following files -with the exact naming convention: - -model_params.txt (IGoR inference param file) -model_marginals.txt (IGoR inference marginal file) -V_gene_CDR3_anchors.csv (V residue anchor and functionality file) -J_gene_CDR3_anchors.csv (J residue anchor and functionality file) -------------------------------------------------------------------------------- -Example calls (example calls are formatted as executed functions instead of the -console script entry point. The arguments are identical in either case.). These -calls assume the existance of example_seqs.tsv. - -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB -./run_pgen.py example_seqs.tsv example_pgens.tsv --VDJ_model_folder models/human_T_beta/ - -------------------------------------------------------------------------------- -Options: - -h, --help show this help message and exit - --humanTCRA, --human_T_alpha - use default human TCRA model (T cell alpha chain) - --humanTCRB, --human_T_beta - use default human TCRB model (T cell beta chain) - --mouseTCRB, --mouse_T_beta - use default mouse TCRB model (T cell beta chain) - --humanIGH, --human_B_heavy - use default human IGH model (B cell heavy chain) - --VDJ_model_folder=PATH/TO/FOLDER/ - specify PATH/TO/FOLDER/ for a custom VDJ generative - model - --VJ_model_folder=PATH/TO/FOLDER/ - specify PATH/TO/FOLDER/ for a custom VJ generative - model - --seq_in_index=INDEX specifies sequences to be read in are in column INDEX. - Default is index 0 (the first column). - -v INDEX, --v_mask_index=INDEX - specifies V_masks are found in column INDEX. Default - is no V mask. - -j INDEX, --j_mask_index=INDEX - specifies J_masks are found in column INDEX. Default - is no J mask. - -m N, --max_number_of_seqs=N - compute Pgens for at most N sequences. - --lines_to_skip=N skip the first N lines of the file. Default is 0. - -a PATH/TO/FILE, --alphabet_filename=PATH/TO/FILE - specify PATH/TO/FILE defining a custom 'amino acid' - alphabet. Default is no custom alphabet. - --seq_type=SEQ_TYPE declare sequence type. Infers seq_type by default in - safe_mode. Need to declare when safe_mode is off. - Choices: 'ntseq', 'nucleotide', 'aaseq', 'amino_acid' - --seq_type_out=SEQ_TYPE - if read in sequences are ntseqs, declare what type of - sequence to compute pgen for. Default is all. Choices: - 'all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid' - --skip_off, --skip_empty_sequences_off - stop skipping empty or blank sequences/lines (if for - example you want to keep line index fidelity between - the infile and outfile). - --safe_mode_off turn safe_mode off. Default is on. - --display_seqs_off turn the sequence display off (only applies in safe - mode). Default is on. - --num_lines_for_display=N - N lines of the output file are displayed when sequence - display is on. Also used to determine the number of - sequences to average over for speed and time - estimates. - --time_updates_off turn time updates off (only applies when sequence - display is disabled). - --seqs_per_time_update=N - specify the number of sequences between time updates. - Default is 1e5. - --print_warnings_off turn Pgen print warnings off - --overwrite_on overwrites outfile without prompt when safe mode is - turned off. - -d DELIMITER, --delimiter=DELIMITER - declare infile delimiter. Default is tab for .tsv - input files, comma for .csv files, and any whitespace - for all others. Choices: 'tab', 'space', ',', ';', ':' - --raw_delimiter=DELIMITER - declare infile delimiter as a raw string. - --delimiter_out=DELIMITER_OUT - declare outfile delimiter. Default is tab for .tsv - output files, comma for .csv files, and the infile - delimiter for all others. Choices: 'tab', 'space', - ',', ';', ':' - --raw_delimiter_out=DELIMITER_OUT - declare for the delimiter outfile as a raw string. - --gene_mask_delimiter=GENE_MASK_DELIMITER - declare gene mask delimiter. Default comma unless - infile delimiter is comma, then default is a - semicolon. Choices: 'tab', 'space', ',', ';', ':' - --raw_gene_mask_delimiter=GENE_MASK_DELIMITER - declare delimiter of gene masks as a raw string. - --comment_delimiter=COMMENT_DELIMITER - character or string to indicate comment or header - lines to skip. -------------------------------------------------------------------------------- -Example calls with options - -#Only run on the first 250 seqs (all the other examples will also be restricted to 250 seqs) -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB -m 250 - -#Read in amino acid sequences (which happen to be in the column of index 1 (second column) in example_seqs.txt) -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB --seq_in_index 1 -m 250 - -#Specify the custom alphabet file expanded_alphabet_files/example_expanded_amino_acid_alphabet.txt -#(Note, to take advantage of the expanded 'amino acid' alphabet need to read in amino acid sequences -#as the nucleotide sequences always translate to the standard amino acids.) -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB -a expanded_alphabet_files/example_expanded_amino_acid_alphabet.txt -m 250 - -#Read in nucleotide sequences but only run Pgen on the translated amino acid sequences -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB --seq_type ntseq --seq_type_out aaseq -m 250 - -#Compute Pgen conditioned on the V and J gene/alleles provided in columns of index 2 and 3. -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB -v 2 -j 3 -m 250 - -#Turn the sequence display off -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB --display_seqs_off -m 250 - -#Change sequence display to 20 sequences (default is 50) -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB --num_lines_for_display 20 -m 250 - -#Skip the first 5 lines -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB --lines_to_skip 5 -m 250 - -#Safe mode off -./run_pgen.py example_seqs.tsv example_pgens.tsv --humanTCRB --seq_type ntseq --safe_mode_off -m 250 - - -@author: zacharysethna -""" - -#Function assumes that it is in the same directory that the folder app/ is -#in (which should contain all the modules imported). - -import os -import sys -import time - -import olga.load_model as load_model -import olga.generation_probability as generation_probability -from olga.utils import nt2aa -from optparse import OptionParser - - - - -def main(): - """Compute Pgens from a file and output to another file.""" - - parser = OptionParser(conflict_handler="resolve") - - parser.add_option('--humanTCRA', '--human_T_alpha', action='store_true', dest='humanTCRA', default=False, help='use default human TCRA model (T cell alpha chain)') - parser.add_option('--humanTCRB', '--human_T_beta', action='store_true', dest='humanTCRB', default=False, help='use default human TCRB model (T cell beta chain)') - parser.add_option('--mouseTCRB', '--mouse_T_beta', action='store_true', dest='mouseTCRB', default=False, help='use default mouse TCRB model (T cell beta chain)') - parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)') - parser.add_option('--VDJ_model_folder', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model') - parser.add_option('--VJ_model_folder', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model') - parser.add_option('--seq_in_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. Default is index 0 (the first column).') - parser.add_option('-v', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', help='specifies V_masks are found in column INDEX. Default is no V mask.') - parser.add_option('-j', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', help='specifies J_masks are found in column INDEX. Default is no J mask.') - parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='compute Pgens for at most N sequences.') - parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.') - parser.add_option('-a', '--alphabet_filename', dest='alphabet_filename', metavar='PATH/TO/FILE', help="specify PATH/TO/FILE defining a custom 'amino acid' alphabet. Default is no custom alphabet.") - parser.add_option('--seq_type', type='choice', dest='seq_type', choices=['ntseq', 'nucleotide', 'aaseq', 'amino_acid'], help="declare sequence type. Infers seq_type by default in safe_mode. Need to declare when safe_mode is off. Choices: 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'") - parser.add_option('--seq_type_out', type='choice',metavar='SEQ_TYPE', dest='seq_type_out', choices=['all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'], help="if read in sequences are ntseqs, declare what type of sequence to compute pgen for. Default is all. Choices: 'all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'") - parser.add_option('--skip_off','--skip_empty_sequences_off', action='store_true', dest = 'skip_empty_sequences', default=True, help='stop skipping empty or blank sequences/lines (if for example you want to keep line index fidelity between the infile and outfile).') - - parser.add_option('--safe_mode_off', action='store_false', dest='safe_mode', default=True, help='turn safe_mode off. Default is on.') - parser.add_option('--display_seqs_off', action='store_false', dest='display_seqs', default=True, help='turn the sequence display off (only applies in safe mode). Default is on.') - parser.add_option('--num_lines_for_display', type='int', metavar='N', default = 50, dest='num_lines_for_display', help='N lines of the output file are displayed when sequence display is on. Also used to determine the number of sequences to average over for speed and time estimates.') - parser.add_option('--time_updates_off', action='store_false', dest='time_updates', default=True, help='turn time updates off (only applies when sequence display is disabled).') - parser.add_option('--seqs_per_time_update', type='float', metavar='N', default = 100, dest='seqs_per_time_update', help='specify the number of sequences between time updates. Default is 1e5.') - parser.add_option('--print_warnings_off', action='store_false', dest="print_warnings", default=True, help='turn Pgen print warnings off') - parser.add_option('--overwrite_on', action='store_true', dest="overwrite", default=False, help='overwrites outfile without prompt when safe mode is turned off.') - - parser.add_option('-d', '--delimiter', type='choice', dest='delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'") - parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.") - parser.add_option('--delimiter_out', type='choice', dest='delimiter_out', choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'") - parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare for the delimiter outfile as a raw string.") - parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default comma unless infile delimiter is comma, then default is a semicolon. Choices: 'tab', 'space', ',', ';', ':'") - parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.") - parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.") - - - (options, args) = parser.parse_args() - - #INFILE IS THE FIRST ARGUMENT - try: - infile_name = args[0] - except IndexError: - print 'Need to specify PATH/TO/INFILE as the first argument!' - print 'Exiting...' - return -1 - - if len(infile_name.strip()) == 0: - print 'Need to specify PATH/TO/INFILE as the first argument!' - print 'Exiting...' - return -1 - - if not os.path.isfile(infile_name): - print 'Cannot find file: ' + infile_name - print 'Exiting...' - return -1 - - main_folder = os.path.dirname(__file__) - - default_models = {} - default_models['humanTCRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'] - default_models['humanTCRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'] - default_models['mouseTCRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'] - default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'] - - num_models_specified = sum([1 for x in default_models.keys() + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)]) - - if num_models_specified == 1: #exactly one model specified - try: - d_model = [x for x in default_models.keys() if getattr(options, x)][0] - model_folder = default_models[d_model][0] - recomb_type = default_models[d_model][1] - except IndexError: - if options.vdj_model_folder: #custom VDJ model specified - model_folder = options.vdj_model_folder - recomb_type = 'VDJ' - elif options.vj_model_folder: #custom VJ model specified - model_folder = options.vj_model_folder - recomb_type = 'VJ' - elif num_models_specified == 0: - print 'Need to indicate generative model.' - print 'Exiting...' - return -1 - elif num_models_specified > 1: - print 'Only specify one model' - print 'Exiting...' - return -1 - - #Check that all model and genomic files exist in the indicated model folder - if not os.path.isdir(model_folder): - print 'Check pathing... cannot find the model folder: ' + model_folder - print 'Exiting...' - return -1 - - params_file_name = os.path.join(model_folder,'model_params.txt') - marginals_file_name = os.path.join(model_folder,'model_marginals.txt') - V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv') - J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv') - - for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]: - if not os.path.isfile(x): - print 'Cannot find: ' + x - print 'Please check the files (and naming conventions) in the model folder ' + model_folder - print 'Exiting...' - return -1 - - - safe_mode = options.safe_mode - overwrite = options.overwrite - #OUTFILE IS THE SECOND ARGUMENT - try: - outfile_name = args[1] - except IndexError: - print 'Need to specify PATH/TO/OUTFILE as the second argument!' - print 'Exiting...' - return -1 - - if len(outfile_name.strip()) == 0: - print 'Need to specify PATH/TO/OUTFILE as the second argument!' - print 'Exiting...' - return -1 - - if os.path.isfile(outfile_name): - if safe_mode: - if not raw_input(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']: - print 'Exiting...' - return -1 - else: - if not overwrite: - print outfile_name + ' already exists! To overwrite use --overwrite_on' - print 'Exiting...' - return -1 - - - - - #Parse delimiter - delimiter = options.delimiter - if delimiter is None: #Default case - if infile_name.endswith('.tsv'): #parse TAB separated value file - delimiter = '\t' - elif infile_name.endswith('.csv'): #parse COMMA separated value file - delimiter = ',' - else: - try: - delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter] - except KeyError: - pass #Other string passed as the delimiter. - - #Parse delimiter_out - delimiter_out = options.delimiter_out - if delimiter_out is None: #Default case - if delimiter is None: - delimiter_out = '\t' - else: - delimiter_out = delimiter - if outfile_name.endswith('.tsv'): #output TAB separated value file - delimiter_out = '\t' - elif outfile_name.endswith('.csv'): #output COMMA separated value file - delimiter_out = ',' - else: - try: - delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out] - except KeyError: - pass #Other string passed as the delimiter. - - #Parse gene_delimiter - gene_mask_delimiter = options.gene_mask_delimiter - if gene_mask_delimiter is None: #Default case - gene_mask_delimiter = ',' - if delimiter == ',': - gene_mask_delimiter = ';' - else: - try: - gene_mask_delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[gene_mask_delimiter] - except KeyError: - pass #Other string passed as the delimiter. - - - #More options - alphabet_filename = options.alphabet_filename #used if a custom alphabet is to be specified - seq_type_in = options.seq_type - print_warnings = options.print_warnings - time_updates = options.time_updates - display_seqs = options.display_seqs - num_lines_for_display = options.num_lines_for_display - seq_type_out = options.seq_type_out #type of pgens to be computed. Can be ntseq, aaseq, or both - seq_in_index = options.seq_in_index #where in the line the sequence is after line.split(delimiter) - lines_to_skip = options.lines_to_skip #one method of skipping header - comment_delimiter = options.comment_delimiter #another method of skipping header - seqs_per_time_update = options.seqs_per_time_update - max_number_of_seqs = options.max_number_of_seqs - V_mask_index = options.V_mask_index #Default is not conditioning on V identity - J_mask_index = options.J_mask_index #Default is not conditioning on J identity - skip_empty_sequences = options.skip_empty_sequences - - - #Set seq_types - if seq_type_in is not None: - seq_type_in = {'ntseq': 'ntseq', 'nucleotide': 'ntseq', 'aaseq': 'aaseq', 'amino_acid': 'aaseq'}[seq_type_in] - if seq_type_out is not None: - seq_type_out = {'all': None, 'ntseq': 'ntseq', 'nucleotide': 'ntseq', 'aaseq': 'aaseq', 'amino_acid': 'aaseq'}[seq_type_out] - - - if seq_type_in == 'aaseq' and seq_type_out == 'ntseq': - print "Can't compute nucleotide Pgen from an amino acid sequence (seq_type_in = aaseq, seq_type_out = ntseq)" - print 'Exiting...' - return -1 - - - #Load up model based on recomb_type - #VDJ recomb case --- used for TCRB and IGH - if recomb_type == 'VDJ': - genomic_data = load_model.GenomicDataVDJ() - genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) - generative_model = load_model.GenerativeModelVDJ() - generative_model.load_and_process_igor_model(marginals_file_name) - pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data, alphabet_filename) - #VJ recomb case --- used for TCRA and light chain - elif recomb_type == 'VJ': - genomic_data = load_model.GenomicDataVJ() - genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) - generative_model = load_model.GenerativeModelVJ() - generative_model.load_and_process_igor_model(marginals_file_name) - pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data, alphabet_filename) - - #SAFE MODE - if safe_mode: - seqs = [] - V_usage_masks = [] - J_usage_masks = [] - - infile = open(infile_name, 'r') - - for i, line in enumerate(infile): - if comment_delimiter is not None: #Default case -- no comments/header delimiter - if line.startswith(comment_delimiter): #allow comments - continue - if i < lines_to_skip: - continue - - if delimiter is None: #Default delimiter is just any whitespace - split_line = line.split() - else: - split_line = line.split(delimiter) - - #Find seq - try: - seq = split_line[seq_in_index].strip() - if skip_empty_sequences and len(seq.strip()) == 0: - continue - if seq_type_in == 'aaseq': - #check seq is composed only of symbols that are in the codons_dict - if all([x in pgen_model.codons_dict.keys() for x in seq]): - seqs.append(seq) - else: - print seq + " is not a CDR3 'amino acid' sequence composed exclusively of recognized symbols" - print 'Unrecognized symbols: ' + ', '.join([x for x in seq if not x in pgen_model.codons_dict.keys()]) - print 'Exiting...' - infile.close() - return -1 - elif seq_type_in == 'ntseq': - #check that seq is only composed of nucleotides - if all([x in 'ACGTacgt' for x in seq]): - if len(seq)%3 == 0: - seqs.append(seq) - else: - #Out of frame sequence -- print warning. - print 'WARNING: ' + seq + ' is out of frame --- cannot run Pgen for it (will output 0).' - seqs.append(seq) - else: - print seq + " is not a CDR3 nucleotide sequence composed exclusively of A, C, G, or T." - print 'Unrecognized symbols: ' + ', '.join([x for x in seq if not x in 'ACGTacgt']) - print 'Exiting...' - infile.close() - return -1 - else: #Will infer seq_type_in and vet sequences later - seqs.append(seq) - - except IndexError: #no index match for seq - if skip_empty_sequences: - if len(line.strip()) == 0: - continue - print 'seq_in_index is out of range' - print 'Exiting...' - infile.close() - return -1 - - #Find and format V_usage_mask - if V_mask_index is None: - V_usage_masks.append(None) #default mask - else: - try: - V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter) - #check that all V gene/allele names are recognized - if all([v in pgen_model.V_mask_mapping for v in V_usage_mask]): - V_usage_masks.append(V_usage_mask) - else: - print str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names" - print 'Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if not v in pgen_model.V_mask_mapping.keys()]) - print 'Exiting...' - infile.close() - return -1 - except IndexError: #no index match for V_mask_index - print 'V_mask_index is out of range' - print 'Exiting...' - infile.close() - return -1 - - #Find and format J_usage_mask - if J_mask_index is None: - J_usage_masks.append(None) #default mask - else: - try: - J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter) - #check that all V gene/allele names are recognized - if all([j in pgen_model.J_mask_mapping for j in J_usage_mask]): - J_usage_masks.append(J_usage_mask) - else: - print str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names" - print 'Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if not j in pgen_model.J_mask_mapping.keys()]) - print 'Exiting...' - infile.close() - return -1 - except IndexError: #no index match for J_mask_index - print 'J_mask_index is out of range' - print 'Exiting...' - infile.close() - return -1 - - if max_number_of_seqs is not None: - if len(seqs) >= max_number_of_seqs: - break - - infile.close() - - if seq_type_in == 'aaseq': #Check if the 'aaseqs' read in are actually 'ntseqs' - all_symbols_nts = True - for seq in seqs: - if not all([x in 'ACGTacgt' for x in seq]): - all_symbols_nts = False - break - if all_symbols_nts: - print 'It looks like the sequences read in are all nucleotide sequences composed of A, C, G, or T.' - if raw_input('Change seq_type_in to nucleotide? (y/n)? ') in ['y', 'yes']: - seq_type_in = 'ntseq' - else: - print 'Okay... you are the boss... chances are all the Pgens will be 0.' - - elif seq_type_in is None: #no specified seq_type_in --- infer if possible - ntseq_poss = True - aaseq_poss = True - ntseq_found = False - for seq in seqs: - if len(seq) == 0: #skip empty entry - continue - if all([x in 'ACGTacgt' for x in seq]): #ntseq - ntseq_found = True - elif all([x in pgen_model.codons_dict.keys() for x in seq]): #aaseq that CAN'T be ntseq - ntseq_poss = False - else: #Can't be either a ntseq or aaseq - ntseq_poss = False - aaseq_poss = False - break - - if ntseq_poss: #If nucleotide sequence is possible set seq_type_in to nucleotide - seq_type_in = 'ntseq' - elif aaseq_poss: - if not ntseq_found: #No sequences that appear to be nucleotide sequences found - seq_type_in = 'aaseq' - else: - print "It looks like the some sequences are 'amino acid' sequences, while others are nucleotide sequences." - if raw_input('Compute amino acid Pgen for all sequences (including nucleotide sequences)? y/n (if no will break)? ') in ['y', 'yes']: - seq_type_in = 'aaseq' - else: - print 'Exiting...' - return -1 - else: - print 'Sequence ' + seq + " is unrecognized as either an 'amino acid' sequence or a nucleotide sequence." - print 'Exiting...' - return -1 - - if seq_type_in == 'aaseq' and seq_type_out == 'ntseq': - print "Can't compute nucleotide Pgen from an amino acid sequence (seq_type_in = aaseq, seq_type_out = ntseq)" - print 'Exiting...' - return -1 - - print 'Successfully read in and formatted ' + str(len(seqs)) + ' sequences and any V or J usages.' - if display_seqs: - print_warnings = False #Can't print warnings with display on - sys.stdout.write('\r'+'Continuing to Pgen computation in 3... ') - sys.stdout.flush() - time.sleep(0.4) - sys.stdout.write('\r'+'Continuing to Pgen computation in 2... ') - sys.stdout.flush() - time.sleep(0.4) - sys.stdout.write('\r'+'Continuing to Pgen computation in 1... ') - sys.stdout.flush() - time.sleep(0.4) - else: - print 'Continuing to Pgen computation.' - - if display_seqs: - lines_for_display = [] - times_for_speed_calc = [time.time()] - - outfile = open(outfile_name, 'w') - start_time = time.time() - for i, seq in enumerate(seqs): - if seq_type_in == 'aaseq': - aaseq = seq - #Compute Pgen and print out - c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) - elif seq_type_in == 'ntseq': - ntseq = seq - if len(ntseq) % 3 == 0: #inframe sequence - aaseq = nt2aa(ntseq) - - #Compute Pgen and print out based on recomb_type and seq_type_out - if seq_type_out is None: - c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + delimiter_out + aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) - elif seq_type_out == 'ntseq': - c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) - elif seq_type_out == 'aaseq': - c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) - - else: #out of frame sequence -- Pgens are 0 and use 'out_of_frame' for aaseq - if seq_type_out is None: - c_pgen_line = ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0' - elif seq_type_out == 'ntseq': - c_pgen_line = ntseq + delimiter_out + '0' - elif seq_type_out == 'aaseq': - c_pgen_line = 'out_of_frame' + delimiter_out + '0' - - outfile.write(c_pgen_line + '\n') - - #Print time update - if display_seqs: - cc_time = time.time() - c_time = cc_time - start_time - times_for_speed_calc = [cc_time] + times_for_speed_calc[:num_lines_for_display] - c_avg_speed = (len(times_for_speed_calc)-1)/float(times_for_speed_calc[0] - times_for_speed_calc[-1]) - - #eta = ((len(seqs) - (i+1))/float(i+1))*c_time - - eta = (len(seqs) - (i+1))/c_avg_speed - - lines_for_display = [c_pgen_line] + lines_for_display[:num_lines_for_display] - - - c_time_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(c_time)/3600).rjust(3), repr((int(c_time)/60)%60).rjust(2), repr(int(c_time)%60).rjust(2)) - eta_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(eta)/3600).rjust(3), repr((int(eta)/60)%60).rjust(2), repr(int(eta)%60).rjust(2)) - time_str = 'Time to compute Pgen on %s seqs: %s \nEst. time for remaining %s seqs: %s'%(repr(i+1).rjust(9), c_time_str, repr(len(seqs) - (i + 1)).rjust(9), eta_str) - speed_str = 'Current Pgen computation speed: %s seqs/min'%(repr(round((len(times_for_speed_calc)-1)*60/float(times_for_speed_calc[0] - times_for_speed_calc[-1]), 2)).rjust(8)) - display_str = '\n'.join(lines_for_display[::-1]) + '\n' + '-'*80 + '\n' + time_str + '\n' + speed_str + '\n' + '-'*80 - print '\033[2J' + display_str - elif (i+1)%seqs_per_time_update == 0 and time_updates: - c_time = time.time() - start_time - eta = ((len(seqs) - (i+1))/float(i+1))*c_time - if c_time > 86400: #more than a day - c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 3600: #more than an hr - c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 60: #more than a min - c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) - else: - c_time_str = '%.2f seconds.'%(c_time) - - if eta > 86400: #more than a day - eta_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(eta)/86400, (int(eta)/3600)%24, (int(eta)/60)%60, eta%60) - elif eta > 3600: #more than an hr - eta_str = '%d hours, %d minutes, and %.2f seconds.'%((int(eta)/3600)%24, (int(eta)/60)%60, eta%60) - elif eta > 60: #more than a min - eta_str = '%d minutes and %.2f seconds.'%((int(eta)/60)%60, eta%60) - else: - eta_str = '%.2f seconds.'%(eta) - - print 'Pgen computed for %d sequences in: %s Estimated time remaining: %s'%(i+1, c_time_str, eta_str) - - c_time = time.time() - start_time - if c_time > 86400: #more than a day - c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 3600: #more than an hr - c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 60: #more than a min - c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) - else: - c_time_str = '%.2f seconds.'%(c_time) - print 'Completed Pgen computation for %d sequences: in %s'%(len(seqs), c_time_str) - outfile.close() - - - #Non-safe mode - else: - print 'Starting Pgen computation with safe mode disabled.' - pgens_computed = 0 - - infile = open(infile_name, 'r') - outfile = open(outfile_name, 'w') - - start_time = time.time() - for i, line in enumerate(infile): - if comment_delimiter is not None: #Default case -- no comments/header delimiter - if line.startswith(comment_delimiter): #allow comments - continue - if i < lines_to_skip: - continue - - if delimiter is None: #Default delimiter is just any whitespace - split_line = line.split() - else: - split_line = line.split(delimiter) - - #Find seq - try: - seq = split_line[seq_in_index].strip() - if skip_empty_sequences and len(seq.strip()) == 0: - continue - except IndexError: #no index match for seq - if skip_empty_sequences: - if len(line.strip()) == 0: - continue - print 'seq_in_index is out of range' - print 'Exiting...' - infile.close() - outfile.close() - return -1 - #Find and format V_usage_mask - if V_mask_index is None: - V_usage_mask = None #default mask - else: - try: - V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter) - except IndexError: #no index match for V_mask_index - print 'V_mask_index is out of range' - print 'Exiting...' - infile.close() - outfile.close() - return -1 - - #Find and format J_usage_mask - if J_mask_index is None: - J_usage_mask = None #default mask - else: - try: - J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter) - except IndexError: #no index match for J_mask_index - print 'J_mask_index is out of range' - print 'Exiting...' - infile.close() - outfile.close() - return -1 - - if seq_type_in == 'aaseq': - aaseq = seq - #Compute Pgen and print out - outfile.write(aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_mask, J_usage_mask, print_warnings)) + '\n') - elif seq_type_in == 'ntseq': - ntseq = seq - aaseq = nt2aa(ntseq) - #Compute Pgen and print out based on recomb_type and seq_type_out - if seq_type_out == None: - outfile.write(ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_mask, J_usage_mask, print_warnings)) + delimiter_out + aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_mask, J_usage_mask, print_warnings)) + '\n') - elif seq_type_out == 'ntseq': - outfile.write(ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_mask, J_usage_mask, print_warnings)) + '\n') - elif seq_type_out == 'aaseq': - outfile.write(aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_mask, J_usage_mask, print_warnings)) + '\n') - - pgens_computed += 1 - #Print time update - if (pgens_computed)%seqs_per_time_update == 0 and time_updates: - c_time = time.time() - start_time - if c_time > 86400: #more than a day - c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 3600: #more than an hr - c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 60: #more than a min - c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) - else: - c_time_str = '%.2f seconds.'%(c_time) - - - print 'Pgen computed for %d sequences in %s'%(pgens_computed, c_time_str) - - if max_number_of_seqs is not None: - if pgens_computed >= max_number_of_seqs: - break - - c_time = time.time() - start_time - if c_time > 86400: #more than a day - c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 3600: #more than an hr - c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) - elif c_time > 60: #more than a min - c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) - else: - c_time_str = '%.2f seconds.'%(c_time) - print 'Completed Pgen computation for %d sequences in %s'%(pgens_computed, c_time_str) - - infile.close() - outfile.close() - -if __name__ == '__main__': main() diff --git a/olga/utils.py b/olga/utils.py index e299be3..93b1dca 100644 --- a/olga/utils.py +++ b/olga/utils.py @@ -302,6 +302,40 @@ def generate_sub_codons_right(codons_dict): return sub_codons_right +def determine_seq_type(seq, aa_alphabet): + """Determine the type of a sequence. + + Parameters + ---------- + + seq : str + Sequence to be typed. + aa_alphabet : str + String of all characters recoginized as 'amino acids'. (i.e. the keys + of codons_dict: aa_alphabet = ''.join(codons_dict.keys()) ) + + Returns + ------- + seq_type : str + The type of sequence (ntseq, aaseq, regex, None) seq is. + + Example + -------- + >>> determine_seq_type('TGTGCCAGCAGTTCCGAAGGGGCGGGAGGGCCCTCCCTGAGAGGTCATGAGCAGTTCTTC', aa_alphabet) + 'ntseq' + >>> determine_seq_type('CSARDX[TV]GNX{0,}', aa_alphabet) + 'regex + + """ + + if all([x in 'ACGTacgt' for x in seq]): + return 'ntseq' + elif all([x in aa_alphabet for x in seq]): + return 'aaseq' + elif all([x in aa_alphabet + '[]{}0123456789,']): + return 'regex' + +#%% #If using the steady-state distribution for first nucleotide probabilities we include a function to compute it def calc_steady_state_dist(R): """Calculate the steady state dist of a 4 state markov transition matrix. diff --git a/setup.py b/setup.py index 89bc35f..389af1b 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ def readme(): data_files_to_include = [('', ['README.md', 'LICENSE', 'example_expanded_amino_acid_alphabet.txt'])] setup(name='olga', - version='0.1.0', + version='1.0.0', description='Compute generation probability of CDR3 sequences', long_description=readme(), url='https://github.com/zsethna/OLGA', @@ -37,8 +37,7 @@ def readme(): data_files = data_files_to_include, include_package_data=True, entry_points = {'console_scripts': [ - 'olga-compute_single_sequence_pgen=olga.compute_single_sequence_pgen:main', - 'olga-run_pgen=olga.run_pgen:main', - 'olga-generate_synthetic_sequences=olga.generate_synthetic_sequences:main' + 'olga-compute_pgen=olga.compute_pgen:main', + 'olga-generate_sequences=olga.generate_sequences:main' ], }, zip_safe=False)