#### This Jupyter notebook demonstrates how to use the functions in `biosynseq.generate` to generate nucleotide sequences in FASTA format and to obtain sequence embeddings from the resulting FASTA file.

In [1]:
from biosynseq import generate
from gene_transformer.config import ModelSettings
from gene_transformer.model import LoadPTCheckpointStrategy

  from .autonotebook import tqdm as notebook_tqdm


# Part 1: Generate a FASTA file

In [2]:
# How many nucleotide sequences to generate (default is 5).
num_seqs = 7

# Model definition: YAML settings file and the .pt checkpoint that goes with it.
pt_path = "/homes/mzvyagin/gpt2_mdh_example/gpt2_earnest_river_122_mdh.pt"
yaml_path = "/homes/lind/MDH-pipeline/mdh_gpt.yaml"

# FASTA file path handed to generate.generate_fasta in the generation cell.
fasta_path = "/homes/lind/MDH-pipeline/fasta/fasta_test2.fasta"

In [3]:
# Build the model settings object from the YAML file at `yaml_path`.
config = ModelSettings.from_yaml(yaml_path)

# Given a .pt checkpoint file: pair the settings with the checkpoint path in a
# loading strategy, which is then passed to generate.generate_fasta.
model_strategy = LoadPTCheckpointStrategy(config, pt_path)

In [4]:
# Generate `num_seqs` sequences with the loaded model, using `fasta_path` as the
# FASTA file location (presumably the file is written there -- confirm).
results = generate.generate_fasta(
    model_strategy=model_strategy,
    fasta_path=fasta_path,
    num_seqs=num_seqs,
)

# The printed dict has the keys 'unique_seqs', 'all_generated_seqs' and
# 'seconds_elapsed'.
print(results)

  rank_zero_warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for

{'unique_seqs': ['ATGCCCCTCTTGCGTGACGTGCCCCTGTGCGATATGTTATAA', 'ATGCCACAGGGTCAGTTGTCCTATGTCAAAGAGCGTCAAGAAGATATCGGTCAGCGCTAG', 'ATGCAGATGGTATAG', 'ATGCTTCGAAGCCCCTGA', 'ATGAGCTGA', 'ATGGAACTGATTCTGAAACCGCTCCCATTGGGTACGTATCATGTTAGCCCTCGGCCAATCTGA', 'ATGCTGCCCCGCATCTTGACCACTCGTTTCGGCCCGCAATCAGAAGTGCTCGGTAGACCCGGCGGTCACGGGTGGTCCGTAGTAATCTACTGA'], 'all_generated_seqs': ['ATGCCCCTCTTGCGTGACGTGCCCCTGTGCGATATGTTATAA', 'ATGCTGCCCCGCATCTTGACCACTCGTTTCGGCCCGCAATCAGAAGTGCTCGGTAGACCCGGCGGTCACGGGTGGTCCGTAGTAATCTACTGA', 'ATGCCACAGGGTCAGTTGTCCTATGTCAAAGAGCGTCAAGAAGATATCGGTCAGCGCTAG', 'ATGGAACTGATTCTGAAACCGCTCCCATTGGGTACGTATCATGTTAGCCCTCGGCCAATCTGA', 'ATGCAGATGGTATAG', 'ATGAGCTGA', 'ATGCTTCGAAGCCCCTGA'], 'seconds_elapsed': 27.565093517303467}


# Part 2: Get sequence embeddings from the FASTA file

In [5]:
# Model definition: YAML settings file and the matching .pt checkpoint.
pt_path = "/homes/mzvyagin/gpt2_mdh_example/gpt2_earnest_river_122_mdh.pt"
yaml_path = "/homes/lind/MDH-pipeline/mdh_gpt.yaml"

# Input FASTA file and the .npy path passed along for the embeddings output.
fasta_path = "/homes/lind/MDH-pipeline/fasta/fasta_test2.fasta"
embeddings_output_path = "/homes/lind/MDH-pipeline/embeddings/embeddings_test.npy"

In [6]:
# Build the model settings object from the YAML file at `yaml_path`.
config = ModelSettings.from_yaml(yaml_path)

# Given a .pt checkpoint file: pair the settings with the checkpoint path in a
# loading strategy, which is then passed to generate.fasta_to_embeddings.
model_strategy = LoadPTCheckpointStrategy(config, pt_path)

In [7]:
# Compute embeddings for the sequences in the FASTA file. The output path is
# passed as the third argument (presumably where the .npy is saved -- confirm).
embeddings = generate.fasta_to_embeddings(
    model_strategy,
    fasta_path,
    embeddings_output_path,
)

# The run recorded below reports an embeddings shape of (7, 1024, 768).
print(embeddings)

Running inference with dataset length 7


  0%|                                    | 0/7 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

100%|████████████████████████████| 7/7 [00:00<00:00, 13.11it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)





Embeddings shape: (7, 1024, 768)
[[[ 0.04868069 -0.01195381 -0.00645662 ... -0.03122215  0.01094359
   -0.0035614 ]
  [ 0.         -0.03224799  0.         ... -0.00355151  0.02965949
    0.00884153]
  [ 0.02999547 -0.00865247  0.04809479 ...  0.02695912  0.05683281
    0.01760943]
  ...
  [ 0.00519243  0.01638022  0.         ...  0.07288225 -0.03332997
    0.01813556]
  [-0.00391198  0.00714018 -0.02851456 ...  0.0271398  -0.01800548
    0.        ]
  [ 0.05963241 -0.01018279 -0.02210587 ... -0.06037784 -0.04219067
    0.01604172]]

 [[ 0.04868069 -0.01195381 -0.00645662 ... -0.03122215  0.01094359
   -0.0035614 ]
  [ 0.02008249 -0.07793572 -0.0165618  ... -0.01285611  0.08464704
   -0.02021168]
  [ 0.01411847  0.          0.06301758 ...  0.06581816  0.04579304
   -0.0272202 ]
  ...
  [ 0.00519243  0.01638022 -0.04830012 ...  0.07288225 -0.03332997
    0.01813556]
  [-0.00391198  0.00714018 -0.02851456 ...  0.0271398  -0.01800548
    0.01944104]
  [ 0.05963241 -0.01018279 -0.02210587 .