Version 0.9.0 #92

Merged: 11 commits, Jun 16, 2024
20 changes: 20 additions & 0 deletions Changelog.md
@@ -1,5 +1,25 @@
# Biotrainer Changelog

## 09.06.2024 - Version 0.9.0
### Maintenance
* Adding more extensive code documentation
* Optimizing imports
* Applying consistent file naming
* Updating dependencies. Note that `jupyter` was removed as a direct optional dependency.
You can always add it via `poetry add jupyter`.
* Adding simple differentiation between T5 and ESM tokenizers and models in the `embedders` module

### Features
* Adding new `residues_to_value` protocol.
Similar to the `residues_to_class` protocol, it predicts a single value for each sequence, using per-residue
embeddings. In some situations, it might outperform the `sequence_to_value` protocol (see the sketch below).
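A minimal usage sketch for the new protocol (hedged: the option keys `protocol`, `sequence_file` and `embedder_name` are assumed from this PR's option classes, not quoted from the diff):

```python
# Hedged sketch: selecting the new residues_to_value protocol via a config dict.
# Keys are assumptions based on the SequenceFile/EmbedderName option classes;
# see docs/config_file_options.md for the authoritative names.
from biotrainer.config import Configurator

config = {
    "protocol": "residues_to_value",
    "sequence_file": "sequences.fasta",   # assumed key for the sequence input
    "embedder_name": "one_hot_encoding",  # one of the predefined embedders
}
configurator = Configurator(config_dict=config)
verified = configurator.get_verified_config(ignore_file_checks=True)
```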

### Bug fixes
* For `huggingface_transformer_embedder.py`, all special tokens are now always deleted from the final embedding
(e.g. first and last token for ESM-1b, last token for T5); see the sketch below.
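The idea behind the fix, as a standalone sketch (not the exact code from `huggingface_transformer_embedder.py`): drop every position the tokenizer marked as a special token before returning the per-residue embedding.

```python
# Hedged sketch of special-token removal from a per-residue embedding.
import torch

def strip_special_tokens(embedding: torch.Tensor, special_tokens_mask: list) -> torch.Tensor:
    # Keep only the positions that correspond to real residues (mask value 0).
    keep = [i for i, is_special in enumerate(special_tokens_mask) if not is_special]
    return embedding[keep]

# ESM-1b adds a token at both ends, T5 only appends one at the end:
esm_embedding = torch.randn(7, 4)  # 5 residues + 2 special tokens
print(strip_special_tokens(esm_embedding, [1, 0, 0, 0, 0, 0, 1]).shape)  # torch.Size([5, 4])
```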


## 04.06.2024 - Version 0.8.4
### Maintenance
* Updating dependencies
11 changes: 5 additions & 6 deletions README.md
@@ -24,8 +24,9 @@ C=number of classes (e.g. 13)

- residue_to_class --> Predict a class C for each residue encoded in D dimensions in a sequence of length L. Input BxLxD --> output BxLxC
- residues_to_class --> Predict a class C for all residues encoded in D dimensions in a sequence of length L. Input BxLxD --> output BxC
- residues_to_value --> Predict a value V for all residues encoded in D dimensions in a sequence of length L. Input BxLxD --> output Bx1
- sequence_to_class --> Predict a class C for each sequence encoded in a fixed dimension D. Input BxD --> output BxC
- sequence_to_value --> Predict a value V for each sequence encoded in a fixed dimension D. Input BxD --> output BxV
- sequence_to_value --> Predict a value V for each sequence encoded in a fixed dimension D. Input BxD --> output Bx1
```
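To make the new protocol's shapes concrete, a toy illustration with B=2, L=5, D=8, C=13:

```python
# Illustrative tensor shapes; residues_to_class and residues_to_value share
# the same per-residue input but differ in the output head.
import torch

B, L, D, C = 2, 5, 8, 13
per_residue_embeddings = torch.randn(B, L, D)  # input: BxLxD
class_logits = torch.randn(B, C)               # residues_to_class output: BxC
regression_values = torch.randn(B, 1)          # residues_to_value output: Bx1
assert regression_values.shape == (B, 1)
```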

### Input file standardization
@@ -86,17 +87,15 @@ as described in the [configuration options](docs/config_file_options.md#embedding_options)

1. Make sure you have [poetry](https://python-poetry.org/) installed:
```bash
curl -sSL https://install.python-poetry.org/ | python3 - --version 1.4.2
curl -sSL https://install.python-poetry.org/ | python3 -
```

2. Install dependencies and biotrainer via `poetry`:
```bash
# In the base directory:
poetry install
# Optional: Add jupyter to use notebooks
poetry install --extras "jupyter"
# You can also install all extras at once
poetry install --all-extras
# Adding jupyter notebook (if needed):
poetry add jupyter
```

## Running
2 changes: 1 addition & 1 deletion biotrainer/config/__init__.py
@@ -1,5 +1,5 @@
from .configurator import Configurator
from .config_option import ConfigurationException
from .configurator import Configurator

__all__ = [
Configurator,
14 changes: 9 additions & 5 deletions biotrainer/config/config_option.py
@@ -1,14 +1,13 @@
from __future__ import annotations

import os
import logging
import os
import shutil

from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Union, Any, Dict
from urllib import request
from urllib.parse import urlparse
from abc import ABC, abstractmethod
from typing import List, Union, Any

from ..protocols import Protocol

@@ -60,6 +59,11 @@ def required(self) -> bool:
return False

def is_list_option(self) -> bool:
"""
For hyperparameter optimization, some values can be provided as a list format.

:return: True if option is provided as a list type for hyperparameter optimization, else False
"""
return ("range" in str(self.value) or type(self.value) is list or
(type(self.value) is str and "[" in self.value and "]" in self.value))
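To make the detection concrete, here is a standalone mirror of the check above (`looks_like_list_option` is a hypothetical name; the logic is copied from `is_list_option`):

```python
# Value formats that count as hyperparameter-search lists.
def looks_like_list_option(value) -> bool:
    return ("range" in str(value)
            or type(value) is list
            or (type(value) is str and "[" in value and "]" in value))

assert looks_like_list_option("range(0.01, 0.1, 0.01)")  # range expression
assert looks_like_list_option([16, 32, 64])              # plain list
assert looks_like_list_option("[0.1, 0.2]")              # list written as a string
assert not looks_like_list_option(0.001)                 # scalar: no optimization
```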

@@ -96,7 +100,7 @@ def category(self) -> str:
def transform_value_if_necessary(self, config_file_path: Path = None):
pass

def to_dict(self):
def to_dict(self) -> Dict[str, Any]:
return {"name": str(self.name),
"category": str(self.category),
"required": str(self.required),
17 changes: 14 additions & 3 deletions biotrainer/config/config_rules.py
@@ -1,8 +1,7 @@
from abc import ABC, abstractmethod
from typing import List, Type, Union, Tuple, Any
from typing import List, Union, Tuple, Any

from .config_option import ConfigOption, FileOption
from .cross_validation_options import Method
from .config_option import ConfigOption
from ..protocols import Protocol


@@ -21,6 +20,9 @@ def apply(self, protocol: Protocol, config: List, ignore_file_checks: bool) -> T


class MutualExclusive(ConfigRule):
"""
ConfigRule to declare that two or more config options are mutually exclusive.
"""

def __init__(self, exclusive: List, allowed_values: List[str] = None, error_message: str = ""):
if allowed_values is None:
@@ -48,6 +50,9 @@ def apply(self, protocol: Protocol, config: List[ConfigOption], ignore_file_chec


class ProtocolRequires(ConfigRule):
"""
ConfigRule to declare that a protocol requires certain options (for example, file input).
"""

def __init__(self, protocol: Union[Protocol, List[Protocol]], requires: List):
if type(protocol) == Protocol:
@@ -72,6 +77,9 @@ def apply(self, protocol: Protocol, config: List, ignore_file_checks: bool) -> T


class OptionValueRequires(ConfigRule):
"""
ConfigRule to declare that a certain value of an option requires one or more other options in order to be valid
"""

def __init__(self, option: Any, value: Any, requires: List):
self._option = option
@@ -95,6 +103,9 @@ def apply(self, protocol: Protocol, config: List, ignore_file_checks: bool) -> T


class AllowHyperparameterOptimization(ConfigRule):
"""
ConfigRule to state which config options allow hyperparameter optimization
"""

def __init__(self, option: Any, value: Any):
self._option = option
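Taken together, a hedged sketch of how such rules can be declared (the real rule set lives in `biotrainer/config/configurator.py`; the specific pairings below are illustrative, not quoted from this PR):

```python
# Illustrative rule declarations mirroring the constructors shown above.
from biotrainer.config.config_rules import MutualExclusive, ProtocolRequires
from biotrainer.config.input_options import SequenceFile
from biotrainer.config.training_options import AutoResume, PretrainedModel
from biotrainer.protocols import Protocol

example_rules = [
    # The new residues_to_value protocol needs a sequence file as input:
    ProtocolRequires(protocol=Protocol.residues_to_value, requires=[SequenceFile]),
    # Assumed example: resuming a run and loading a pretrained model are exclusive:
    MutualExclusive(exclusive=[AutoResume, PretrainedModel],
                    error_message="Use either auto_resume or pretrained_model."),
]
```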
35 changes: 26 additions & 9 deletions biotrainer/config/configurator.py
@@ -1,22 +1,21 @@
import os
from pathlib import Path
from typing import Union, List, Dict, Any, Tuple

from ruamel import yaml
from pathlib import Path
from ruamel.yaml import YAMLError
from typing import Union, List, Dict, Any, Tuple

from .model_options import model_options
from .general_options import general_options
from .input_options import SequenceFile, LabelsFile, input_options
from .config_option import ConfigurationException, ConfigOption, FileOption
from .training_options import AutoResume, PretrainedModel, training_options
from .embedding_options import EmbedderName, EmbeddingsFile, embedding_options
from .config_rules import (MutualExclusive, ProtocolRequires, OptionValueRequires,
AllowHyperparameterOptimization)
from .cross_validation_options import (cross_validation_options, CROSS_VALIDATION_CONFIG_KEY, Method, ChooseBy,
CrossValidationOption, K, Nested, NestedK, SearchMethod, NMaxEvaluationsRandom,
P)

from .embedding_options import EmbedderName, EmbeddingsFile, embedding_options
from .general_options import general_options
from .input_options import SequenceFile, LabelsFile, input_options
from .model_options import model_options
from .training_options import AutoResume, PretrainedModel, training_options
from ..protocols import Protocol

protocol_rules = [
@@ -54,6 +53,9 @@


class Configurator:
"""
Class to read, validate and transform the input yaml configuration.
"""

def __init__(self, config_dict: Dict, config_file_path: Path = None):
if not config_file_path:
@@ -72,7 +74,15 @@ def from_config_path(cls, config_path: Union[str, Path]):
config_file_path=Path(os.path.dirname(os.path.abspath(config_path))))

@staticmethod
def get_option_dicts_by_protocol(protocol: Protocol, include_cross_validation_options: bool = False):
def get_option_dicts_by_protocol(protocol: Protocol,
include_cross_validation_options: bool = False) -> List[Dict[str, Any]]:
"""
Returns all possible configuration options as dicts for the given protocol.

:param protocol: Protocol to get all options for
:param include_cross_validation_options: Whether cross validation options should be included (they are the same for all protocols)
:return: List of all config options as dicts
"""
result = []
all_config_options_dict = all_options_dict | cross_validation_dict \
if include_cross_validation_options else all_options_dict
@@ -237,6 +247,13 @@ def _verify_cv_config(protocol: Protocol, config_map: Dict[str, ConfigOption],
f"{cv_object.value} not valid for cross validation option {cv_object.name}!")

def get_verified_config(self, ignore_file_checks: bool = False) -> Dict[str, Any]:
"""
Reads the yaml config, performs value transformations (such as downloading files) and verifies the config's
correctness.

:param ignore_file_checks: If True, files are not checked for correctness.
:return: Dictionary with config option names as keys and their respective (transformed) values
"""
config_map, cv_map = self._get_config_maps(protocol=self.protocol, config_dict=self._config_dict,
config_file_path=self._config_file_path)
self._verify_config(protocol=self.protocol, config_map=config_map, ignore_file_checks=ignore_file_checks)
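A short end-to-end sketch of this pipeline (`config.yml` is a hypothetical path; the `"protocol"` key is an assumption):

```python
# Hedged sketch: read, transform and verify a configuration file.
from biotrainer.config import Configurator

configurator = Configurator.from_config_path("config.yml")
verified_config = configurator.get_verified_config(ignore_file_checks=False)
# verified_config maps option names to their (transformed) values:
print(verified_config.get("protocol"))
```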
2 changes: 1 addition & 1 deletion biotrainer/config/cross_validation_options.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import List, Type, Union, Any, Final
from typing import List, Union, Any, Final

from .config_option import ConfigOption, classproperty

1 change: 0 additions & 1 deletion biotrainer/config/embedding_options.py
@@ -3,7 +3,6 @@
from typing import List, Any, Union

from .config_option import FileOption, classproperty, ConfigOption

from ..embedders import get_predefined_embedder_names


4 changes: 2 additions & 2 deletions biotrainer/config/general_options.py
@@ -1,7 +1,7 @@
import os.path
from abc import ABC
from abc import ABC
from pathlib import Path
from typing import List, Type, Any, Union
from typing import List, Any, Union

from .config_option import ConfigOption, classproperty
from ..protocols import Protocol
3 changes: 1 addition & 2 deletions biotrainer/config/input_options.py
@@ -1,6 +1,5 @@
from abc import ABC
from pathlib import Path
from typing import List, Type, Any, Union
from typing import List, Any, Union

from .config_option import FileOption, classproperty
from ..protocols import Protocol
7 changes: 3 additions & 4 deletions biotrainer/config/model_options.py
@@ -1,12 +1,11 @@
from abc import ABC
from typing import List, Type, Any, Union
from typing import List, Any, Union

from .config_option import ConfigOption, classproperty
from ..protocols import Protocol

from ..models import get_available_models_dict
from ..losses import get_available_losses_dict
from ..models import get_available_models_dict
from ..optimizers import get_available_optimizers_dict
from ..protocols import Protocol


class ModelOption(ConfigOption, ABC):
2 changes: 1 addition & 1 deletion biotrainer/config/training_options.py
@@ -1,5 +1,5 @@
from abc import ABC
from typing import List, Type, Any, Union
from typing import List, Any, Union

from .config_option import ConfigOption, FileOption, classproperty

6 changes: 4 additions & 2 deletions biotrainer/datasets/__init__.py
@@ -1,21 +1,23 @@
from typing import List

from .collate_functions import pad_sequence_embeddings, pad_residue_embeddings, pad_residues_embeddings
from .EmbeddingsDataset import ResidueEmbeddingsClassificationDataset, SequenceEmbeddingsClassificationDataset, \
SequenceEmbeddingsRegressionDataset
from .embeddings_dataset import ResidueEmbeddingsClassificationDataset, ResidueEmbeddingsRegressionDataset, \
SequenceEmbeddingsClassificationDataset, SequenceEmbeddingsRegressionDataset

from ..protocols import Protocol

__DATASETS = {
Protocol.residue_to_class: ResidueEmbeddingsClassificationDataset,
Protocol.residues_to_class: ResidueEmbeddingsClassificationDataset,
Protocol.residues_to_value: ResidueEmbeddingsRegressionDataset,
Protocol.sequence_to_class: SequenceEmbeddingsClassificationDataset,
Protocol.sequence_to_value: SequenceEmbeddingsRegressionDataset,
}

__COLLATE_FUNCTIONS = {
Protocol.residue_to_class: pad_residue_embeddings,
Protocol.residues_to_class: pad_residues_embeddings,
Protocol.residues_to_value: pad_residues_embeddings,
Protocol.sequence_to_class: pad_sequence_embeddings,
Protocol.sequence_to_value: pad_sequence_embeddings,
}
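Spelled out for the new protocol (hedged: the two dicts above are module-private, so this snippet imports the mapped objects directly):

```python
# residues_to_value pads like residues_to_class but targets a float value.
from biotrainer.datasets.collate_functions import pad_residues_embeddings
from biotrainer.datasets.embeddings_dataset import ResidueEmbeddingsRegressionDataset
from biotrainer.protocols import Protocol

protocol = Protocol.residues_to_value
dataset_cls = ResidueEmbeddingsRegressionDataset  # per-residue inputs, float targets
collate_fn = pad_residues_embeddings              # same padding as residues_to_class
```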
@@ -25,6 +25,14 @@ class ResidueEmbeddingsClassificationDataset(__EmbeddingsDataset):
pass


class ResidueEmbeddingsRegressionDataset(__EmbeddingsDataset):
def __getitem__(self, index: int) -> Tuple[str, torch.FloatTensor, torch.FloatTensor]:
seq_id = self.ids[index]
x = self.inputs[index].float()
y = self.targets[index].float()
return seq_id, x, y


class SequenceEmbeddingsClassificationDataset(__EmbeddingsDataset):
pass

36 changes: 26 additions & 10 deletions biotrainer/embedders/__init__.py
@@ -2,8 +2,8 @@
import logging

from pathlib import Path
from typing import Union, Optional, List
from transformers import AutoTokenizer, T5Tokenizer, T5EncoderModel
from typing import Union, Optional, List, Tuple
from transformers import AutoTokenizer, T5Tokenizer, T5EncoderModel, EsmTokenizer, EsmModel
from importlib.util import spec_from_file_location, module_from_spec

from .custom_embedder import CustomEmbedder
@@ -18,7 +18,6 @@
"one_hot_encoding": OneHotEncodingEmbedder
}


logger = logging.getLogger(__name__)


@@ -34,6 +33,21 @@ def get_embedding_service(embeddings_file_path: Union[str, None], embedder_name:
return EmbeddingService(embedder=embedder, use_half_precision=use_half_precision)


def _determine_tokenizer_and_model(embedder_name: str) -> Tuple:
"""
Simple method to guess the tokenizer and model classes from the embedder name for huggingface transformers.

@param embedder_name: Name of huggingface transformer
@return: Tuple of tokenizer and model class
"""
if "t5" in embedder_name.lower():
return T5Tokenizer, T5EncoderModel
elif "esm" in embedder_name.lower():
return EsmTokenizer, EsmModel
# Use T5 as default
return T5Tokenizer, T5EncoderModel
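Since the guess is purely name-based, it can be sanity-checked without downloading any weights (the model IDs below are real Hugging Face names used only for illustration; the classes come from this module's imports above):

```python
# Name-based architecture guessing; no network access required for the guess.
assert _determine_tokenizer_and_model("Rostlab/prot_t5_xl_uniref50") == (T5Tokenizer, T5EncoderModel)
assert _determine_tokenizer_and_model("facebook/esm2_t6_8M_UR50D") == (EsmTokenizer, EsmModel)
assert _determine_tokenizer_and_model("unknown_model") == (T5Tokenizer, T5EncoderModel)  # default
```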


def _get_embedder(embedder_name: str, use_half_precision: bool,
device: Optional[Union[str, torch.device]]) -> EmbedderInterface:
# Predefined Embedders
@@ -51,23 +65,25 @@ def _get_embedder(embedder_name: str, use_half_precision: bool,
raise Exception(f"use_half_precision mode is not compatible with embedding "
f"on the CPU. (See: https://github.com/huggingface/transformers/issues/11546)")
torch_dtype = torch.float16 if use_half_precision else torch.float32
auto_tokenizer = False

tokenizer_class, model_class = _determine_tokenizer_and_model(embedder_name)
logger.info(f"Loading embedder model {embedder_name}..")
try:
tokenizer = T5Tokenizer.from_pretrained(embedder_name, torch_dtype=torch_dtype)
model = T5EncoderModel.from_pretrained(embedder_name, torch_dtype=torch_dtype)
tokenizer = tokenizer_class.from_pretrained(embedder_name, torch_dtype=torch_dtype)
model = model_class.from_pretrained(embedder_name, torch_dtype=torch_dtype)
except OSError as os_error:
raise Exception(f"{embedder_name} could not be found!") from os_error
except Exception:
try:
tokenizer = AutoTokenizer.from_pretrained(embedder_name, torch_dtype=torch_dtype)
model = T5EncoderModel.from_pretrained(embedder_name, torch_dtype=torch_dtype)
auto_tokenizer = True
except Exception as e:
raise Exception(f"Loading {embedder_name} automatically and as T5Tokenizer failed! Please provide "
f"a custom_embedder script for your use-case.") from e
raise Exception(f"Loading {embedder_name} automatically and as {tokenizer_class.__class__.__name__} failed!"
f" Please provide a custom_embedder script for your use-case.") from e

logger.info(f"Using huggingface transformer embedder: {embedder_name} "
f"- Tokenizer: {'Auto' if auto_tokenizer else 'T5Tokenizer'} "
f"- Model: {model.__class__.__name__} "
f"- Tokenizer: {tokenizer.__class__.__name__} "
f"- Half-Precision: {str(use_half_precision)}")
return HuggingfaceTransformerEmbedder(name=embedder_name, model=model, tokenizer=tokenizer,
use_half_precision=use_half_precision, device=device)