Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Several bug fixes and improved documentation #4

Merged
merged 40 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
4b04c1c
add ipython image
zhu0619 May 8, 2024
9096b30
minor fix
zhu0619 May 8, 2024
42b833c
allow image export to remote path
zhu0619 May 9, 2024
f1f3abb
minor changes
zhu0619 May 9, 2024
9d365ae
change loglevel
zhu0619 May 15, 2024
2ab83d7
update chemspace viz
zhu0619 May 15, 2024
8412c9a
add dup logger
zhu0619 May 15, 2024
041cc7a
simplify distribution viz
zhu0619 May 15, 2024
d1f41de
update outlier logs
zhu0619 May 15, 2024
f841f12
minor changes
zhu0619 May 15, 2024
1f25fa5
fix serialization
zhu0619 May 16, 2024
a9669a6
add more docstrings
zhu0619 May 16, 2024
582070a
fix docstring and extension
zhu0619 May 17, 2024
021c11e
update css
zhu0619 May 17, 2024
d73f685
minor change
zhu0619 May 17, 2024
085da1c
minor fix
zhu0619 May 17, 2024
8454d5e
add dep
zhu0619 May 17, 2024
e13c677
Merge branch 'main' into fix/minor
zhu0619 May 17, 2024
9e75e01
Update auroris/curation/_curator.py
zhu0619 May 17, 2024
53ad1da
Update auroris/curation/actions/_mol.py
zhu0619 May 17, 2024
ecdf785
Update docs/index.md
zhu0619 May 17, 2024
33569eb
Update auroris/visualization/_distribution.py
zhu0619 May 17, 2024
83cf949
Update docs/index.md
zhu0619 May 17, 2024
26f0a6f
update test_curator_save_load
zhu0619 May 17, 2024
3a3b5e8
minor fix
zhu0619 May 17, 2024
78c731d
Update auroris/report/broadcaster/_logger.py
zhu0619 May 17, 2024
ed0c7f6
curator save/load
zhu0619 May 22, 2024
499eb26
avoid ipython image
zhu0619 May 22, 2024
d901238
minor viz fix
zhu0619 May 22, 2024
97c9bd3
wip
zhu0619 May 22, 2024
ed7a16e
refactor image name
zhu0619 May 22, 2024
e0699c5
Update docs/index.md
zhu0619 May 22, 2024
b417935
Update auroris/utils.py
zhu0619 May 22, 2024
c50af5b
fix attribute discriminator
zhu0619 May 22, 2024
4ee0270
fix report
zhu0619 May 22, 2024
b6757ca
minor fix
zhu0619 May 22, 2024
4b2bae8
remove gcp
zhu0619 May 22, 2024
5ec838a
update tutorial
zhu0619 May 23, 2024
6d836ff
add pyarrow dep
zhu0619 May 23, 2024
becae34
Update the documentation (#5)
cwognum May 23, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ curator = Curator(
# Run the curation
dataset, report = curator(dataset)
```
### Run curation from the command line
A `Curator` object is serializable, so you can save it to and load it from a JSON file to reproduce the curation.

```
auroris [config_file] [destination] --dataset-path [data_path]
```

## Documentation

Expand Down
22 changes: 15 additions & 7 deletions auroris/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional

import datamol as dm
import pandas as pd
import typer

from auroris.curation import Curator
Expand All @@ -9,18 +10,25 @@


@app.command()
def curate(config_path: str, dataset_path: str, destination: str, overwrite: bool = False):
# Load data
dataset = pd.read_csv(dataset_path)
def curate(config_path: str, destination: str, dataset_path: Optional[str] = None, overwrite: bool = False):
# Create the curator
curator = Curator.from_json(config_path)

# Overwrite the source dataset if it is set
if dataset_path is not None:
curator.src_dataset_path = dataset_path

# Run curation
dataset, report = curator(dataset)
dataset, report = curator.transform()

# Save dataset
dm.fs.mkdir(destination, exist_ok=overwrite)
path = dm.fs.join(destination, "curated.csv")
dataset.to_csv(path, index=False)
path = dm.fs.join(destination, "curated.parquet")
dataset.to_parquet(path, index=False)

# Save a copy of the curation config
config_destination = dm.fs.join(destination, "config.json")
curator.to_json(config_destination)

# Save report as HTML
report_destination = dm.fs.join(destination, "report")
Expand Down
94 changes: 84 additions & 10 deletions auroris/curation/_curator.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,41 @@
import json
from typing import List, Tuple, Union
from typing import Annotated, List, Optional, Tuple, Union

import datamol as dm
import fsspec
import pandas as pd
from loguru import logger
from pydantic import BaseModel, Field, field_serializer, field_validator

from auroris.curation.actions._base import ACTION_REGISTRY, BaseAction
from auroris.curation.actions import BaseAction
from auroris.report import CurationReport
from auroris.types import VerbosityLevel
from auroris.utils import is_parquet_file


class Curator(BaseModel):
"""
A curator is a collection of actions that are applied to a dataset.
Can be serialized.
A curator is a serializable collection of actions that are applied to a dataset.

Attributes:
steps (List[BaseAction]): Ordered list of curation actions to apply to the dataset.
src_dataset_path: An optional path to load the source dataset from. Can be used to specify a reproducible workflow.
verbosity: Verbosity level for logging.
parallelized_kwargs: Keyword arguments to affect parallelization in the steps.
"""

# To know which Action object to create, we need a discriminated union.
# This is the recommended way to add all subclasses in the type.
# See e.g. https://github.com/pydantic/pydantic/issues/2200
# and https://github.com/pydantic/pydantic/issues/2036
steps: List[Union[tuple(ACTION_REGISTRY)]] = Field(..., discriminator="name") # type: ignore

steps: List[
Annotated[
Union[tuple(BaseAction.__subclasses__())], # type: ignore
Field(..., discriminator="name"),
]
]

src_dataset_path: Optional[str] = None
verbosity: VerbosityLevel = VerbosityLevel.NORMAL
parallelized_kwargs: dict = Field(default_factory=dict)

Expand All @@ -36,8 +49,56 @@ def _validate_verbosity(cls, v):
def _serialize_verbosity(self, value: VerbosityLevel):
return value.name

def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport]:
@field_validator("src_dataset_path")
def _validate_src_dataset_path(cls, value: Optional[str]):
# If not set, no need to validate
if value is None:
return value

# Efficient check to see if it's a valid path to a supported file
if not is_parquet_file(value):
try:
pd.read_csv(value, nrows=5)
except Exception:
raise ValueError(
f"Dataset can't be loaded by `pandas.read_csv('{value}')` nor `pandas.read_parquet('{value}')`."
f"Consider passing the DataFrame directly to `Curator.transform(dataset=...)`."
)

# If it's set, but local, warn the user that this hinders reproducibility.
if dm.utils.fs.is_local_path(value):
logger.warning(
"Using a local path for `src_dataset_path` hinders reproducibility. "
"Consider uploading the file to a public cloud storage service."
)
return value

def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, CurationReport]:
"""Runs the curation process.

Args:
dataset: The dataset to be curated. If `src_dataset_path` is set, this parameter is ignored.

Returns:
A tuple of the curated dataset and a report summarizing the changes made.
"""

if self.src_dataset_path is not None:
if dataset is not None:
logger.warning(
"Both `self.src_dataset_path` and the `dataset` parameter are specified. "
"Ignoring the `dataset` parameter."
)

dataset = self.load_dataset(self.src_dataset_path)

if dataset is None:
raise ValueError("Running the curator requires a source dataset.")

# The report summarizes the changes made to the dataset
report = CurationReport()

# Changes are not made in place
dataset = dataset.copy(deep=True)

action: BaseAction
Expand All @@ -54,6 +115,20 @@ def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport

return dataset, report

@staticmethod
def load_dataset(path: str):
"""
Loads a dataset to be curated from the given path.

Info: File-format support
This currently only supports CSV and Parquet files and uses the default
parameters for `pd.read_csv` and `pd.read_parquet`. If you need more flexibility,
consider loading the data yourself and passing it directly to `Curator.transform(dataset=...)`.
"""
if not is_parquet_file(path):
return pd.read_csv(path)
return pd.read_parquet(path)

def __call__(self, dataset):
return self.transform(dataset)

Expand All @@ -66,14 +141,13 @@ def from_json(cls, path: str):
"""
with fsspec.open(path, "r") as f:
data = json.load(f)
return cls.model_validate(data)
return cls(**data)

def to_json(self, path: str):
"""Saves the curation workflow to a JSON file.

Args:
path: The destination to save to
path: The destination to save to.
"""
with fsspec.open(path, "w") as f:
json.dump(self.model_dump(), f)
return path
56 changes: 42 additions & 14 deletions auroris/curation/actions/_ac_stereoisomer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import Dict, List, Optional
from typing import Dict, List, Literal, Optional

import datamol as dm
import numpy as np
import pandas as pd
from pydantic import Field

from auroris.curation.actions._base import BaseAction
from auroris.curation.actions._outlier import modified_zscore
Expand All @@ -15,9 +16,19 @@ def detect_streoisomer_activity_cliff(
dataset: pd.DataFrame,
stereoisomer_id_col: str,
y_cols: List[str],
threshold: float = 1.0,
threshold: float = 2.0,
prefix: str = "AC_",
):
) -> pd.DataFrame:
"""
Detect activity cliff among stereoisomers based on classification label or pre-defined threshold for continuous values.

Args:
dataset: Dataframe
stereoisomer_id_col: Column which identifies the stereoisomers
y_cols: List of columns for bioactivities
threshold: Threshold to identify the activity cliff. Currently, the difference of z-scores between isomers is used for identification.
prefix: Prefix for the columns that are added
"""
dataset_ori = dataset.copy(deep=True)
ac_cols = {y_col: [] for y_col in y_cols}
group_index_list = np.array(
Expand Down Expand Up @@ -51,14 +62,23 @@ def detect_streoisomer_activity_cliff(

class StereoIsomerACDetection(BaseAction):
"""
Automatic detection of outliers.
Automatic detection of activity shift between stereoisomers.

See [`auroris.curation.functional.detect_streoisomer_activity_cliff`][] for the docs of the
`stereoisomer_id_col`, `y_cols` and `threshold` attributes

Attributes:
mol_col: Column with the SMILES or RDKit Molecule objects.
If specified, will be used to render an image for the activity cliffs.
"""

stereoisomer_id_col: str
y_cols: List[str]
threshold: float = 2.0
name: Literal["ac_stereoisomer"] = "ac_stereoisomer"
prefix: str = "AC_"
mol_col: str = "MOL_smiles"

stereoisomer_id_col: str = "MOL_molhash_id_no_stereo"
y_cols: List[str] = Field(default_factory=list)
threshold: float = 2.0
mol_col: Optional[str] = "MOL_smiles"

def transform(
self,
Expand All @@ -75,27 +95,35 @@ def transform(
prefix=self.prefix,
)

# Log the following information to the report:
# - Newly added columns
# - Number of activity cliffs found
# - Image of the activity cliffs

if report is not None:
for col in self.y_cols:
col_with_prefix = self.get_column_name(col)
report.log_new_column(col_with_prefix)

has_cliff = dataset[col_with_prefix].notna()
has_cliff = dataset[col_with_prefix]
num_cliff = has_cliff.sum()

if num_cliff > 0:
report.log(
f"Found {num_cliff} activity cliffs among stereoisomers "
f"with respect to the {col} column."
)
to_plot = dataset.loc[has_cliff, self.mol_col]
legends = (col + dataset.loc[has_cliff, col].astype(str)).tolist()

image = dm.to_image([dm.to_mol(s) for s in to_plot], legends=legends, use_svg=False)
report.log_image(image)
if self.mol_col is not None:
to_plot = dataset.loc[has_cliff, self.mol_col]
legends = (col + dataset.loc[has_cliff, col].astype(str)).tolist()
image = dm.to_image([dm.to_mol(s) for s in to_plot], legends=legends, use_svg=False)
report.log_image(
image_or_figure=image, title="Detection of activity shifts among stereoisomers"
)

else:
report.log(
"Found no activity cliffs among stereoisomers with respect to the {col} column."
f"Found no activity cliffs among stereoisomers with respect to the {col} column."
)
return dataset
26 changes: 10 additions & 16 deletions auroris/curation/actions/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,24 @@
from auroris.report import CurationReport


ACTION_REGISTRY = []


class BaseAction(BaseModel, abc.ABC):
"""
An action in the curation process.

Args:
prefix: If the action adds columns, use this prefix.
completed: If the action has completed.
dep_action: Name of dependent action.
Info: The importance of reproducibility
One of the main goals in designing `auroris` is to make it easy to reproduce the curation process.
Reproducibility is key to scientific research. This is why a BaseAction needs to be serializable and
uniquely identified by a `name`.

Attributes:
name: The name that uniquely identifies the action. This is used to serialize and deserialize the action.
prefix: This prefix is used when an action adds columns to a dataset.
If not set, it defaults to the name in uppercase.
"""

name: str
prefix: str = None

@property
def name(self) -> str:
"""The name of the action. Needs to be unique."""
return self.__class__.__name__

@model_validator(mode="after")
@classmethod
def _validate_model(cls, m: "BaseAction"):
Expand All @@ -52,7 +50,3 @@ def transform(

def __call__(self, dataset: pd.DataFrame):
return self.transform(dataset)

def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
ACTION_REGISTRY.append(cls)
11 changes: 10 additions & 1 deletion auroris/curation/actions/_deduplicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,13 @@ def deduplicate(
class Deduplication(BaseAction):
"""
Automatic deduplication of molecules in the dataset.

See [`auroris.curation.functional.deduplicate`][] for the docs of the
`deduplicate_on`, `y_cols`, `keep` and `method` attributes
"""

name: Literal["deduplicate"] = "deduplicate"

deduplicate_on: Optional[Union[str, List[str]]] = None
y_cols: Optional[Union[str, List[str]]] = None
keep: Literal["first", "last"] = "first"
Expand All @@ -67,10 +72,14 @@ def transform(
verbosity: VerbosityLevel = VerbosityLevel.NORMAL,
parallelized_kwargs: Optional[Dict] = None,
):
return deduplicate(
dataset_dedup = deduplicate(
dataset,
deduplicate_on=self.deduplicate_on,
y_cols=self.y_cols,
keep=self.keep,
method=self.method,
)
if report is not None:
num_duplicates = len(dataset) - len(dataset_dedup)
report.log(f"Deduplication merged and removed {num_duplicates} duplicated molecules from dataset")
return dataset_dedup
Loading
Loading