Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Several bug fixes and improved documentation #4

Merged
merged 40 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
4b04c1c
add ipython image
zhu0619 May 8, 2024
9096b30
minor fix
zhu0619 May 8, 2024
42b833c
allow image export to remote path
zhu0619 May 9, 2024
f1f3abb
minor changes
zhu0619 May 9, 2024
9d365ae
change loglevel
zhu0619 May 15, 2024
2ab83d7
update chemspace viz
zhu0619 May 15, 2024
8412c9a
add dup logger
zhu0619 May 15, 2024
041cc7a
simplify distribution viz
zhu0619 May 15, 2024
d1f41de
update outlier logs
zhu0619 May 15, 2024
f841f12
minor changes
zhu0619 May 15, 2024
1f25fa5
fix serialization
zhu0619 May 16, 2024
a9669a6
add more docstrings
zhu0619 May 16, 2024
582070a
fix docstring and extension
zhu0619 May 17, 2024
021c11e
update css
zhu0619 May 17, 2024
d73f685
minor change
zhu0619 May 17, 2024
085da1c
minor fix
zhu0619 May 17, 2024
8454d5e
add dep
zhu0619 May 17, 2024
e13c677
Merge branch 'main' into fix/minor
zhu0619 May 17, 2024
9e75e01
Update auroris/curation/_curator.py
zhu0619 May 17, 2024
53ad1da
Update auroris/curation/actions/_mol.py
zhu0619 May 17, 2024
ecdf785
Update docs/index.md
zhu0619 May 17, 2024
33569eb
Update auroris/visualization/_distribution.py
zhu0619 May 17, 2024
83cf949
Update docs/index.md
zhu0619 May 17, 2024
26f0a6f
update test_curator_save_load
zhu0619 May 17, 2024
3a3b5e8
minor fix
zhu0619 May 17, 2024
78c731d
Update auroris/report/broadcaster/_logger.py
zhu0619 May 17, 2024
ed0c7f6
curator save/load
zhu0619 May 22, 2024
499eb26
avoid ipython image
zhu0619 May 22, 2024
d901238
minor viz fix
zhu0619 May 22, 2024
97c9bd3
wip
zhu0619 May 22, 2024
ed7a16e
refactor image name
zhu0619 May 22, 2024
e0699c5
Update docs/index.md
zhu0619 May 22, 2024
b417935
Update auroris/utils.py
zhu0619 May 22, 2024
c50af5b
fix attribute discriminator
zhu0619 May 22, 2024
4ee0270
fix report
zhu0619 May 22, 2024
b6757ca
minor fix
zhu0619 May 22, 2024
4b2bae8
remove gcp
zhu0619 May 22, 2024
5ec838a
update tutorial
zhu0619 May 23, 2024
6d836ff
add pyarrow dep
zhu0619 May 23, 2024
becae34
Update the documentation (#5)
cwognum May 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions auroris/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@
import pandas as pd
import typer

from typing import Optional
from auroris.curation import Curator
from auroris.report.broadcaster import HTMLBroadcaster

app = typer.Typer()


@app.command()
def curate(config_path: str, dataset_path: str, destination: str, overwrite: bool = False):
def curate(config_path: str, destination: str, dataset_path: Optional[str] = None, overwrite: bool = False):
# Load data
dataset = pd.read_csv(dataset_path)
dataset = pd.read_csv(dataset_path) if dataset_path else None
curator = Curator.from_json(config_path)

# Run curation
Expand Down
55 changes: 49 additions & 6 deletions auroris/curation/_curator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
from typing import List, Tuple, Union
from typing import List, Tuple, Union, Optional

from os import PathLike
import fsspec
import pandas as pd
from loguru import logger
Expand All @@ -15,14 +16,23 @@ class Curator(BaseModel):
"""
A curator is a collection of actions that are applied to a dataset.
Can be serialized.

"""

# To know which Action object to create, we need a discriminated union.
# This is the recommended way to add all subclasses in the type.
# See e.g. https://github.com/pydantic/pydantic/issues/2200
# and https://github.com/pydantic/pydantic/issues/2036
steps: List[Union[tuple(ACTION_REGISTRY)]] = Field(..., discriminator="name") # type: ignore

data_path: Optional[Union[str, PathLike]] = Field(
cwognum marked this conversation as resolved.
Show resolved Hide resolved
default=None,
description="Data path. The data must be loadable by `pd.read_csv` with default parameters.",
)

steps: List[Union[tuple(ACTION_REGISTRY)]] = Field(
...,
discriminator="name",
description="List of curation actions. Check all the available action <auroris.curation.actions.__all__>.",
cwognum marked this conversation as resolved.
Show resolved Hide resolved
)
verbosity: VerbosityLevel = VerbosityLevel.NORMAL
parallelized_kwargs: dict = Field(default_factory=dict)

Expand All @@ -36,9 +46,26 @@ def _validate_verbosity(cls, v):
def _serialize_verbosity(self, value: VerbosityLevel):
return value.name

def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport]:
@field_validator("data_path", mode="before")
def _validate_data_path(cls, value: Union[str, PathLike]):
    """Validate that ``data_path`` points to a CSV file pandas can parse.

    Only a small sample (5 rows) is read to keep validation cheap; the
    full load happens later in ``_load_data``.

    Args:
        value: Path (local or fsspec-style remote) to the dataset.

    Returns:
        The validated path, unchanged.

    Raises:
        ValueError: If the file cannot be read by ``pd.read_csv``.
    """
    try:
        # Read only the first few rows as a cheap sanity check.
        pd.read_csv(value, nrows=5)
        return value
    except Exception as err:
        # Chain the original error so the root cause stays visible.
        raise ValueError(
            f"Dataset cannot be loaded by `pandas.read_csv('{value}')`. "
            f"Consider passing the loaded data directly to `Curator.transform()`."
        ) from err

def _load_data(self):
return pd.read_csv(self.data_path)

def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, CurationReport]:
if dataset is None:
dataset = self._load_data()
cwognum marked this conversation as resolved.
Show resolved Hide resolved

report = CurationReport()
dataset = dataset.copy(deep=True)
dataset = dataset.copy()
cwognum marked this conversation as resolved.
Show resolved Hide resolved

action: BaseAction
for action in self.steps:
Expand All @@ -57,6 +84,13 @@ def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport
def __call__(self, dataset):
return self.transform(dataset)

@classmethod
def _get_action(cls, name: str):
    """Look up an action class in ``ACTION_REGISTRY`` by its class name.

    Args:
        name: The ``__name__`` of the action class to find.

    Returns:
        The first matching action class, or ``None`` if no class matches.
    """
    return next((action for action in ACTION_REGISTRY if action.__name__ == name), None)
cwognum marked this conversation as resolved.
Show resolved Hide resolved

@classmethod
def from_json(cls, path: str):
"""Loads a curation workflow from a JSON file.
Expand All @@ -66,6 +100,9 @@ def from_json(cls, path: str):
"""
with fsspec.open(path, "r") as f:
data = json.load(f)

steps = [cls._get_action(name)(**args) for step in data["steps"] for name, args in step.items()]
data["steps"] = steps
cwognum marked this conversation as resolved.
Show resolved Hide resolved
return cls.model_validate(data)

def to_json(self, path: str):
Expand All @@ -74,6 +111,12 @@ def to_json(self, path: str):
Args:
path: The destination to save to
"""
serialization = self.model_dump(exclude="steps")
# remove data_path
if self.data_path is None:
serialization.pop("data_path")
# save steps in defined order
serialization["steps"] = [{step.name: step.model_dump()} for step in self.steps]
cwognum marked this conversation as resolved.
Show resolved Hide resolved
with fsspec.open(path, "w") as f:
json.dump(self.model_dump(), f)
json.dump(serialization, f)
return path
41 changes: 30 additions & 11 deletions auroris/curation/actions/_ac_stereoisomer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from typing import Dict, List, Optional
from pydantic import Field

import datamol as dm
import numpy as np
import pandas as pd


from auroris.curation.actions._base import BaseAction
from auroris.curation.actions._outlier import modified_zscore
from auroris.report import CurationReport
Expand All @@ -15,9 +17,19 @@ def detect_streoisomer_activity_cliff(
dataset: pd.DataFrame,
stereoisomer_id_col: str,
y_cols: List[str],
threshold: float = 1.0,
threshold: float = 2.0,
prefix: str = "AC_",
):
) -> pd.DataFrame:
"""
Detect activity cliff among stereoisomers based on classification label or pre-defined threshold for continuous values.

Args:
dataset: Dataframe
stereoisomer_id_col: Column which identifies the stereoisomers
y_cols: List of columns for bioactivities
threshold: Threshold to identify the activity cliff. Currently, the difference of zscores between isomers are used for identification.
prefix: Prefix for the adding columns
"""
dataset_ori = dataset.copy(deep=True)
ac_cols = {y_col: [] for y_col in y_cols}
group_index_list = np.array(
Expand Down Expand Up @@ -51,14 +63,19 @@ def detect_streoisomer_activity_cliff(

class StereoIsomerACDetection(BaseAction):
"""
Automatic detection of outliers.
Automatic detection of activity shift between stereoisomers.
"""

stereoisomer_id_col: str
y_cols: List[str]
threshold: float = 2.0
prefix: str = "AC_"
mol_col: str = "MOL_smiles"
stereoisomer_id_col: str = Field(
default="MOL_molhash_id_no_stereo", description="Column which identifies the stereoisomers."
)
y_cols: List[str] = Field(..., description="List of columns for bioactivities.")
threshold: float = Field(
default=2.0,
description=" Threshold to identify the activity cliff. Currently, the difference of zscores between isomers are used for identification.",
)
prefix: str = Field(default="AC_", description="Prefix for the adding columns.")
mol_col: str = Field(default="MOL_smiles", description="Column for molecule strings.")

def transform(
self,
Expand All @@ -80,7 +97,7 @@ def transform(
col_with_prefix = self.get_column_name(col)
report.log_new_column(col_with_prefix)

has_cliff = dataset[col_with_prefix].notna()
has_cliff = dataset[col_with_prefix].__eq__(True)
cwognum marked this conversation as resolved.
Show resolved Hide resolved
num_cliff = has_cliff.sum()

if num_cliff > 0:
Expand All @@ -92,10 +109,12 @@ def transform(
legends = (col + dataset.loc[has_cliff, col].astype(str)).tolist()

image = dm.to_image([dm.to_mol(s) for s in to_plot], legends=legends, use_svg=False)
report.log_image(image)
report.log_image(
image_or_figure=image, title="Detection of activity shifts among stereoisomers"
)

else:
report.log(
"Found no activity cliffs among stereoisomers with respect to the {col} column."
f"Found no activity cliffs among stereoisomers with respect to the {col} column."
)
return dataset
9 changes: 2 additions & 7 deletions auroris/curation/actions/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import TYPE_CHECKING, Dict, Optional

import pandas as pd
from pydantic import BaseModel, model_validator
from pydantic import BaseModel, model_validator, Field

from auroris.types import VerbosityLevel

Expand All @@ -16,14 +16,9 @@
class BaseAction(BaseModel, abc.ABC):
"""
An action in the curation process.

Args:
prefix: If the action adds columns, use this prefix.
completed: If the action has completed.
dep_action: Name of dependent action.
"""

prefix: str = None
prefix: str = Field(default=None, description="If the action adds columns, use this prefix.")

@property
def name(self) -> str:
Expand Down
21 changes: 16 additions & 5 deletions auroris/curation/actions/_deduplicate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Dict, List, Literal, Optional, Union
from pydantic import Field

import pandas as pd

Expand Down Expand Up @@ -55,10 +56,16 @@ class Deduplication(BaseAction):
Automatic detection of outliers.
"""

deduplicate_on: Optional[Union[str, List[str]]] = None
y_cols: Optional[Union[str, List[str]]] = None
keep: Literal["first", "last"] = "first"
method: Literal["mean", "median"] = "median"
deduplicate_on: Optional[Union[str, List[str]]] = Field(
default=None, description="A subset of the columns to deduplicate on (can be default)."
)
y_cols: Optional[Union[str, List[str]]] = Field(default=None, description="The columns to aggregate.")
keep: Literal["first", "last"] = Field(
default="first", description="Whether to keep the first or last copy of the duplicates."
)
method: Literal["mean", "median"] = Field(
default="median", description="The method to aggregate the data."
)

def transform(
self,
Expand All @@ -67,10 +74,14 @@ def transform(
verbosity: VerbosityLevel = VerbosityLevel.NORMAL,
parallelized_kwargs: Optional[Dict] = None,
):
return deduplicate(
dataset_dedup = deduplicate(
dataset,
deduplicate_on=self.deduplicate_on,
y_cols=self.y_cols,
keep=self.keep,
method=self.method,
)
if report is not None:
num_duplicates = dataset.shape[0] - dataset_dedup.shape[0]
cwognum marked this conversation as resolved.
Show resolved Hide resolved
report.log(f"Deduplication merged and removed {num_duplicates} duplicated molecules from dataset")
return dataset_dedup
42 changes: 35 additions & 7 deletions auroris/curation/actions/_discretize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Dict, List, Literal, Optional, Union
from pydantic import Field

import numpy as np
import pandas as pd
Expand All @@ -17,7 +18,8 @@ def discretize(
allow_nan: bool = True,
label_order: Literal["ascending", "descending"] = "ascending",
) -> np.ndarray:
"""Thresholding of array-like or scipy.sparse matrix into binary or multiclass labels.
"""
Thresholding of array-like or scipy.sparse matrix into binary or multiclass labels.

Args:
X : The data to discretize, element by element.
Expand Down Expand Up @@ -76,13 +78,39 @@ def discretize(


class Discretization(BaseAction):
input_column: str
"""
Thresholding bioactivity columns to binary or multiclass labels.
"""

input_column: str = Field(..., description="Column to be discretized.")
prefix: str = "CLS_"
thresholds: List[float]
inplace: bool = False
allow_nan: bool = True
label_order: Literal["ascending", "descending"] = "ascending"
log_scale: bool = True
thresholds: List[float] = Field(..., description="Interval boundaries that include the right bin edge.")
inplace: bool = Field(
default=False,
description="""Set to True to perform inplace discretization and avoid a copy
(if the input is already a numpy array or a scipy.sparse CSR / CSC
matrix and if axis is 1).""",
)
allow_nan: bool = Field(
default=True,
description="Set to True to allow nans in the array for discretization. Otherwise, an error will be raised instead.",
)
label_order: Literal["ascending", "descending"] = Field(
default="ascending",
description="""The continuous values are discretized to labels 0, 1, 2, .., N with respect to given
threshold bins [threshold_1, threshold_2,.., threshould_n].
When set to 'ascending', the class label is in ascending order with the threshold
bins that `0` represents negative class or lower class, while 1, 2, 3 are for higher classes.
When set to 'descending' the class label is in ascending order with the threshold bins.
Sometimes the positive labels are on the left side of provided threshold.
E.g. For binarization with threshold [0.5], the positive label is defined
by`X < 0.5`. In this case, `label_order` should be `descending`.""",
)
log_scale: bool = Field(
default=False,
description="""Whether visualize distribution in log scale.
See more in <auroris.visualization.visualize_continuous_distribution>""",
)

def transform(
self,
Expand Down
19 changes: 11 additions & 8 deletions auroris/curation/actions/_distribution.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Dict, List, Optional

import pandas as pd
from typing import Dict, List, Optional, Sequence
from pydantic import Field
import pandas as pd

from auroris.curation.actions._base import BaseAction
from auroris.report import CurationReport
Expand All @@ -11,12 +10,16 @@

class ContinuousDistributionVisualization(BaseAction):
"""
Visualize a continuous distribution
Visualize a continuous distribution.
"""

y_cols: Optional[List[str]] = None
log_scale: bool = False
kwargs: Dict = Field(default_factory=dict)
y_cols: Optional[List[str]] = Field(
default=None, description="List of columns for bioactivity for visualization."
)
log_scale: bool = Field(default=False, description="Whether visualize distribution in log scale.")
bins: Optional[Sequence[float]] = Field(
default=None, description="The bin boundaries to color the area under the KDE curve."
)

def transform(
self,
Expand All @@ -28,7 +31,7 @@ def transform(
if report is not None:
for y_col in self.y_cols:
fig = visualize_continuous_distribution(
data=dataset[y_col], label_name=y_col, log_scale=self.log_scale
data=dataset[y_col], log_scale=self.log_scale, bins=self.bins
)
report.log_image(fig, title=f"Data distribution - {y_col}")

Expand Down
Loading
Loading