Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Several bug fixes and improved documentation #4

Merged
merged 40 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
4b04c1c
add ipython image
zhu0619 May 8, 2024
9096b30
minor fix
zhu0619 May 8, 2024
42b833c
allow image export to remote path
zhu0619 May 9, 2024
f1f3abb
minor changes
zhu0619 May 9, 2024
9d365ae
change loglevel
zhu0619 May 15, 2024
2ab83d7
update chemspace viz
zhu0619 May 15, 2024
8412c9a
add dup logger
zhu0619 May 15, 2024
041cc7a
simplify distribution viz
zhu0619 May 15, 2024
d1f41de
update outlier logs
zhu0619 May 15, 2024
f841f12
minor changes
zhu0619 May 15, 2024
1f25fa5
fix serialization
zhu0619 May 16, 2024
a9669a6
add more docstrings
zhu0619 May 16, 2024
582070a
fix docstring and extension
zhu0619 May 17, 2024
021c11e
update css
zhu0619 May 17, 2024
d73f685
minor change
zhu0619 May 17, 2024
085da1c
minor fix
zhu0619 May 17, 2024
8454d5e
add dep
zhu0619 May 17, 2024
e13c677
Merge branch 'main' into fix/minor
zhu0619 May 17, 2024
9e75e01
Update auroris/curation/_curator.py
zhu0619 May 17, 2024
53ad1da
Update auroris/curation/actions/_mol.py
zhu0619 May 17, 2024
ecdf785
Update docs/index.md
zhu0619 May 17, 2024
33569eb
Update auroris/visualization/_distribution.py
zhu0619 May 17, 2024
83cf949
Update docs/index.md
zhu0619 May 17, 2024
26f0a6f
update test_curator_save_load
zhu0619 May 17, 2024
3a3b5e8
minor fix
zhu0619 May 17, 2024
78c731d
Update auroris/report/broadcaster/_logger.py
zhu0619 May 17, 2024
ed0c7f6
curator save/load
zhu0619 May 22, 2024
499eb26
avoid ipython image
zhu0619 May 22, 2024
d901238
minor viz fix
zhu0619 May 22, 2024
97c9bd3
wip
zhu0619 May 22, 2024
ed7a16e
refactor image name
zhu0619 May 22, 2024
e0699c5
Update docs/index.md
zhu0619 May 22, 2024
b417935
Update auroris/utils.py
zhu0619 May 22, 2024
c50af5b
fix attribute discriminator
zhu0619 May 22, 2024
4ee0270
fix report
zhu0619 May 22, 2024
b6757ca
minor fix
zhu0619 May 22, 2024
4b2bae8
remove gcp
zhu0619 May 22, 2024
5ec838a
update tutorial
zhu0619 May 23, 2024
6d836ff
add pyarrow dep
zhu0619 May 23, 2024
becae34
Update the documentation (#5)
cwognum May 23, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ curator = Curator(
# Run the curation
dataset, report = curator(dataset)
```
### Run curation from the command line
A `Curator` object is serializable, so you can save it to and load it from a JSON file to reproduce the curation.

```
auroris [config_file] [destination] --dataset-path [data_path]
```

## Documentation

Expand Down
22 changes: 15 additions & 7 deletions auroris/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional

import datamol as dm
import pandas as pd
import typer

from auroris.curation import Curator
Expand All @@ -9,18 +10,25 @@


@app.command()
def curate(config_path: str, dataset_path: str, destination: str, overwrite: bool = False):
# Load data
dataset = pd.read_csv(dataset_path)
def curate(config_path: str, destination: str, dataset_path: Optional[str] = None, overwrite: bool = False):
# Create the curator
curator = Curator.from_json(config_path)

# Overwrite the source dataset if it is set
if dataset_path is not None:
curator.src_dataset_path = dataset_path

# Run curation
dataset, report = curator(dataset)
dataset, report = curator.transform()

# Save dataset
dm.fs.mkdir(destination, exist_ok=overwrite)
path = dm.fs.join(destination, "curated.csv")
dataset.to_csv(path, index=False)
path = dm.fs.join(destination, "curated.parquet")
dataset.to_parquet(path, index=False)

# Save a copy of the curation config
config_destination = dm.fs.join(destination, "config.json")
curator.to_json(config_destination)

# Save report as HTML
report_destination = dm.fs.join(destination, "report")
Expand Down
94 changes: 84 additions & 10 deletions auroris/curation/_curator.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,41 @@
import json
from typing import List, Tuple, Union
from typing import Annotated, List, Optional, Tuple, Union

import datamol as dm
import fsspec
import pandas as pd
from loguru import logger
from pydantic import BaseModel, Field, field_serializer, field_validator

from auroris.curation.actions._base import ACTION_REGISTRY, BaseAction
from auroris.curation.actions import BaseAction
from auroris.report import CurationReport
from auroris.types import VerbosityLevel
from auroris.utils import is_parquet_file


class Curator(BaseModel):
"""
A curator is a collection of actions that are applied to a dataset.
Can be serialized.
A curator is a serializable collection of actions that are applied to a dataset.

Attributes:
steps (List[BaseAction]): Ordered list of curation actions to apply to the dataset.
src_dataset_path: An optional path to load the source dataset from. Can be used to specify a reproducible workflow.
verbosity: Verbosity level for logging.
parallelized_kwargs: Keyword arguments to affect parallelization in the steps.
"""

# To know which Action object to create, we need a discriminated union.
# This is the recommended way to add all subclasses in the type.
# See e.g. https://github.com/pydantic/pydantic/issues/2200
# and https://github.com/pydantic/pydantic/issues/2036
steps: List[Union[tuple(ACTION_REGISTRY)]] = Field(..., discriminator="name") # type: ignore

steps: List[
Annotated[
Union[tuple(BaseAction.__subclasses__())], # type: ignore
Field(..., discriminator="name"),
]
]

src_dataset_path: Optional[str] = None
verbosity: VerbosityLevel = VerbosityLevel.NORMAL
parallelized_kwargs: dict = Field(default_factory=dict)

Expand All @@ -36,8 +49,56 @@ def _validate_verbosity(cls, v):
def _serialize_verbosity(self, value: VerbosityLevel):
return value.name

def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport]:
@field_validator("src_dataset_path")
def _validate_src_dataset_path(cls, value: Optional[str]):
# If not set, no need to validate
if value is None:
return value

# Efficient check to see if it's a valid path to a supported file
if not is_parquet_file(value):
try:
pd.read_csv(value, nrows=5)
except Exception:
raise ValueError(
f"Dataset can't be loaded by `pandas.read_csv('{value}')` nor `pandas.read_parquet('{value}')`."
f"Consider passing the DataFrame directly to `Curator.transform(dataset=...)`."
)

# If it's set, but local, warn the user that this hinders reproducibility.
if dm.utils.fs.is_local_path(value):
logger.warning(
"Using a local path for `src_dataset_path` hinders reproducibility. "
"Consider uploading the file to a public cloud storage service."
)
return value

def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, CurationReport]:
"""Runs the curation process.

Args:
dataset: The dataset to be curated. If `src_dataset_path` is set, this parameter is ignored.

Returns:
A tuple of the curated dataset and a report summarizing the changes made.
"""

if self.src_dataset_path is not None:
if dataset is not None:
logger.warning(
"Both `self.src_dataset_path` and the `dataset` parameter are specified. "
"Ignoring the `dataset` parameter."
)

dataset = self.load_dataset(self.src_dataset_path)

if dataset is None:
raise ValueError("Running the curator requires a source dataset.")

# The report summarizes the changes made to the dataset
report = CurationReport()

# Changes are not made in place
dataset = dataset.copy(deep=True)

action: BaseAction
Expand All @@ -54,6 +115,20 @@ def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport

return dataset, report

@staticmethod
def load_dataset(path: str):
"""
Loads a dataset to be curated from the given path.

Info: File-format support
This currently only supports CSV and Parquet files and uses the default
parameters for `pd.read_csv` and `pd.read_parquet`. If you need more flexibility,
consider loading the data yourself and passing it directly to `Curator.transform(dataset=...)`.
"""
if not is_parquet_file(path):
return pd.read_csv(path)
return pd.read_parquet(path)

def __call__(self, dataset):
return self.transform(dataset)

Expand All @@ -66,14 +141,13 @@ def from_json(cls, path: str):
"""
with fsspec.open(path, "r") as f:
data = json.load(f)
return cls.model_validate(data)
return cls(**data)

def to_json(self, path: str):
"""Saves the curation workflow to a JSON file.

Args:
path: The destination to save to
path: The destination to save to.
"""
with fsspec.open(path, "w") as f:
json.dump(self.model_dump(), f)
return path
56 changes: 42 additions & 14 deletions auroris/curation/actions/_ac_stereoisomer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import Dict, List, Optional
from typing import Dict, List, Literal, Optional

import datamol as dm
import numpy as np
import pandas as pd
from pydantic import Field

from auroris.curation.actions._base import BaseAction
from auroris.curation.actions._outlier import modified_zscore
Expand All @@ -15,9 +16,19 @@ def detect_streoisomer_activity_cliff(
dataset: pd.DataFrame,
stereoisomer_id_col: str,
y_cols: List[str],
threshold: float = 1.0,
threshold: float = 2.0,
prefix: str = "AC_",
):
) -> pd.DataFrame:
"""
Detect activity cliff among stereoisomers based on classification label or pre-defined threshold for continuous values.

Args:
dataset: Dataframe
stereoisomer_id_col: Column which identifies the stereoisomers
y_cols: List of columns for bioactivities
threshold: Threshold to identify the activity cliff. Currently, the difference of z-scores between isomers is used for identification.
prefix: Prefix for the columns that are added
"""
dataset_ori = dataset.copy(deep=True)
ac_cols = {y_col: [] for y_col in y_cols}
group_index_list = np.array(
Expand Down Expand Up @@ -51,14 +62,23 @@ def detect_streoisomer_activity_cliff(

class StereoIsomerACDetection(BaseAction):
"""
Automatic detection of outliers.
Automatic detection of activity shift between stereoisomers.

See [`auroris.curation.functional.detect_streoisomer_activity_cliff`][] for the docs of the
`stereoisomer_id_col`, `y_cols` and `threshold` attributes

Attributes:
mol_col: Column with the SMILES or RDKit Molecule objects.
If specified, will be used to render an image for the activity cliffs.
"""

stereoisomer_id_col: str
y_cols: List[str]
threshold: float = 2.0
name: Literal["ac_stereoisomer"] = "ac_stereoisomer"
prefix: str = "AC_"
mol_col: str = "MOL_smiles"

stereoisomer_id_col: str = "MOL_molhash_id_no_stereo"
y_cols: List[str] = Field(default_factory=list)
threshold: float = 2.0
mol_col: Optional[str] = "MOL_smiles"

def transform(
self,
Expand All @@ -75,27 +95,35 @@ def transform(
prefix=self.prefix,
)

# Log the following information to the report:
# - Newly added columns
# - Number of activity cliffs found
# - Image of the activity cliffs

if report is not None:
for col in self.y_cols:
col_with_prefix = self.get_column_name(col)
report.log_new_column(col_with_prefix)

has_cliff = dataset[col_with_prefix].notna()
has_cliff = dataset[col_with_prefix]
num_cliff = has_cliff.sum()

if num_cliff > 0:
report.log(
f"Found {num_cliff} activity cliffs among stereoisomers "
f"with respect to the {col} column."
)
to_plot = dataset.loc[has_cliff, self.mol_col]
legends = (col + dataset.loc[has_cliff, col].astype(str)).tolist()

image = dm.to_image([dm.to_mol(s) for s in to_plot], legends=legends, use_svg=False)
report.log_image(image)
if self.mol_col is not None:
to_plot = dataset.loc[has_cliff, self.mol_col]
legends = (col + dataset.loc[has_cliff, col].astype(str)).tolist()
image = dm.to_image([dm.to_mol(s) for s in to_plot], legends=legends, use_svg=False)
report.log_image(
image_or_figure=image, title="Detection of activity shifts among stereoisomers"
)

else:
report.log(
"Found no activity cliffs among stereoisomers with respect to the {col} column."
f"Found no activity cliffs among stereoisomers with respect to the {col} column."
)
return dataset
26 changes: 10 additions & 16 deletions auroris/curation/actions/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,24 @@
from auroris.report import CurationReport


ACTION_REGISTRY = []


class BaseAction(BaseModel, abc.ABC):
"""
An action in the curation process.

Args:
prefix: If the action adds columns, use this prefix.
completed: If the action has completed.
dep_action: Name of dependent action.
Info: The importance of reproducibility
One of the main goals in designing `auroris` is to make it easy to reproduce the curation process.
Reproducibility is key to scientific research. This is why a BaseAction needs to be serializable and
uniquely identified by a `name`.

Attributes:
name: The name that uniquely identifies the action. This is used to serialize and deserialize the action.
prefix: This prefix is used when an action adds columns to a dataset.
If not set, it defaults to the name in uppercase.
"""

name: str
prefix: str = None

@property
def name(self) -> str:
"""The name of the action. Needs to be unique."""
return self.__class__.__name__

@model_validator(mode="after")
@classmethod
def _validate_model(cls, m: "BaseAction"):
Expand All @@ -52,7 +50,3 @@ def transform(

def __call__(self, dataset: pd.DataFrame):
return self.transform(dataset)

def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
ACTION_REGISTRY.append(cls)
11 changes: 10 additions & 1 deletion auroris/curation/actions/_deduplicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,13 @@ def deduplicate(
class Deduplication(BaseAction):
"""
Automatic deduplication of molecules in the dataset.

See [`auroris.curation.functional.deduplicate`][] for the docs of the
`deduplicate_on`, `y_cols`, `keep` and `method` attributes
"""

name: Literal["deduplicate"] = "deduplicate"

deduplicate_on: Optional[Union[str, List[str]]] = None
y_cols: Optional[Union[str, List[str]]] = None
keep: Literal["first", "last"] = "first"
Expand All @@ -67,10 +72,14 @@ def transform(
verbosity: VerbosityLevel = VerbosityLevel.NORMAL,
parallelized_kwargs: Optional[Dict] = None,
):
return deduplicate(
dataset_dedup = deduplicate(
dataset,
deduplicate_on=self.deduplicate_on,
y_cols=self.y_cols,
keep=self.keep,
method=self.method,
)
if report is not None:
num_duplicates = len(dataset) - len(dataset_dedup)
report.log(f"Deduplication merged and removed {num_duplicates} duplicated molecules from dataset")
return dataset_dedup
Loading
Loading