Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Several bug fixes and improved documentation #4

Merged
merged 40 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
4b04c1c
add ipython image
zhu0619 May 8, 2024
9096b30
minor fix
zhu0619 May 8, 2024
42b833c
allow image export to remote path
zhu0619 May 9, 2024
f1f3abb
minor changes
zhu0619 May 9, 2024
9d365ae
change loglevel
zhu0619 May 15, 2024
2ab83d7
update chemspace viz
zhu0619 May 15, 2024
8412c9a
add dup logger
zhu0619 May 15, 2024
041cc7a
simplify distribution viz
zhu0619 May 15, 2024
d1f41de
update outlier logs
zhu0619 May 15, 2024
f841f12
minor changes
zhu0619 May 15, 2024
1f25fa5
fix serialization
zhu0619 May 16, 2024
a9669a6
add more docstrings
zhu0619 May 16, 2024
582070a
fix docstring and extension
zhu0619 May 17, 2024
021c11e
update css
zhu0619 May 17, 2024
d73f685
minor change
zhu0619 May 17, 2024
085da1c
minor fix
zhu0619 May 17, 2024
8454d5e
add dep
zhu0619 May 17, 2024
e13c677
Merge branch 'main' into fix/minor
zhu0619 May 17, 2024
9e75e01
Update auroris/curation/_curator.py
zhu0619 May 17, 2024
53ad1da
Update auroris/curation/actions/_mol.py
zhu0619 May 17, 2024
ecdf785
Update docs/index.md
zhu0619 May 17, 2024
33569eb
Update auroris/visualization/_distribution.py
zhu0619 May 17, 2024
83cf949
Update docs/index.md
zhu0619 May 17, 2024
26f0a6f
update test_curator_save_load
zhu0619 May 17, 2024
3a3b5e8
minor fix
zhu0619 May 17, 2024
78c731d
Update auroris/report/broadcaster/_logger.py
zhu0619 May 17, 2024
ed0c7f6
curator save/load
zhu0619 May 22, 2024
499eb26
avoid ipython image
zhu0619 May 22, 2024
d901238
minor viz fix
zhu0619 May 22, 2024
97c9bd3
wip
zhu0619 May 22, 2024
ed7a16e
refactor image name
zhu0619 May 22, 2024
e0699c5
Update docs/index.md
zhu0619 May 22, 2024
b417935
Update auroris/utils.py
zhu0619 May 22, 2024
c50af5b
fix attribute discriminator
zhu0619 May 22, 2024
4ee0270
fix report
zhu0619 May 22, 2024
b6757ca
minor fix
zhu0619 May 22, 2024
4b2bae8
remove gcp
zhu0619 May 22, 2024
5ec838a
update tutorial
zhu0619 May 23, 2024
6d836ff
add pyarrow dep
zhu0619 May 23, 2024
becae34
Update the documentation (#5)
cwognum May 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions auroris/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@
import pandas as pd
import typer

from typing import Optional
from auroris.curation import Curator
from auroris.report.broadcaster import HTMLBroadcaster

app = typer.Typer()


@app.command()
def curate(config_path: str, dataset_path: str, destination: str, overwrite: bool = False):
def curate(config_path: str, destination: str, dataset_path: Optional[str] = None, overwrite: bool = False):
# Load data
dataset = pd.read_csv(dataset_path)
dataset = pd.read_csv(dataset_path) if dataset_path else None
curator = Curator.from_json(config_path)

# Run curation
Expand Down
55 changes: 49 additions & 6 deletions auroris/curation/_curator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
from typing import List, Tuple, Union
from typing import List, Tuple, Union, Optional

from os import PathLike
import fsspec
import pandas as pd
from loguru import logger
Expand All @@ -15,14 +16,23 @@ class Curator(BaseModel):
"""
A curator is a collection of actions that are applied to a dataset.
Can be serialized.

"""

# To know which Action object to create, we need a discriminated union.
# This is the recommended way to add all subclasses in the type.
# See e.g. https://github.com/pydantic/pydantic/issues/2200
# and https://github.com/pydantic/pydantic/issues/2036
steps: List[Union[tuple(ACTION_REGISTRY)]] = Field(..., discriminator="name") # type: ignore

data_path: Optional[Union[str, PathLike]] = Field(
cwognum marked this conversation as resolved.
Show resolved Hide resolved
default=None,
description="Data path. The data must be loadable by `pd.read_csv` with default parameters.",
)

steps: List[Union[tuple(ACTION_REGISTRY)]] = Field(
...,
discriminator="name",
description="List of curation actions. Check all the available action <auroris.curation.actions.__all__>.",
cwognum marked this conversation as resolved.
Show resolved Hide resolved
)
verbosity: VerbosityLevel = VerbosityLevel.NORMAL
parallelized_kwargs: dict = Field(default_factory=dict)

Expand All @@ -36,9 +46,26 @@ def _validate_verbosity(cls, v):
def _serialize_verbosity(self, value: VerbosityLevel):
return value.name

def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport]:
@field_validator("data_path", mode="before")
def _validate_data_path(cls, value: Union[str, PathLike]):
    """Validate that ``data_path`` points to a CSV file pandas can parse.

    Only a small sample (5 rows) is read to keep validation cheap; the
    full load happens later in ``_load_data``.

    Args:
        value: Path (local or fsspec-style remote) to the dataset.

    Returns:
        The validated path, unchanged.

    Raises:
        ValueError: If the file cannot be read by ``pd.read_csv``.
    """
    try:
        # Read only the first few rows as a cheap sanity check.
        pd.read_csv(value, nrows=5)
        return value
    except Exception as err:
        # Chain the original error so the root cause stays visible.
        raise ValueError(
            f"Dataset cannot be loaded by `pandas.read_csv('{value}')`. "
            f"Consider passing the loaded data directly to `Curator.transform()`."
        ) from err

def _load_data(self):
return pd.read_csv(self.data_path)

def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, CurationReport]:
if dataset is None:
dataset = self._load_data()
cwognum marked this conversation as resolved.
Show resolved Hide resolved

report = CurationReport()
dataset = dataset.copy(deep=True)
dataset = dataset.copy()
cwognum marked this conversation as resolved.
Show resolved Hide resolved

action: BaseAction
for action in self.steps:
Expand All @@ -57,6 +84,13 @@ def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport
def __call__(self, dataset):
return self.transform(dataset)

@classmethod
def _get_action(cls, name: str):
    """Look up an action class in ``ACTION_REGISTRY`` by its class name.

    Args:
        name: The ``__name__`` of the action class to find.

    Returns:
        The first matching action class, or ``None`` if no class matches.
    """
    return next((action for action in ACTION_REGISTRY if action.__name__ == name), None)
cwognum marked this conversation as resolved.
Show resolved Hide resolved

@classmethod
def from_json(cls, path: str):
"""Loads a curation workflow from a JSON file.
Expand All @@ -66,6 +100,9 @@ def from_json(cls, path: str):
"""
with fsspec.open(path, "r") as f:
data = json.load(f)

steps = [cls._get_action(name)(**args) for step in data["steps"] for name, args in step.items()]
data["steps"] = steps
cwognum marked this conversation as resolved.
Show resolved Hide resolved
return cls.model_validate(data)

def to_json(self, path: str):
Expand All @@ -74,6 +111,12 @@ def to_json(self, path: str):
Args:
path: The destination to save to
"""
serialization = self.model_dump(exclude="steps")
# remove data_path
if self.data_path is None:
serialization.pop("data_path")
# save steps in defined order
serialization["steps"] = [{step.name: step.model_dump()} for step in self.steps]
cwognum marked this conversation as resolved.
Show resolved Hide resolved
with fsspec.open(path, "w") as f:
json.dump(self.model_dump(), f)
json.dump(serialization, f)
return path
41 changes: 30 additions & 11 deletions auroris/curation/actions/_ac_stereoisomer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from typing import Dict, List, Optional
from pydantic import Field

import datamol as dm
import numpy as np
import pandas as pd


from auroris.curation.actions._base import BaseAction
from auroris.curation.actions._outlier import modified_zscore
from auroris.report import CurationReport
Expand All @@ -15,9 +17,19 @@ def detect_streoisomer_activity_cliff(
dataset: pd.DataFrame,
stereoisomer_id_col: str,
y_cols: List[str],
threshold: float = 1.0,
threshold: float = 2.0,
prefix: str = "AC_",
):
) -> pd.DataFrame:
"""
Detect activity cliff among stereoisomers based on classification label or pre-defined threshold for continuous values.

Args:
dataset: Dataframe
stereoisomer_id_col: Column which identifies the stereoisomers
y_cols: List of columns for bioactivities
threshold: Threshold to identify the activity cliff. Currently, the difference of zscores between isomers are used for identification.
prefix: Prefix for the adding columns
"""
dataset_ori = dataset.copy(deep=True)
ac_cols = {y_col: [] for y_col in y_cols}
group_index_list = np.array(
Expand Down Expand Up @@ -51,14 +63,19 @@ def detect_streoisomer_activity_cliff(

class StereoIsomerACDetection(BaseAction):
"""
Automatic detection of outliers.
Automatic detection of activity shift between stereoisomers.
"""

stereoisomer_id_col: str
y_cols: List[str]
threshold: float = 2.0
prefix: str = "AC_"
mol_col: str = "MOL_smiles"
stereoisomer_id_col: str = Field(
default="MOL_molhash_id_no_stereo", description="Column which identifies the stereoisomers."
)
y_cols: List[str] = Field(..., description="List of columns for bioactivities.")
threshold: float = Field(
default=2.0,
description=" Threshold to identify the activity cliff. Currently, the difference of zscores between isomers are used for identification.",
)
prefix: str = Field(default="AC_", description="Prefix for the adding columns.")
mol_col: str = Field(default="MOL_smiles", description="Column for molecule strings.")

def transform(
self,
Expand All @@ -80,7 +97,7 @@ def transform(
col_with_prefix = self.get_column_name(col)
report.log_new_column(col_with_prefix)

has_cliff = dataset[col_with_prefix].notna()
has_cliff = dataset[col_with_prefix].__eq__(True)
cwognum marked this conversation as resolved.
Show resolved Hide resolved
num_cliff = has_cliff.sum()

if num_cliff > 0:
Expand All @@ -92,10 +109,12 @@ def transform(
legends = (col + dataset.loc[has_cliff, col].astype(str)).tolist()

image = dm.to_image([dm.to_mol(s) for s in to_plot], legends=legends, use_svg=False)
report.log_image(image)
report.log_image(
image_or_figure=image, title="Detection of activity shifts among stereoisomers"
)

else:
report.log(
"Found no activity cliffs among stereoisomers with respect to the {col} column."
f"Found no activity cliffs among stereoisomers with respect to the {col} column."
)
return dataset
9 changes: 2 additions & 7 deletions auroris/curation/actions/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import TYPE_CHECKING, Dict, Optional

import pandas as pd
from pydantic import BaseModel, model_validator
from pydantic import BaseModel, model_validator, Field

from auroris.types import VerbosityLevel

Expand All @@ -16,14 +16,9 @@
class BaseAction(BaseModel, abc.ABC):
"""
An action in the curation process.

Args:
prefix: If the action adds columns, use this prefix.
completed: If the action has completed.
dep_action: Name of dependent action.
"""

prefix: str = None
prefix: str = Field(default=None, description="If the action adds columns, use this prefix.")

@property
def name(self) -> str:
Expand Down
21 changes: 16 additions & 5 deletions auroris/curation/actions/_deduplicate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Dict, List, Literal, Optional, Union
from pydantic import Field

import pandas as pd

Expand Down Expand Up @@ -55,10 +56,16 @@ class Deduplication(BaseAction):
Automatic detection of outliers.
"""

deduplicate_on: Optional[Union[str, List[str]]] = None
y_cols: Optional[Union[str, List[str]]] = None
keep: Literal["first", "last"] = "first"
method: Literal["mean", "median"] = "median"
deduplicate_on: Optional[Union[str, List[str]]] = Field(
default=None, description="A subset of the columns to deduplicate on (can be default)."
)
y_cols: Optional[Union[str, List[str]]] = Field(default=None, description="The columns to aggregate.")
keep: Literal["first", "last"] = Field(
default="first", description="Whether to keep the first or last copy of the duplicates."
)
method: Literal["mean", "median"] = Field(
default="median", description="The method to aggregate the data."
)

def transform(
self,
Expand All @@ -67,10 +74,14 @@ def transform(
verbosity: VerbosityLevel = VerbosityLevel.NORMAL,
parallelized_kwargs: Optional[Dict] = None,
):
return deduplicate(
dataset_dedup = deduplicate(
dataset,
deduplicate_on=self.deduplicate_on,
y_cols=self.y_cols,
keep=self.keep,
method=self.method,
)
if report is not None:
num_duplicates = dataset.shape[0] - dataset_dedup.shape[0]
cwognum marked this conversation as resolved.
Show resolved Hide resolved
report.log(f"Deduplication merged and removed {num_duplicates} duplicated molecules from dataset")
return dataset_dedup
42 changes: 35 additions & 7 deletions auroris/curation/actions/_discretize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Dict, List, Literal, Optional, Union
from pydantic import Field

import numpy as np
import pandas as pd
Expand All @@ -17,7 +18,8 @@ def discretize(
allow_nan: bool = True,
label_order: Literal["ascending", "descending"] = "ascending",
) -> np.ndarray:
"""Thresholding of array-like or scipy.sparse matrix into binary or multiclass labels.
"""
Thresholding of array-like or scipy.sparse matrix into binary or multiclass labels.

Args:
X : The data to discretize, element by element.
Expand Down Expand Up @@ -76,13 +78,39 @@ def discretize(


class Discretization(BaseAction):
input_column: str
"""
Thresholding bioactivity columns to binary or multiclass labels.
"""

input_column: str = Field(..., description="Column to be discretized.")
prefix: str = "CLS_"
thresholds: List[float]
inplace: bool = False
allow_nan: bool = True
label_order: Literal["ascending", "descending"] = "ascending"
log_scale: bool = True
thresholds: List[float] = Field(..., description="Interval boundaries that include the right bin edge.")
inplace: bool = Field(
default=False,
description="""Set to True to perform inplace discretization and avoid a copy
(if the input is already a numpy array or a scipy.sparse CSR / CSC
matrix and if axis is 1).""",
)
allow_nan: bool = Field(
default=True,
description="Set to True to allow nans in the array for discretization. Otherwise, an error will be raised instead.",
)
label_order: Literal["ascending", "descending"] = Field(
default="ascending",
description="""The continuous values are discretized to labels 0, 1, 2, .., N with respect to given
threshold bins [threshold_1, threshold_2,.., threshould_n].
When set to 'ascending', the class label is in ascending order with the threshold
bins that `0` represents negative class or lower class, while 1, 2, 3 are for higher classes.
When set to 'descending' the class label is in ascending order with the threshold bins.
Sometimes the positive labels are on the left side of provided threshold.
E.g. For binarization with threshold [0.5], the positive label is defined
by`X < 0.5`. In this case, `label_order` should be `descending`.""",
)
log_scale: bool = Field(
default=False,
description="""Whether visualize distribution in log scale.
See more in <auroris.visualization.visualize_continuous_distribution>""",
)

def transform(
self,
Expand Down
19 changes: 11 additions & 8 deletions auroris/curation/actions/_distribution.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Dict, List, Optional

import pandas as pd
from typing import Dict, List, Optional, Sequence
from pydantic import Field
import pandas as pd

from auroris.curation.actions._base import BaseAction
from auroris.report import CurationReport
Expand All @@ -11,12 +10,16 @@

class ContinuousDistributionVisualization(BaseAction):
"""
Visualize a continuous distribution
Visualize a continuous distribution.
"""

y_cols: Optional[List[str]] = None
log_scale: bool = False
kwargs: Dict = Field(default_factory=dict)
y_cols: Optional[List[str]] = Field(
default=None, description="List of columns for bioactivity for visualization."
)
log_scale: bool = Field(default=False, description="Whether visualize distribution in log scale.")
bins: Optional[Sequence[float]] = Field(
default=None, description="The bin boundaries to color the area under the KDE curve."
)

def transform(
self,
Expand All @@ -28,7 +31,7 @@ def transform(
if report is not None:
for y_col in self.y_cols:
fig = visualize_continuous_distribution(
data=dataset[y_col], label_name=y_col, log_scale=self.log_scale
data=dataset[y_col], log_scale=self.log_scale, bins=self.bins
)
report.log_image(fig, title=f"Data distribution - {y_col}")

Expand Down
Loading
Loading