diff --git a/.isort.cfg b/.isort.cfg
index 4399fe2..ec9f798 100644
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -6,4 +6,4 @@ force_grid_wrap=0
 combine_as_imports=True
 line_length=88
 known_first_party = emmental,tests
-known_third_party = numpy,scipy,setuptools,sklearn,torch,yaml
+known_third_party = numpy,pytest,scipy,setuptools,sklearn,torch,yaml
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 6d80c96..8f381b9 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,9 +1,15 @@
 Unreleased_
 -----------
 
-dded
+Added
 ^^^^^
 * `@senwu`_: Support probabilistic gold label in scorer.
+* `@senwu`_: Add `add_tasks` to support adding one or multiple tasks into the model.
+
+Changed
+^^^^^^^
+* `@senwu`_: Run evaluation only when it is triggered.
+
 
 0.0.5_ - 2020-03-01
 -------------------
diff --git a/src/emmental/learner.py b/src/emmental/learner.py
index e126607..6df324a 100644
--- a/src/emmental/learner.py
+++ b/src/emmental/learner.py
@@ -364,11 +364,12 @@ def _logging(
 
         self.logging_manager.update(batch_size)
 
+        trigger_evaluation = self.logging_manager.trigger_evaluation()
+
         # Log the loss and lr
-        metric_dict.update(self._aggregate_running_metrics(model))
+        metric_dict.update(self._aggregate_running_metrics(model, trigger_evaluation))
 
         # Evaluate the model and log the metric
-        trigger_evaluation = self.logging_manager.trigger_evaluation()
 
         if trigger_evaluation:
             # Log task specific metric
@@ -407,11 +408,14 @@ def _logging(
 
         return metric_dict
 
-    def _aggregate_running_metrics(self, model: EmmentalModel) -> Dict[str, float]:
+    def _aggregate_running_metrics(
+        self, model: EmmentalModel, calc_running_scores: bool = False
+    ) -> Dict[str, float]:
         r"""Calculate the running overall and task specific metrics.
 
         Args:
           model(EmmentalModel): The model to evaluate.
+          calc_running_scores(bool): Whether to calculate running scores.
 
         Returns:
           dict: The score dict.
@@ -435,36 +439,43 @@ def _aggregate_running_metrics(self, model: EmmentalModel) -> Dict[str, float]:
         total_loss = sum(self.running_losses.values())
         metric_dict["model/all/train/loss"] = total_loss / total_count
 
-        micro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
-        macro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
+        if calc_running_scores:
+            micro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
+            macro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
 
-        # Calculate training metric
-        for identifier in self.running_uids.keys():
-            task_name, data_name, split = identifier.split("/")
+            # Calculate training metric
+            for identifier in self.running_uids.keys():
+                task_name, data_name, split = identifier.split("/")
 
-            metric_score = model.scorers[task_name].score(
-                self.running_golds[identifier],
-                self.running_probs[identifier],
-                prob_to_pred(self.running_probs[identifier]),
-                self.running_uids[identifier],
-            )
-            for metric_name, metric_value in metric_score.items():
-                metric_dict[f"{identifier}/{metric_name}"] = metric_value
+                metric_score = model.scorers[task_name].score(
+                    self.running_golds[identifier],
+                    self.running_probs[identifier],
+                    prob_to_pred(self.running_probs[identifier]),
+                    self.running_uids[identifier],
+                )
+                for metric_name, metric_value in metric_score.items():
+                    metric_dict[f"{identifier}/{metric_name}"] = metric_value
 
-            # Collect average score
-            identifier = construct_identifier(task_name, data_name, split, "average")
+                # Collect average score
+                identifier = construct_identifier(
+                    task_name, data_name, split, "average"
+                )
 
-            metric_dict[identifier] = np.mean(list(metric_score.values()))
+                metric_dict[identifier] = np.mean(list(metric_score.values()))
 
-            micro_score_dict[split].extend(list(metric_score.values()))
-            macro_score_dict[split].append(metric_dict[identifier])
+                micro_score_dict[split].extend(list(metric_score.values()))
+                macro_score_dict[split].append(metric_dict[identifier])
 
-        # Collect split-wise micro/macro average score
-        for split in micro_score_dict.keys():
-            identifier = construct_identifier("model", "all", split, "micro_average")
-            metric_dict[identifier] = np.mean(micro_score_dict[split])
-            identifier = construct_identifier("model", "all", split, "macro_average")
-            metric_dict[identifier] = np.mean(macro_score_dict[split])
+            # Collect split-wise micro/macro average score
+            for split in micro_score_dict.keys():
+                identifier = construct_identifier(
+                    "model", "all", split, "micro_average"
+                )
+                metric_dict[identifier] = np.mean(micro_score_dict[split])
+                identifier = construct_identifier(
+                    "model", "all", split, "macro_average"
+                )
+                metric_dict[identifier] = np.mean(macro_score_dict[split])
 
         # Log the learning rate
         metric_dict["model/all/train/lr"] = self.optimizer.param_groups[0]["lr"]
@@ -539,7 +550,6 @@ def learn(
             )
 
             for batch_num, batch in batches:
-
                 # Covert single batch into a batch list
                 if not isinstance(batch, list):
                     batch = [batch]
diff --git a/src/emmental/metrics/pearson_correlation.py b/src/emmental/metrics/pearson_correlation.py
index 77c08a9..8f4d17d 100644
--- a/src/emmental/metrics/pearson_correlation.py
+++ b/src/emmental/metrics/pearson_correlation.py
@@ -28,9 +28,6 @@ def pearson_correlation_scorer(
     probs = np.vstack(probs).squeeze()
 
     correlation, pvalue = pearsonr(golds, probs)
-    if np.isnan(correlation):
-        correlation = 0.0
-        pvalue = 0.0
 
     if return_pvalue:
         return {"pearson_correlation": correlation, "pearson_pvalue": pvalue}
diff --git a/src/emmental/metrics/roc_auc.py b/src/emmental/metrics/roc_auc.py
index c0eaa34..2c2399e 100644
--- a/src/emmental/metrics/roc_auc.py
+++ b/src/emmental/metrics/roc_auc.py
@@ -29,10 +29,20 @@ def roc_auc_scorer(
 
     """
 
-    if len(golds.shape) == 1:
-        golds = pred_to_prob(golds, n_classes=probs.shape[1])
+    if len(probs.shape) == 2 and probs.shape[1] == 1:
+        probs = probs.reshape(probs.shape[0])
+
+    if len(golds.shape) == 2 and golds.shape[1] == 1:
+        golds = golds.reshape(golds.shape[0])
+
+    if len(probs.shape) > 1:
+        if len(golds.shape) > 1:
+            golds = pred_to_prob(prob_to_pred(golds), n_classes=probs.shape[1])
+        else:
+            golds = pred_to_prob(golds, n_classes=probs.shape[1])
     else:
-        golds = pred_to_prob(prob_to_pred(golds), n_classes=probs.shape[1])
+        if len(golds.shape) > 1:
+            golds = prob_to_pred(golds)
 
     try:
         roc_auc = roc_auc_score(golds, probs)
diff --git a/src/emmental/metrics/spearman_correlation.py b/src/emmental/metrics/spearman_correlation.py
index 9e4f5bf..5c004ce 100644
--- a/src/emmental/metrics/spearman_correlation.py
+++ b/src/emmental/metrics/spearman_correlation.py
@@ -29,9 +29,6 @@ def spearman_correlation_scorer(
     probs = np.vstack(probs).squeeze()
 
     correlation, pvalue = spearmanr(golds, probs)
-    if np.isnan(correlation):
-        correlation = 0.0
-        pvalue = 0.0
 
     if return_pvalue:
         return {"spearman_correlation": correlation, "spearman_pvalue": pvalue}
diff --git a/src/emmental/model.py b/src/emmental/model.py
index 9aca97b..df67d88 100644
--- a/src/emmental/model.py
+++ b/src/emmental/model.py
@@ -49,7 +49,7 @@ def __init__(
 
         # Build network with given tasks
         if tasks is not None:
-            self._build_network(tasks)
+            self.add_tasks(tasks)
 
         if Meta.config["meta_config"]["verbose"]:
             logger.info(
@@ -75,7 +75,7 @@ def _move_to_device(self) -> None:
             if Meta.config["meta_config"]["verbose"]:
                 logger.info("No cuda device available. Switch to cpu instead.")
 
-    def _build_network(self, tasks: Union[EmmentalTask, List[EmmentalTask]]) -> None:
+    def add_tasks(self, tasks: Union[EmmentalTask, List[EmmentalTask]]) -> None:
         r"""Build the MTL network using all tasks.
 
         Args:
@@ -86,13 +86,6 @@ def _build_network(self, tasks: Union[EmmentalTask, List[EmmentalTask]]) -> None:
         if not isinstance(tasks, Iterable):
             tasks = [tasks]
         for task in tasks:
-            if task.name in self.task_names:
-                raise ValueError(
-                    f"Found duplicate task {task.name}, different task should use "
-                    f"different task name."
-                )
-            if not isinstance(task, EmmentalTask):
-                raise ValueError(f"Unrecognized task type {task}.")
             self.add_task(task)
 
     def add_task(self, task: EmmentalTask) -> None:
@@ -102,6 +95,14 @@ def add_task(self, task: EmmentalTask) -> None:
         r"""Add a task into MTL network.
         Args:
           task(EmmentalTask): A task to add.
         """
+        if not isinstance(task, EmmentalTask):
+            raise ValueError(f"Unrecognized task type {task}.")
+
+        if task.name in self.task_names:
+            raise ValueError(
+                f"Found duplicate task {task.name}, different task should use "
+                f"different task name."
+            )
         # Combine module_pool from all tasks
         for key in task.module_pool.keys():
diff --git a/tests/data/test_data.py b/tests/data/test_data.py
index c8b04d6..887773e 100644
--- a/tests/data/test_data.py
+++ b/tests/data/test_data.py
@@ -1,5 +1,6 @@
 import logging
 
+import pytest
 import torch
 
 from emmental.data import EmmentalDataLoader, EmmentalDataset
@@ -38,6 +39,11 @@ def test_emmental_dataset(caplog):
 
     dataset.add_features(X_dict={"data2": x2})
 
+    dataset.remove_feature("data2")
+    assert "data2" not in dataset.X_dict
+
+    dataset.add_features(X_dict={"data2": x2})
+
     # Check add one more feature to dataset
     assert torch.equal(dataset[0][0]["data2"], x2[0])
 
@@ -45,6 +51,9 @@ def test_emmental_dataset(caplog):
 
     dataset.add_labels(Y_dict={"label2": y2})
 
+    with pytest.raises(ValueError):
+        dataset.add_labels(Y_dict={"label2": x2})
+
     # Check add one more label to dataset
     assert torch.equal(dataset[0][1]["label2"], y2[0])
 
@@ -53,6 +62,25 @@ def test_emmental_dataset(caplog):
     # Check remove one more label to dataset
     assert "label1" not in dataset.Y_dict
 
+    with pytest.raises(ValueError):
+        dataset = EmmentalDataset(
+            X_dict={"data1": x1}, Y_dict={"label1": y1}, name="new_data", uid="ids"
+        )
+
+    dataset = EmmentalDataset(
+        X_dict={"_uids_": x1}, Y_dict={"label1": y1}, name="new_data"
+    )
+
+    with pytest.raises(ValueError):
+        dataset = EmmentalDataset(
+            X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
+        )
+
+    with pytest.raises(ValueError):
+        dataset = EmmentalDataset(
+            X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
+        )
+
 
 def test_emmental_dataloader(caplog):
     """Unit test of emmental dataloader"""
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index e2e7873..3bfae0d 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -52,6 +52,14 @@ def test_accuracy(caplog):
 
     assert isequal(metric_dict, {"accuracy@2": 1.0})
 
+    metric_dict = accuracy_scorer(golds, None, preds, normalize=False)
+
+    assert isequal(metric_dict, {"accuracy": 4})
+
+    metric_dict = accuracy_scorer(gold_probs, probs, preds, topk=2, normalize=False)
+
+    assert isequal(metric_dict, {"accuracy@2": 6})
+
 
 def test_precision(caplog):
     """Unit test of precision_scorer"""
@@ -238,27 +246,36 @@ def test_roc_auc(caplog):
 
     caplog.set_level(logging.INFO)
 
-    golds = np.array([1, 0, 1, 0, 1, 0])
+    golds = np.array([[1], [0], [1], [0], [1], [0]])
     gold_probs = np.array(
         [[0.4, 0.6], [0.9, 0.1], [0.3, 0.7], [0.8, 0.2], [0.1, 0.9], [0.6, 0.4]]
     )
     probs = np.array(
         [[0.2, 0.8], [0.4, 0.6], [0.1, 0.9], [0.3, 0.7], [0.3, 0.7], [0.8, 0.2]]
     )
+    preds = np.array([[0.8], [0.6], [0.9], [0.7], [0.7], [0.2]])
 
     metric_dict = roc_auc_scorer(golds, probs, None)
 
     assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})
 
-    golds = np.array([1, 1, 1, 1, 1, 1])
+    metric_dict = roc_auc_scorer(gold_probs, probs, None)
 
-    metric_dict = roc_auc_scorer(golds, probs, None)
-    assert isequal(metric_dict, {"roc_auc": float("nan")})
+    assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})
 
-    metric_dict = roc_auc_scorer(gold_probs, probs, None)
+    metric_dict = roc_auc_scorer(golds, preds, None)
 
     assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})
 
+    metric_dict = roc_auc_scorer(gold_probs, preds, None)
+
+    assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})
+
+    golds = np.array([1, 1, 1, 1, 1, 1])
+
+    metric_dict = roc_auc_scorer(golds, probs, None)
+    assert isequal(metric_dict, {"roc_auc": float("nan")})
+
 
 def test_accuracy_f1(caplog):
     """Unit test of accuracy_f1_scorer"""
diff --git a/tests/optimizers/test_bert_adam_optimizer.py b/tests/optimizers/test_bert_adam_optimizer.py
index 133f6ea..21fad62 100644
--- a/tests/optimizers/test_bert_adam_optimizer.py
+++ b/tests/optimizers/test_bert_adam_optimizer.py
@@ -1,6 +1,7 @@
 import logging
 import shutil
 
+import pytest
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -64,4 +65,64 @@ def test_bert_adam_optimizer(caplog):
     F.mse_loss(model(torch.randn(1, 1)), torch.randn(1, 1)).backward()
     emmental_learner.optimizer.step()
 
+    # Test wrong lr
+    with pytest.raises(ValueError):
+        config = {
+            "learner_config": {
+                "optimizer_config": {
+                    "optimizer": optimizer,
+                    "lr": -0.1,
+                    "l2": 0.05,
+                    f"{optimizer}_config": {"betas": (0.8, 0.9), "eps": 1e-05},
+                }
+            }
+        }
+        emmental.Meta.update_config(config)
+        emmental_learner._set_optimizer(model)
+
+    # Test wrong eps
+    with pytest.raises(ValueError):
+        config = {
+            "learner_config": {
+                "optimizer_config": {
+                    "optimizer": optimizer,
+                    "lr": 0.1,
+                    "l2": 0.05,
+                    f"{optimizer}_config": {"betas": (0.8, 0.9), "eps": -1e-05},
+                }
+            }
+        }
+        emmental.Meta.update_config(config)
+        emmental_learner._set_optimizer(model)
+
+    # Test wrong betas
+    with pytest.raises(ValueError):
+        config = {
+            "learner_config": {
+                "optimizer_config": {
+                    "optimizer": optimizer,
+                    "lr": 0.1,
+                    "l2": 0.05,
+                    f"{optimizer}_config": {"betas": (-0.8, 0.9), "eps": 1e-05},
+                }
+            }
+        }
+        emmental.Meta.update_config(config)
+        emmental_learner._set_optimizer(model)
+
+    # Test wrong betas
+    with pytest.raises(ValueError):
+        config = {
+            "learner_config": {
+                "optimizer_config": {
+                    "optimizer": optimizer,
+                    "lr": 0.1,
+                    "l2": 0.05,
+                    f"{optimizer}_config": {"betas": (0.8, -0.9), "eps": 1e-05},
+                }
+            }
+        }
+        emmental.Meta.update_config(config)
+        emmental_learner._set_optimizer(model)
+
     shutil.rmtree(dirpath)
diff --git a/tests/test_model.py b/tests/test_model.py
index c4b0f0f..49bf1c7 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -141,4 +141,10 @@ def output(module_name, immediate_output_dict):
 
     model.load(f"{dirpath}/saved_model.pth")
 
+    # Test add_tasks
+    model = EmmentalModel(name="test")
+
+    model.add_tasks([task1, task2])
+    assert model.task_names == set(["task_1", "task_2"])
+
     shutil.rmtree(dirpath)