make release-tag: Merge branch 'main' into stable

sdv-dev · Oct 5, 2023 · 08a463a · 08a463a
2 parents d3d3b7c + 43803a6
commit 08a463a
Show file tree

Hide file tree

Showing 15 changed files with 242 additions and 342 deletions.
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
@@ -174,17 +174,17 @@ Release Workflow
 The process of releasing a new version involves several steps combining both ``git`` and
 ``bumpversion`` which, briefly:
 
-1. Merge what is in ``master`` branch into ``stable`` branch.
+1. Merge what is in ``main`` branch into ``stable`` branch.
 2. Update the version in ``setup.cfg``, ``ctgan/__init__.py`` and
    ``HISTORY.md`` files.
 3. Create a new git tag pointing at the corresponding commit in ``stable`` branch.
-4. Merge the new commit from ``stable`` into ``master``.
+4. Merge the new commit from ``stable`` into ``main``.
 5. Update the version in ``setup.cfg`` and ``ctgan/__init__.py``
    to open the next development iteration.
 
 .. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new
           entry that explains the changes that will be included in the new version.
-          Normally this is just a list of the Pull Requests that have been merged to master
+          Normally this is just a list of the Pull Requests that have been merged to main
           since the last release.
 
 Once this is done, run one of the following commands:

diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,19 @@
 # History
 
+## v0.7.5 - 2023-10-05
+
+This release adds a progress bar that will show when setting the `verbose` parameter to True when initializing `CTGAN`. It also removes a warning that was showing.
+
+### Maintenance
+
+* Remove model_missing_values from ClusterBasedNormalizer call - PR [#310](https://github.com/sdv-dev/CTGAN/pull/310) by @fealho
+* Switch default branch from master to main - Issue [#311](https://github.com/sdv-dev/CTGAN/issues/311) by @amontanez24
+* Remove or implement CTGAN tests - Issue [#312](https://github.com/sdv-dev/CTGAN/issues/312) by @fealho
+
+### New Features
+
+* Add progress bar for CTGAN fitting (+ save the loss values) - Issue [#298](https://github.com/sdv-dev/CTGAN/issues/298) by @frances-h
+
 ## v0.7.4 - 2023-07-25
 
 This release adds support for Python 3.11 and drops support for Python 3.7.

diff --git a/Makefile b/Makefile
@@ -158,22 +158,22 @@ publish: dist publish-confirm ## package and upload a release
 	twine upload dist/*
 
 .PHONY: bumpversion-release
-bumpversion-release: ## Merge master to stable and bumpversion release
+bumpversion-release: ## Merge main to stable and bumpversion release
 	git checkout stable || git checkout -b stable
-	git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable"
+	git merge --no-ff main -m"make release-tag: Merge branch 'main' into stable"
 	bumpversion release
 	git push --tags origin stable
 
 .PHONY: bumpversion-release-test
-bumpversion-release-test: ## Merge master to stable and bumpversion release
+bumpversion-release-test: ## Merge main to stable and bumpversion release
 	git checkout stable || git checkout -b stable
-	git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable"
+	git merge --no-ff main -m"make release-tag: Merge branch 'main' into stable"
 	bumpversion release --no-tag
 	@echo git push --tags origin stable
 
 .PHONY: bumpversion-patch
-bumpversion-patch: ## Merge stable to master and bumpversion patch
-	git checkout master
+bumpversion-patch: ## Merge stable to main and bumpversion patch
+	git checkout main
 	git merge stable
 	bumpversion --no-tag patch
 	git push
@@ -192,7 +192,7 @@ bumpversion-major: ## Bump the version the next major skipping the release
 
 .PHONY: bumpversion-revert
 bumpversion-revert: ## Undo a previous bumpversion-release
-	git checkout master
+	git checkout main
 	git branch -D stable
 
 CLEAN_DIR := $(shell git status --short | grep -v ??)
@@ -205,10 +205,10 @@ ifneq ($(CLEAN_DIR),)
 	$(error There are uncommitted changes)
 endif
 
-.PHONY: check-master
-check-master: ## Check if we are in master branch
-ifneq ($(CURRENT_BRANCH),master)
-	$(error Please make the release from master branch\n)
+.PHONY: check-main
+check-main: ## Check if we are in main branch
+ifneq ($(CURRENT_BRANCH),main)
+	$(error Please make the release from main branch\n)
 endif
 
 .PHONY: check-history
@@ -218,7 +218,7 @@ ifeq ($(CHANGELOG_LINES),0)
 endif
 
 .PHONY: check-release
-check-release: check-clean check-master check-history ## Check if the release can be made
+check-release: check-clean check-main check-history ## Check if the release can be made
 	@echo "A new release can be made"
 
 .PHONY: release
@@ -228,10 +228,10 @@ release: check-release bumpversion-release publish bumpversion-patch
 release-test: check-release bumpversion-release-test publish-test bumpversion-revert
 
 .PHONY: release-candidate
-release-candidate: check-master publish bumpversion-candidate
+release-candidate: check-main publish bumpversion-candidate
 
 .PHONY: release-candidate-test
-release-candidate-test: check-clean check-master publish-test
+release-candidate-test: check-clean check-main publish-test
 
 .PHONY: release-minor
 release-minor: check-release bumpversion-minor release

diff --git a/README.md b/README.md
@@ -8,13 +8,13 @@
 [![PyPI Shield](https://img.shields.io/pypi/v/ctgan.svg)](https://pypi.python.org/pypi/ctgan)
 [![Unit Tests](https://github.com/sdv-dev/CTGAN/actions/workflows/unit.yml/badge.svg)](https://github.com/sdv-dev/CTGAN/actions/workflows/unit.yml)
 [![Downloads](https://pepy.tech/badge/ctgan)](https://pepy.tech/project/ctgan)
-[![Coverage Status](https://codecov.io/gh/sdv-dev/CTGAN/branch/master/graph/badge.svg)](https://codecov.io/gh/sdv-dev/CTGAN)
+[![Coverage Status](https://codecov.io/gh/sdv-dev/CTGAN/branch/main/graph/badge.svg)](https://codecov.io/gh/sdv-dev/CTGAN)
 
 <div align="left">
 <br/>
 <p align="center">
 <a href="https://github.com/sdv-dev/CTGAN">
-<img align="center" width=40% src="https://github.com/sdv-dev/SDV/blob/master/docs/images/CTGAN-DataCebo.png"></img>
+<img align="center" width=40% src="https://github.com/sdv-dev/SDV/blob/stable/docs/images/CTGAN-DataCebo.png"></img>
 </a>
 </p>
 </div>
@@ -38,9 +38,9 @@ CTGAN is a collection of Deep Learning based synthetic data generators for s
 [Blog]: https://datacebo.com/blog
 [Documentation]: https://bit.ly/sdv-docs
 [Repository]: https://github.com/sdv-dev/CTGAN
-[License]: https://github.com/sdv-dev/CTGAN/blob/master/LICENSE
+[License]: https://github.com/sdv-dev/CTGAN/blob/main/LICENSE
 [Development Status]: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha
-[Slack Logo]: https://github.com/sdv-dev/SDV/blob/master/docs/images/slack.png
+[Slack Logo]: https://github.com/sdv-dev/SDV/blob/stable/docs/images/slack.png
 [Community]: https://bit.ly/sdv-slack-invite
 
 Currently, this library implements the **CTGAN** and **TVAE** models described in the [Modeling Tabular data using Conditional GAN](https://arxiv.org/abs/1907.00503) paper, presented at the 2019 NeurIPS conference.
@@ -141,7 +141,7 @@ More details can be found in the corresponding repository: https://github.com/ka
 
 
 <div align="center">
-<a href="https://datacebo.com"><img align="center" width=40% src="https://github.com/sdv-dev/SDV/blob/master/docs/images/DataCebo.png"></img></a>
+<a href="https://datacebo.com"><img align="center" width=40% src="https://github.com/sdv-dev/SDV/blob/stable/docs/images/DataCebo.png"></img></a>
 </div>
 <br/>
 <br/>

diff --git a/ctgan/__init__.py b/ctgan/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'DataCebo, Inc.'
 __email__ = 'info@sdv.dev'
-__version__ = '0.7.4'
+__version__ = '0.7.5.dev1'
 
 from ctgan.demo import load_demo
 from ctgan.synthesizers.ctgan import CTGAN

diff --git a/ctgan/data_transformer.py b/ctgan/data_transformer.py
@@ -18,8 +18,8 @@
 class DataTransformer(object):
     """Data Transformer.
 
-    Model continuous columns with a BayesianGMM and normalized to a scalar [0, 1] and a vector.
-    Discrete columns are encoded using a scikit-learn OneHotEncoder.
+    Model continuous columns with a BayesianGMM and normalize them to a scalar in the range [-1, 1]
+    and a vector. Discrete columns are encoded using a OneHotEncoder.
     """
 
     def __init__(self, max_clusters=10, weight_threshold=0.005):
@@ -46,7 +46,11 @@ def _fit_continuous(self, data):
                 A ``ColumnTransformInfo`` object.
         """
         column_name = data.columns[0]
-        gm = ClusterBasedNormalizer(model_missing_values=True, max_clusters=min(len(data), 10))
+        gm = ClusterBasedNormalizer(
+            missing_value_generation='from_column',
+            max_clusters=min(len(data), self._max_clusters),
+            weight_threshold=self._weight_threshold
+        )
         gm.fit(data, column_name)
         num_components = sum(gm.valid_component_indicator)
 

diff --git a/ctgan/synthesizers/ctgan.py b/ctgan/synthesizers/ctgan.py
@@ -7,6 +7,7 @@
 import torch
 from torch import optim
 from torch.nn import BatchNorm1d, Dropout, LeakyReLU, Linear, Module, ReLU, Sequential, functional
+from tqdm import tqdm
 
 from ctgan.data_sampler import DataSampler
 from ctgan.data_transformer import DataTransformer
@@ -175,6 +176,8 @@ def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_di
         self._data_sampler = None
         self._generator = None
 
+        self.loss_values = pd.DataFrame(columns=['Epoch', 'Generator Loss', 'Discriminator Loss'])
+
     @staticmethod
     def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
         """Deals with the instability of the gumbel_softmax for older versions of torch.
@@ -335,8 +338,15 @@ def fit(self, train_data, discrete_columns=(), epochs=None):
         mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device)
         std = mean + 1
 
+        self.loss_values = pd.DataFrame(columns=['Epoch', 'Generator Loss', 'Discriminator Loss'])
+
+        epoch_iterator = tqdm(range(epochs), disable=(not self._verbose))
+        if self._verbose:
+            description = 'Gen. ({gen:.2f}) | Discrim. ({dis:.2f})'
+            epoch_iterator.set_description(description.format(gen=0, dis=0))
+
         steps_per_epoch = max(len(train_data) // self._batch_size, 1)
-        for i in range(epochs):
+        for i in epoch_iterator:
             for id_ in range(steps_per_epoch):
 
                 for n in range(self._discriminator_steps):
@@ -412,10 +422,25 @@ def fit(self, train_data, discrete_columns=(), epochs=None):
                 loss_g.backward()
                 optimizerG.step()
 
+            generator_loss = loss_g.detach().cpu()
+            discriminator_loss = loss_d.detach().cpu()
+
+            epoch_loss_df = pd.DataFrame({
+                'Epoch': [i],
+                'Generator Loss': [generator_loss],
+                'Discriminator Loss': [discriminator_loss]
+            })
+            if not self.loss_values.empty:
+                self.loss_values = pd.concat(
+                    [self.loss_values, epoch_loss_df]
+                ).reset_index(drop=True)
+            else:
+                self.loss_values = epoch_loss_df
+
             if self._verbose:
-                print(f'Epoch {i+1}, Loss G: {loss_g.detach().cpu(): .4f},'  # noqa: T001
-                      f'Loss D: {loss_d.detach().cpu(): .4f}',
-                      flush=True)
+                epoch_iterator.set_description(
+                    description.format(gen=generator_loss, dis=discriminator_loss)
+                )
 
     @random_state
     def sample(self, n, condition_column=None, condition_value=None):

diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.7.4
+current_version = 0.7.5.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?

diff --git a/setup.py b/setup.py
@@ -21,7 +21,8 @@
     "torch>=1.8.0;python_version<'3.10'",
     "torch>=1.11.0;python_version>='3.10' and python_version<'3.11'",
     "torch>=2.0.0;python_version>='3.11'",
-    'rdt>=1.3.0,<2.0',
+    'tqdm>=4.15,<5',
+    'rdt>=1.6.1,<2.0',
 ]
 
 setup_requires = [
@@ -118,6 +119,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sdv-dev/CTGAN',
-    version='0.7.4',
+    version='0.7.5.dev1',
     zip_safe=False,
 )
diff --git a/tests/integration/synthesizer/test_ctgan.py b/tests/integration/synthesizer/test_ctgan.py
@@ -32,6 +32,8 @@ def test_ctgan_no_categoricals():
     assert sampled.shape == (100, 1)
     assert isinstance(sampled, pd.DataFrame)
     assert set(sampled.columns) == {'continuous'}
+    assert len(ctgan.loss_values) == 1
+    assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']
 
 
 def test_ctgan_dataframe():
@@ -51,6 +53,8 @@ def test_ctgan_dataframe():
     assert isinstance(sampled, pd.DataFrame)
     assert set(sampled.columns) == {'continuous', 'discrete'}
     assert set(sampled['discrete'].unique()) == {'a', 'b', 'c'}
+    assert len(ctgan.loss_values) == 1
+    assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']
 
 
 def test_ctgan_numpy():
@@ -69,6 +73,8 @@ def test_ctgan_numpy():
     assert sampled.shape == (100, 2)
     assert isinstance(sampled, np.ndarray)
     assert set(np.unique(sampled[:, 1])) == {'a', 'b', 'c'}
+    assert len(ctgan.loss_values) == 1
+    assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']
 
 
 def test_log_frequency():
@@ -83,13 +89,23 @@ def test_log_frequency():
     ctgan = CTGAN(epochs=100)
     ctgan.fit(data, discrete_columns)
 
+    assert len(ctgan.loss_values) == 100
+    assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']
+    pd.testing.assert_series_equal(ctgan.loss_values['Epoch'],
+                                   pd.Series(range(100), name='Epoch'))
+
     sampled = ctgan.sample(10000)
     counts = sampled['discrete'].value_counts()
     assert counts['a'] < 6500
 
     ctgan = CTGAN(log_frequency=False, epochs=100)
     ctgan.fit(data, discrete_columns)
 
+    assert len(ctgan.loss_values) == 100
+    assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']
+    pd.testing.assert_series_equal(ctgan.loss_values['Epoch'],
+                                   pd.Series(range(100), name='Epoch'))
+
     sampled = ctgan.sample(10000)
     counts = sampled['discrete'].value_counts()
     assert counts['a'] > 9000
@@ -231,56 +247,6 @@ def test_fixed_random_seed():
     np.testing.assert_array_equal(sampled_0_1, sampled_1_1)
 
 
-# Below are CTGAN tests that should be implemented in the future
-def test_continuous():
-    """Test training the CTGAN synthesizer on a continuous dataset."""
-    # assert the distribution of the samples is close to the distribution of the data
-    # using kstest:
-    #   - uniform (assert p-value > 0.05)
-    #   - gaussian (assert p-value > 0.05)
-    #   - inversely correlated (assert correlation < 0)
-    pass
-
-
-def test_categorical():
-    """Test training the CTGAN synthesizer on a categorical dataset."""
-    # assert the distribution of the samples is close to the distribution of the data
-    # using cstest:
-    #   - uniform (assert p-value > 0.05)
-    #   - very skewed / biased? (assert p-value > 0.05)
-    #   - inversely correlated (assert correlation < 0)
-    pass
-
-
-def test_categorical_log_frequency():
-    """Test training the CTGAN synthesizer on a small categorical dataset."""
-    # assert the distribution of the samples is close to the distribution of the data
-    # using cstest:
-    #   - uniform (assert p-value > 0.05)
-    #   - very skewed / biased? (assert p-value > 0.05)
-    #   - inversely correlated (assert correlation < 0)
-    pass
-
-
-def test_mixed():
-    """Test training the CTGAN synthesizer on a small mixed-type dataset."""
-    # assert the distribution of the samples is close to the distribution of the data
-    # using a kstest for continuous + a cstest for categorical.
-    pass
-
-
-def test_conditional():
-    """Test training the CTGAN synthesizer and sampling conditioned on a categorical."""
-    # verify that conditioning increases the likelihood of getting a sample with the specified
-    # categorical value
-    pass
-
-
-def test_batch_size_pack_size():
-    """Test that if batch size is not a multiple of pack size, it raises a sane error."""
-    pass
-
-
 def test_ctgan_save_and_load(tmpdir):
     """Test that the ``CTGAN`` model can be saved and loaded."""
     # Setup