Skip to content

Commit

Permalink
Rename covariance to correlation (#346)
Browse files: browse the repository at this point in the history
* rename covariance to correlation

* rename variables
  • Loading branch information
frances-h committed Apr 3, 2023
1 parent f75d2a7 commit dceaa43
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 129 deletions.
52 changes: 24 additions & 28 deletions copulas/multivariate/gaussian.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class GaussianMultivariate(Multivariate):
distribution names.
"""

covariance = None
correlation = None
columns = None
univariates = None

Expand Down Expand Up @@ -65,29 +65,29 @@ def _transform_to_normal(self, X):

return stats.norm.ppf(np.column_stack(U))

def _get_covariance(self, X):
"""Compute covariance matrix with transformed data.
def _get_correlation(self, X):
"""Compute correlation matrix with transformed data.
Args:
X (numpy.ndarray):
Data for which the covariance needs to be computed.
Data for which the correlation needs to be computed.
Returns:
numpy.ndarray:
computed covariance matrix.
computed correlation matrix.
"""
result = self._transform_to_normal(X)
covariance = pd.DataFrame(data=result).corr().to_numpy()
covariance = np.nan_to_num(covariance, nan=0.0)
correlation = pd.DataFrame(data=result).corr().to_numpy()
correlation = np.nan_to_num(correlation, nan=0.0)
# If singular, add some noise to the diagonal
if np.linalg.cond(covariance) > 1.0 / sys.float_info.epsilon:
covariance = covariance + np.identity(covariance.shape[0]) * EPSILON
if np.linalg.cond(correlation) > 1.0 / sys.float_info.epsilon:
correlation = correlation + np.identity(correlation.shape[0]) * EPSILON

return pd.DataFrame(covariance, index=self.columns, columns=self.columns)
return pd.DataFrame(correlation, index=self.columns, columns=self.columns)

@check_valid_values
def fit(self, X):
"""Compute the distribution for each variable and then its covariance matrix.
"""Compute the distribution for each variable and then its correlation matrix.
Arguments:
X (pandas.DataFrame):
Expand Down Expand Up @@ -126,8 +126,8 @@ def fit(self, X):
self.columns = columns
self.univariates = univariates

LOGGER.debug('Computing covariance')
self.covariance = self._get_covariance(X)
LOGGER.debug('Computing correlation')
self.correlation = self._get_correlation(X)
self.fitted = True

LOGGER.debug('GaussianMultivariate fitted successfully')
Expand All @@ -149,7 +149,7 @@ def probability_density(self, X):
"""
self.check_fit()
transformed = self._transform_to_normal(X)
return stats.multivariate_normal.pdf(transformed, cov=self.covariance)
return stats.multivariate_normal.pdf(transformed, cov=self.correlation)

def cumulative_distribution(self, X):
"""Compute the cumulative distribution value for each point in X.
Expand All @@ -168,7 +168,7 @@ def cumulative_distribution(self, X):
"""
self.check_fit()
transformed = self._transform_to_normal(X)
return stats.multivariate_normal.cdf(transformed, cov=self.covariance)
return stats.multivariate_normal.cdf(transformed, cov=self.correlation)

def _get_conditional_distribution(self, conditions):
"""Compute the parameters of a conditional multivariate normal distribution.
Expand All @@ -192,12 +192,12 @@ def _get_conditional_distribution(self, conditions):
names of the columns that will be sampled conditionally.
"""
columns2 = conditions.index
columns1 = self.covariance.columns.difference(columns2)
columns1 = self.correlation.columns.difference(columns2)

sigma11 = self.covariance.loc[columns1, columns1].to_numpy()
sigma12 = self.covariance.loc[columns1, columns2].to_numpy()
sigma21 = self.covariance.loc[columns2, columns1].to_numpy()
sigma22 = self.covariance.loc[columns2, columns2].to_numpy()
sigma11 = self.correlation.loc[columns1, columns1].to_numpy()
sigma12 = self.correlation.loc[columns1, columns2].to_numpy()
sigma21 = self.correlation.loc[columns2, columns1].to_numpy()
sigma22 = self.correlation.loc[columns2, columns2].to_numpy()

mu1 = np.zeros(len(columns1))
mu2 = np.zeros(len(columns2))
Expand All @@ -220,7 +220,7 @@ def _get_normal_samples(self, num_rows, conditions):
a standard normal multivariate conditioned on the given condition values.
"""
if conditions is None:
covariance = self.covariance
covariance = self.correlation
columns = self.columns
means = np.zeros(len(columns))
else:
Expand Down Expand Up @@ -277,11 +277,9 @@ def to_dict(self):
"""
self.check_fit()
univariates = [univariate.to_dict() for univariate in self.univariates]
warnings.warn('`covariance` will be renamed to `correlation` in v0.4.0',
DeprecationWarning)

return {
'covariance': self.covariance.to_numpy().tolist(),
'correlation': self.correlation.to_numpy().tolist(),
'univariates': univariates,
'columns': self.columns,
'type': get_qualified_name(self),
Expand All @@ -308,10 +306,8 @@ def from_dict(cls, copula_dict):
for parameters in copula_dict['univariates']:
instance.univariates.append(Univariate.from_dict(parameters))

covariance = copula_dict['covariance']
instance.covariance = pd.DataFrame(covariance, index=columns, columns=columns)
correlation = copula_dict['correlation']
instance.correlation = pd.DataFrame(correlation, index=columns, columns=columns)
instance.fitted = True
warnings.warn('`covariance` will be renamed to `correlation` in v0.4.0',
DeprecationWarning)

return instance
40 changes: 20 additions & 20 deletions tests/unit/multivariate/test_gaussian.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,26 +185,26 @@ def test__transform_to_normal_dataframe(self):
passed = dist_b.cdf.call_args[0][0]
np.testing.assert_allclose(expected, passed)

def test__get_covariance(self):
"""_get_covariance computes the covariance matrix of normalized values."""
def test__get_correlation(self):
"""_get_correlation computes the correlation matrix of normalized values."""
# Setup
copula = GaussianMultivariate(GaussianUnivariate)
copula.fit(self.data)

expected_covariance = np.array([
expected_correlation = np.array([
[1., -0.01261819, -0.19821644],
[-0.01261819, 1., -0.16896087],
[-0.19821644, -0.16896087, 1.]
])

# Run
covariance = copula._get_covariance(self.data)
correlation = copula._get_correlation(self.data)

# Check
assert np.isclose(covariance, expected_covariance).all().all()
assert np.isclose(correlation, expected_correlation).all().all()

def test_fit_default_distribution(self):
"""On fit, a distribution is created for each column along the covariance and means"""
"""On fit, a distribution is created for each column along the correlation and means"""

copula = GaussianMultivariate(GaussianUnivariate)
copula.fit(self.data)
Expand All @@ -215,8 +215,8 @@ def test_fit_default_distribution(self):
assert copula.univariates[i]._params['loc'] == self.data[key].mean()
assert copula.univariates[i]._params['scale'] == np.std(self.data[key])

expected_covariance = copula._get_covariance(self.data)
assert (copula.covariance == expected_covariance).all().all()
expected_correlation = copula._get_correlation(self.data)
assert (copula.correlation == expected_correlation).all().all()

def test_fit_distribution_arg(self):
"""On fit, the distributions for each column use instances of copula.distribution."""
Expand All @@ -234,8 +234,8 @@ def test_fit_distribution_arg(self):
assert copula.columns[i] == key
assert get_qualified_name(copula.univariates[i].__class__) == copula.distribution

expected_covariance = copula._get_covariance(self.data)
assert (copula.covariance == expected_covariance).all().all()
expected_correlation = copula._get_correlation(self.data)
assert (copula.correlation == expected_correlation).all().all()

def test_fit_distribution_selector(self):
"""
Expand Down Expand Up @@ -269,8 +269,8 @@ def test_fit_numpy_array(self):
assert univariate._params['loc'] == np.mean(self.data[column])
assert univariate._params['scale'] == np.std(self.data[column])

expected_covariance = copula._get_covariance(pd.DataFrame(self.data.to_numpy()))
assert (copula.covariance == expected_covariance).all().all()
expected_correlation = copula._get_correlation(pd.DataFrame(self.data.to_numpy()))
assert (copula.correlation == expected_correlation).all().all()

@patch('copulas.univariate.truncated_gaussian.TruncatedGaussian._fit')
@patch('copulas.multivariate.gaussian.warnings')
Expand Down Expand Up @@ -390,8 +390,8 @@ def test_sample(self, normal_mock):
assert result.equals(expected_result)

assert normal_mock.called_once_with(
np.zeros(instance.covariance.shape[0]),
instance.covariance,
np.zeros(instance.correlation.shape[0]),
instance.correlation,
5
)

Expand Down Expand Up @@ -439,8 +439,8 @@ def test_to_dict(self):
assert result['columns'] == ['column1', 'column2', 'column3']
assert len(result['univariates']) == 3

expected_cov = copula._get_covariance(self.data).to_numpy().tolist()
np.testing.assert_equal(result['covariance'], expected_cov)
expected_cov = copula._get_correlation(self.data).to_numpy().tolist()
np.testing.assert_equal(result['correlation'], expected_cov)

for univariate, result_univariate in zip(copula.univariates, result['univariates']):
assert univariate.to_dict() == result_univariate
Expand All @@ -466,7 +466,7 @@ def test_from_dict(self):
def test_sample_constant_column(self):
"""Gaussian copula can sample after being fit with a constant column.
This process will raise warnings when computing the covariance matrix
This process will raise warnings when computing the correlation matrix
"""
# Setup
instance = GaussianMultivariate()
Expand All @@ -490,12 +490,12 @@ def test_sample_constant_column(self):
# This is to check that the samples on the non constant column are not constant too.
assert len(result.loc[:, 1].unique()) > 1

covariance = instance.covariance
assert (~pd.isna(covariance)).all().all()
correlation = instance.correlation
assert (~pd.isna(correlation)).all().all()

def test__get_conditional_distribution(self):
gm = GaussianMultivariate()
gm.covariance = pd.DataFrame({
gm.correlation = pd.DataFrame({
'a': [1, 0.2, 0.3],
'b': [0.2, 1, 0.4],
'c': [0.3, 0.4, 1],
Expand Down
149 changes: 68 additions & 81 deletions tutorials/03_Multivariate_Distributions.ipynb

Large diffs are not rendered by default.

0 comments on commit dceaa43

Please sign in to comment.