# Example: Boston Housing Dataset

In [1]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from copulas.multivariate import GaussianMultivariate

Let's start by loading the Boston housing dataset and splitting it into training and test sets. We'll also train and evaluate a simple elastic net model on the real data to establish a baseline.

In [2]:
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run.
X, y = load_boston(return_X_y=True)

# Fix the seed so that the train/test split, and therefore the baseline
# score, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Baseline: an elastic net with default settings, trained on the real data.
model = ElasticNet()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.6771635740274344


We can now generate a synthetic copy of the training set.

In [3]:
def create_synthetic(X, y, n_samples=None):
    """
    Generate a synthetic version of a supervised learning dataset.

    The features and the target are joined into a single table, a Gaussian
    copula with KDE marginals is fitted to it, and new rows are sampled from
    the fitted model. The last column of the sampled table is returned as
    the target, the remaining columns as the features.

    Args:
        X: 2-D array-like of features, one row per example.
        y: Target values, either 1-D with one entry per row of X or a
            single column of shape (n_rows, 1).
        n_samples: Number of synthetic rows to draw. Defaults to the number
            of rows in the input dataset.

    Returns:
        A tuple (X_synthetic, y_synthetic): a 2-D feature array and a 1-D
        target array, each with n_samples rows.
    """
    # Force the target into a single column so that both 1-D and (n, 1)
    # targets can be appended to the feature matrix.
    target_column = np.asarray(y).reshape(-1, 1)
    dataset = np.concatenate([X, target_column], axis=1)

    if n_samples is None:
        n_samples = len(dataset)

    copula = GaussianMultivariate('copulas.univariate.GaussianKDE')
    copula.fit(dataset)
    synthetic = copula.sample(n_samples).values

    # The target was appended last, so it is the final sampled column.
    return synthetic[:, :-1], synthetic[:, -1]

# Only the training split is passed in, so the held-out test set plays no
# part in fitting the copula.
X_synthetic, y_synthetic = create_synthetic(X_train, y_train)

Now we can train a simple linear model using the synthetic dataset.

In [4]:
# Same estimator and default settings as the baseline; only the training
# data differs.
model = ElasticNet()
model.fit(X_synthetic, y_synthetic);

We can evaluate this model — trained purely on synthetic data — on the original held-out test set.

In [5]:
# `model` is now the estimator fitted on the synthetic data; X_test and
# y_test are the real held-out split.
print(model.score(X_test, y_test))

0.6354049084917914


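Note that this score (the coefficient of determination, $R^2$) is reasonably close to that achieved by our baseline model trained on the real data. Exact values will vary between runs because the synthetic sample is drawn at random.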