# Machine Learning Efficacy
In this tutorial, we will write a custom metrics generator to evaluate the efficacy of a synthetic dataset on a machine learning task.

## Loading the Dataset
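The Boston housing prices dataset is available through sklearn's `load_boston` helper. Note that `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so this tutorial requires an older scikit-learn release.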

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Pull the feature matrix and the target vector out of the loader's result.
# NOTE(review): load_boston is not available in recent scikit-learn releases —
# confirm the version pinned for this tutorial.
boston = load_boston()
X, y = boston.data, boston.target

Next, we rearrange this dataset into a DataFrame whose last column is the regression target.

In [2]:
import numpy as np
import pandas as pd

# Append the target as the trailing column and label the columns x0, x1, ...
# for the features followed by "y" for the target.
feature_names = ["x{}".format(i) for i in range(X.shape[1])]
dataset = pd.DataFrame(
    np.column_stack([X, y]),
    columns=feature_names + ["y"],
)
dataset.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,y
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## Generating Synthetic Data
We'll use the `copulas` library to fit a Gaussian multivariate copula to the real data and then sample a synthetic dataset with the same number of rows.

In [3]:
from copulas.multivariate import GaussianMultivariate

# Seed NumPy's global RNG so the synthetic sample (and therefore the scores
# reported further down) can be reproduced on a fresh "Restart & Run All".
# NOTE(review): assumes copulas draws from NumPy's global random state when no
# explicit seed is configured on the model — confirm for the installed version.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Fit the copula on the real table, then draw as many synthetic rows as the
# real table contains.
model = GaussianMultivariate()
model.fit(dataset)
synthetic_dataset = model.sample(len(dataset))

## Creating the Metadata
This dataset only has a single table; however, we still need to create the Metadata object to let SDMetrics know about this table.

In [4]:
from sdv import Metadata

# Single name shared by the metadata entry and both table mappings.
table_name = "boston_housing_prices"

# Register the real table with the metadata object.
metadata = Metadata()
metadata.add_table(table_name, dataset)

# Real and synthetic tables are keyed by the same table name.
real_tables = {table_name: dataset}
synthetic_tables = {table_name: synthetic_dataset}

## Evaluating Efficacy
Let's write a custom efficacy metric which attempts to predict `y` from all other columns.

In [5]:
from sklearn.linear_model import LinearRegression

from sdmetrics.multivariate.efficacy import MLEfficacy


class MyCustomEfficacyMetric(MLEfficacy):
    """Efficacy metric that predicts ``y`` from the other columns using linear regression."""

    name = "housing_prices_prediction"

    # Table and column that the model is asked to predict.
    target_table_name = "boston_housing_prices"
    target_column_name = "y"

    # What `score` reports: its unit and the interval its values fall in.
    metric_unit = "r_squared"
    metric_domain = (-np.inf, 1.0)

    def fit(self, X, y):
        """Train a new linear regressor and keep it on ``self.model``.

        Arguments:
            X (np.ndarray): The numerical features (i.e. transformed rows).
            y (np.ndarray): The regression target.
        """
        # Bind the attribute before fitting so `self.model` is set even if
        # fitting raises.
        regressor = self.model = LinearRegression()
        regressor.fit(X, y)

    def score(self, X, y):
        """Evaluate the regressor stored by `fit` on the given data.

        Arguments:
            X (np.ndarray): The numerical features (i.e. transformed rows).
            y (np.ndarray): The regression target.

        Returns:
            float: The value returned by the fitted model's ``score`` method.
        """
        model_score = self.model.score(X, y)
        return model_score

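Let's go ahead and run this. The generator yields two metrics: the score on the real test set of a model trained on synthetic data, and the difference in that score when training on synthetic versus real data.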

In [6]:
generator = MyCustomEfficacyMetric()
efficacy_metrics = generator.metrics(metadata, real_tables, synthetic_tables)

# Print each metric followed by a blank separator line.
for metric in efficacy_metrics:
    print(metric, end="\n\n")

Metric(
  name=housing_prices_prediction, 
  value=0.58, 
  tags={'table:boston_housing_prices', 'efficacy:ml', 'column:y'}, 
  description=Score on the real test set using the machine learning model trained on synthetic data.
)

Metric(
  name=housing_prices_prediction, 
  value=-0.14, 
  tags={'table:boston_housing_prices', 'efficacy:ml', 'column:y'}, 
  description=Diff in score on real when trained on synthetic vs real.
)

