In [1]:
import pandas as pd
import numpy as np

### Read data

In [2]:
# Dataset: https://archive.ics.uci.edu/dataset/47/horse+colic

In [3]:
data_path = r"data\horse-colic.data"
real_data = pd.read_csv(data_path, header=None, sep=r'\s+')

In [4]:
real_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2,1,530101,38.50,66,28,3,3,?,2,...,45.00,8.40,?,?,2,2,11300,0,0,2
1,1,1,534817,39.2,88,20,?,?,4,1,...,50,85,2,2,3,2,2208,0,0,2
2,2,1,530334,38.30,40,24,1,1,3,1,...,33.00,6.70,?,?,1,2,0,0,0,1
3,1,9,5290409,39.10,164,84,4,1,6,2,...,48.00,7.20,3,5.30,2,1,2208,0,0,1
4,2,1,530255,37.30,104,35,?,?,6,2,...,74.00,7.40,?,?,2,2,4300,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1,1,533886,?,120,70,4,?,4,2,...,55,65,?,?,3,2,3205,0,0,2
296,2,1,527702,37.20,72,24,3,2,4,2,...,44.00,?,3,3.30,3,1,2208,0,0,1
297,1,1,529386,37.50,72,30,4,3,4,1,...,60.00,6.80,?,?,2,1,3205,0,0,2
298,1,1,530612,36.50,100,24,3,3,3,1,...,50.00,6.00,3,3.40,1,1,2208,0,0,1


In [5]:
# Little data celan
# ? are replaced by nans
real_data = real_data.replace('?', np.nan)

In [6]:
# Rename column names avoid to future errors
list_column_names = ["col" + str(i) for i in range(1, 29)]
real_data.columns = list_column_names

In [7]:
real_data.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,...,col19,col20,col21,col22,col23,col24,col25,col26,col27,col28
0,2,1,530101,38.5,66,28,3.0,3.0,,2,...,45.0,8.4,,,2,2,11300,0,0,2
1,1,1,534817,39.2,88,20,,,4.0,1,...,50.0,85.0,2.0,2.0,3,2,2208,0,0,2
2,2,1,530334,38.3,40,24,1.0,1.0,3.0,1,...,33.0,6.7,,,1,2,0,0,0,1
3,1,9,5290409,39.1,164,84,4.0,1.0,6.0,2,...,48.0,7.2,3.0,5.3,2,1,2208,0,0,1
4,2,1,530255,37.3,104,35,,,6.0,2,...,74.0,7.4,,,2,2,4300,0,0,2


In [8]:
real_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col1    299 non-null    object
 1   col2    300 non-null    int64 
 2   col3    300 non-null    int64 
 3   col4    240 non-null    object
 4   col5    276 non-null    object
 5   col6    242 non-null    object
 6   col7    244 non-null    object
 7   col8    231 non-null    object
 8   col9    253 non-null    object
 9   col10   268 non-null    object
 10  col11   245 non-null    object
 11  col12   256 non-null    object
 12  col13   244 non-null    object
 13  col14   196 non-null    object
 14  col15   194 non-null    object
 15  col16   53 non-null     object
 16  col17   198 non-null    object
 17  col18   182 non-null    object
 18  col19   271 non-null    object
 19  col20   267 non-null    object
 20  col21   135 non-null    object
 21  col22   102 non-null    object
 22  col23   299 non-null    ob

### Generate data

In [9]:
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import Metadata

In [10]:
# Detect metadata
metadata = Metadata.detect_from_dataframe(real_data)

# Optional delete previous metadata to avoid UserWarning
import os
filepath = r'data\demo_metadata.json'
if os.path.exists(filepath):
    os.remove(filepath)

# Save metadata to JSON
metadata.save_to_json(r'data\demo_metadata.json')

## Example load metadata from file
# from sdv.metadata import Metadata
# metadata = Metadata.load_from_json(r'data\demo_metadata.json')

# Create and fit synthesizer
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_data)

# Generate synthetic data
synthetic_data = synthesizer.sample(num_rows=1000)

In [11]:
synthetic_data

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,...,col19,col20,col21,col22,col23,col24,col25,col26,col27,col28
0,1,1,518476,38.20,48,12,1,3,5,1,...,57.00,7.00,,,1,1,7426,0,0,2
1,1,1,518476,38.0,60,24,,1,2,1,...,,,,,1,1,2085,0,0,1
2,2,1,518476,,60,16,1,1,5,1,...,43.00,7.0,1,4.30,3,2,1360,0,0,1
3,1,1,518476,37.00,40,12,2,1,2,1,...,55.00,6.80,,,3,2,428,0,0,2
4,1,1,5305629,38.50,130,24,3,,4,2,...,47,8.90,2,8.00,1,2,16044,7111,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,1,518476,,30,34,3,,1,2,...,37.00,69,2,,2,1,5254,0,0,2
996,2,1,518476,38.00,60,32,1,4,,1,...,47.00,4.70,3,,1,2,4262,0,0,2
997,1,1,518476,37.90,40,32,1,,1,1,...,35.00,69,,,1,1,6406,0,0,2
998,1,1,518476,,64,40,3,3,,1,...,,5.70,,5.00,3,1,9463,0,0,2


In [12]:
output_path = "data/synthetic_data.csv"
synthetic_data.to_csv(output_path, index=False)
print(f"Synthetic data save to: {output_path}")

Synthetic data save to: data/synthetic_data.csv


### Evaulation data

In [13]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data,
    synthetic_data,
    metadata)

Generating report ...

(1/2) Evaluating Column Shapes: |████████████████████████████████████████████████████| 28/28 [00:00<00:00, 839.17it/s]|
Column Shapes Score: 91.41%

(2/2) Evaluating Column Pair Trends: |██████████████████████████████████████████████| 378/378 [00:04<00:00, 80.12it/s]|
Column Pair Trends Score: 78.27%

Overall Score (Average): 84.84%



In [14]:
print(quality_report.get_score())

0.8483546931002959


#### Score Interpretation  
Value Range | Interpretation  
---|---  
0.90 – 1.00 | 🚀 Excellent: The synthetic data almost perfectly matches the statistical patterns of the real data.  
0.70 – 0.89 | ✅ Good: The synthetic data resembles the original, but there may be some distortions.  
0.50 – 0.69 | ⚠️ Fair: Partial match, but several important patterns are missing or distorted.  
0.30 – 0.49 | ❌ Poor: The synthetic data does not represent the real data well.  
0.00 – 0.29 | 🔴 Very poor: Almost no similarity. Likely caused by a bad model, insufficient data, or faulty training.  

### Report

In [15]:
from sdmetrics.reports.single_table import QualityReport

metadata = metadata._convert_to_single_table()
# Convert metadata to dict
metadata_dict = metadata.to_dict()

report = QualityReport()
report.generate(real_data, synthetic_data, metadata_dict)

Generating report ...

(1/2) Evaluating Column Shapes: |████████████████████████████████████████████████████| 28/28 [00:00<00:00, 895.72it/s]|
Column Shapes Score: 91.41%

(2/2) Evaluating Column Pair Trends: |██████████████████████████████████████████████| 378/378 [00:04<00:00, 88.12it/s]|
Column Pair Trends Score: 78.27%

Overall Score (Average): 84.84%



In [16]:
report.get_visualization(property_name='Column Shapes')

In [17]:
report.get_visualization(property_name='Column Pair Trends')

### Diagnostics and visualization

In [18]:
from sdv.evaluation.single_table import run_diagnostic, get_column_plot

diagnostic = run_diagnostic(real_data, synthetic_data, metadata)
print(diagnostic.get_score())
diagnostic.get_details(property_name='Data Validity')

Generating report ...

(1/2) Evaluating Data Validity: |████████████████████████████████████████████████████| 28/28 [00:00<00:00, 753.04it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 108.72it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

1.0


Unnamed: 0,Column,Metric,Score
0,col1,CategoryAdherence,1.0
1,col2,CategoryAdherence,1.0
2,col3,BoundaryAdherence,1.0
3,col4,CategoryAdherence,1.0
4,col5,CategoryAdherence,1.0
5,col6,CategoryAdherence,1.0
6,col7,CategoryAdherence,1.0
7,col8,CategoryAdherence,1.0
8,col9,CategoryAdherence,1.0
9,col10,CategoryAdherence,1.0


In [19]:
print(real_data.columns)

Index(['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9',
       'col10', 'col11', 'col12', 'col13', 'col14', 'col15', 'col16', 'col17',
       'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25',
       'col26', 'col27', 'col28'],
      dtype='object')


In [20]:
import time

for col in real_data.columns:
    print(f"📊 Plot for: {col}")
    try:
        fig = get_column_plot(
            real_data=real_data,
            synthetic_data=synthetic_data,
            metadata=metadata,
            column_name=col
        )
        fig.show()
        time.sleep(0.9)
    except Exception as e:
        print(f"⚠️ Error in column ({col}): {e}")

📊 Plot for: col1


📊 Plot for: col2


📊 Plot for: col3


📊 Plot for: col4


📊 Plot for: col5


📊 Plot for: col6


📊 Plot for: col7


📊 Plot for: col8


📊 Plot for: col9


📊 Plot for: col10


📊 Plot for: col11


📊 Plot for: col12


📊 Plot for: col13


📊 Plot for: col14


📊 Plot for: col15


📊 Plot for: col16


📊 Plot for: col17


📊 Plot for: col18


📊 Plot for: col19


📊 Plot for: col20


📊 Plot for: col21


📊 Plot for: col22


📊 Plot for: col23


📊 Plot for: col24


📊 Plot for: col25


📊 Plot for: col26


📊 Plot for: col27


📊 Plot for: col28
