In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('../datasets/socialdiagnosis/data/SocialDiagnosis2011.csv', sep=';')

In [2]:
df.head()

Unnamed: 0,sex,age,marital,income,ls,smoke
0,FEMALE,57,MARRIED,800.0,PLEASED,NO
1,MALE,20,SINGLE,350.0,MOSTLY SATISFIED,NO
2,FEMALE,18,SINGLE,,PLEASED,NO
3,FEMALE,78,WIDOWED,900.0,MIXED,NO
4,FEMALE,54,MARRIED,1500.0,MOSTLY SATISFIED,YES


In [3]:
from synthpop import MissingDataHandler, DataProcessor, CARTMethod

In [4]:
# 1. Initiate metadata
md_handler = MissingDataHandler()

# 1.1 Get data types
metadata= md_handler.get_column_dtypes(df)
print("Column Data Types:", metadata)

Column Data Types: {'sex': 'categorical', 'age': 'numerical', 'marital': 'categorical', 'income': 'numerical', 'ls': 'categorical', 'smoke': 'categorical'}


In [5]:
# 2. Process missing data
print("Missing data:")
print(df.isnull().sum())

Missing data:
sex          0
age          0
marital      9
income     683
ls           8
smoke       10
dtype: int64


In [6]:
# 2.1 Detect type of missingness
missingness_dict = md_handler.detect_missingness(df)
print("Detected missingness type:", missingness_dict)

Detected missingness type: {'marital': 'MAR', 'income': 'MAR', 'ls': 'MAR', 'smoke': 'MAR'}


In [7]:
# 2.2 Impute missing values
real_df = md_handler.apply_imputation(df, missingness_dict)

print("Missing data:")
print(real_df.isnull().sum())

Missing data:
sex        0
age        0
marital    0
income     0
ls         0
smoke      0
dtype: int64


In [8]:
# 3. Preprocessing: Instantiate the DataProcessor with column_dtypes
processor = DataProcessor(metadata)

# 3.1 Preprocess the data: transforms raw data into a numerical format
processed_data = processor.preprocess(real_df)
print("Processed data:")
display(processed_data.head())

Processed data:


Unnamed: 0,sex,age,marital,income,ls,smoke
0,0,0.503625,3,-0.517232,4,0
1,1,-1.495187,4,-0.898113,3,0
2,0,-1.603231,4,0.0,4,0
3,0,1.638086,5,-0.432591,1,0
4,0,0.341559,3,0.075251,3,1


In [9]:
# 4. Fit the CART method
cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
cart.fit(processed_data)

In [10]:
# 4.1 Preview generated synthetic data
synthetic_processed = cart.sample(100)
print("Synthetic processed data:")
display(synthetic_processed.head())

Synthetic processed data:


Unnamed: 0,sex,age,marital,income,ls,smoke
0,1,-0.716885,4,-1.189097,2,0
1,1,-1.066729,3,0.057878,4,1
2,0,1.552391,3,-0.754037,2,0
3,0,0.522026,3,0.337329,4,1
4,0,0.262577,3,-1.179427,3,0


In [11]:
# 5. Postprocessing: back to the original format and preview of data
synthetic_df = processor.postprocess(synthetic_processed)
print("Synthetic data in original format:")
display(synthetic_df.head())

Synthetic data in original format:


Unnamed: 0,sex,age,marital,income,ls,smoke
0,MALE,34.407146,SINGLE,6.211859,MOSTLY DISSATISFIED,NO
1,MALE,27.93119,MARRIED,1479.474304,PLEASED,YES
2,FEMALE,76.413698,MARRIED,520.222172,MOSTLY DISSATISFIED,NO
3,FEMALE,57.340625,MARRIED,1809.63734,PLEASED,YES
4,FEMALE,52.537967,MARRIED,17.637157,MOSTLY SATISFIED,NO


In [12]:
from synthpop.metrics import (
    MetricsReport,
    EfficacyMetrics,
    DisclosureProtection
)

In [13]:
# 6. Evaluate the synthetic data

# 6.1 Diagnostic report
report = MetricsReport(real_df, synthetic_df, metadata)
report_df = report.generate_report()
print("=== Diagnostic Report ===")
display(report_df)

=== Diagnostic Report ===


Unnamed: 0,column,type,missing_value_similarity,range_coverage,boundary_adherence,ks_complement,tv_complement,statistic_similarity,category_coverage,category_adherence
0,sex,categorical,1.0,,,,0.9764,,1.0,1.0
1,age,numerical,1.0,0.94757,1.0,0.9142,,0.962239,,
2,marital,categorical,1.0,,,,0.967,,0.666667,1.0
3,income,numerical,1.0,0.408926,1.0,0.9056,,0.948719,,
4,ls,categorical,1.0,,,,0.9224,,0.857143,1.0
5,smoke,categorical,1.0,,,,0.9754,,1.0,1.0


In [14]:
# 6.2 Efficacy metrics

# regression
reg_efficacy = EfficacyMetrics(task='regression', target_column="income")
reg_metrics = reg_efficacy.evaluate(real_df, synthetic_df)
print("=== Regression Efficacy Metrics ===")
print(reg_metrics)

ValueError: could not convert string to float: 'MALE'

In [15]:
# classification
clf_efficacy = EfficacyMetrics(task='classification', target_column="smoke")
clf_metrics = clf_efficacy.evaluate(real_df, synthetic_df)
print("\n=== Classification Efficacy Metrics ===")
print(clf_metrics)


=== Classification Efficacy Metrics ===
{'accuracy': 0.6392, 'f1_score': 0.6481509447474609}


In [17]:
# privacy
dp = DisclosureProtection(real_df, synthetic_df)
dp_score = dp.score()
dp_report = dp.report()

print("\n=== Disclosure Protection ===")
print(f"Score: {dp_score:.3f}")
print("Detailed Report:", dp_report)


=== Disclosure Protection ===
Score: 1.000
Detailed Report: {'threshold': 0.0, 'risk_rate': 0.0, 'disclosure_protection_score': 1.0}
