In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Read the SocialDiagnosis2011 CSV into the raw frame; the file uses ';' as
# its field separator. The path is relative to the notebook's working directory.
df = pd.read_csv(
    '../datasets/socialdiagnosis/data/SocialDiagnosis2011.csv',
    sep=';',
)

In [2]:
# Show the first five rows of the raw frame as a quick sanity check.
df.head(n=5)

Unnamed: 0,sex,age,marital,income,ls,smoke
0,FEMALE,57,MARRIED,800.0,PLEASED,NO
1,MALE,20,SINGLE,350.0,MOSTLY SATISFIED,NO
2,FEMALE,18,SINGLE,,PLEASED,NO
3,FEMALE,78,WIDOWED,900.0,MIXED,NO
4,FEMALE,54,MARRIED,1500.0,MOSTLY SATISFIED,YES


In [3]:
from synthpop import MissingDataHandler, DataProcessor, CARTMethod

In [4]:
# Step 1: create the handler that provides dtype detection in this cell and
# the missingness detection / imputation calls in the cells that follow.
md_handler = MissingDataHandler()

# Step 1.1: ask the handler for a per-column type description of the raw
# frame. The result is passed on as the metadata argument to DataProcessor,
# CARTMethod and MetricsReport later in the notebook.
metadata = md_handler.get_column_dtypes(df)
print(f"Column Data Types: {metadata!s}")

Column Data Types: {'sex': 'categorical', 'age': 'numerical', 'marital': 'categorical', 'income': 'numerical', 'ls': 'categorical', 'smoke': 'categorical'}


In [5]:
# Step 2: report how many values are missing in each column of the raw frame.
print("Missing data:", df.isna().sum(), sep="\n")

Missing data:
sex          0
age          0
marital      9
income     683
ls           8
smoke       10
dtype: int64


In [6]:
# Step 2.1: let the handler label the missingness of the raw frame; the
# result is fed to apply_imputation in the next step.
missingness_dict = md_handler.detect_missingness(df)
print(f"Detected missingness type: {missingness_dict!s}")

Detected missingness type: {'marital': 'MAR', 'income': 'MAR', 'ls': 'MAR', 'smoke': 'MAR'}


In [7]:
# Step 2.2: impute the raw frame using the detected missingness labels. The
# result is bound to a new name (real_df), so df itself is not rebound.
real_df = md_handler.apply_imputation(df, missingness_dict)

# Re-count the gaps on the imputed frame to verify the imputation result.
print("Missing data:", real_df.isna().sum(), sep="\n")

Missing data:
sex        0
age        0
marital    0
income     0
ls         0
smoke      0
dtype: int64


In [8]:
# Step 3: build the DataProcessor from the column-type metadata. The same
# instance is used again later to map synthetic rows back via postprocess().
processor = DataProcessor(metadata)

# Step 3.1: convert the imputed frame into the numerical representation that
# is handed to the CART synthesizer.
processed_data = processor.preprocess(real_df)

print("Processed data:")
display(processed_data.head(n=5))

Processed data:


Unnamed: 0,sex,age,marital,income,ls,smoke
0,0,57.0,3,800.0,4,0
1,1,20.0,4,350.0,3,0
2,0,18.0,4,1411.093352,4,0
3,0,78.0,5,900.0,1,0
4,0,54.0,3,1500.0,3,1


In [9]:
# Step 4: configure the CART synthesizer and fit it on the preprocessed frame.
# random_state is pinned to 42 -- presumably so that fitting/sampling is
# reproducible across runs; confirm against the synthpop documentation.
cart = CARTMethod(
    metadata,
    smoothing=True,
    proper=True,
    minibucket=5,
    random_state=42,
)
cart.fit(processed_data)

In [10]:
# Step 4.1: draw 100 synthetic rows from the fitted synthesizer and preview
# them. They are still in the processed (numerical) representation; these
# same rows are what the later evaluation cells work with after postprocessing.
synthetic_processed = cart.sample(100)

print("Synthetic processed data:")
display(synthetic_processed.head(n=5))

Synthetic processed data:


Unnamed: 0,sex,age,marital,income,ls,smoke
0,1,27.000861,3,5940.743483,4,0
1,1,33.685012,3,1747.087586,4,1
2,0,76.212963,5,955.997763,2,0
3,1,36.493914,3,1783.896317,4,1
4,0,54.019538,3,791.422965,4,0


In [11]:
# Step 5: run the sampled rows back through the processor so they are
# expressed in the original column format, then preview the result.
synthetic_df = processor.postprocess(synthetic_processed)

print("Synthetic data in original format:")
display(synthetic_df.head(n=5))

Synthetic data in original format:


Unnamed: 0,sex,age,marital,income,ls,smoke
0,MALE,27.000861,MARRIED,5940.743483,PLEASED,NO
1,MALE,33.685012,MARRIED,1747.087586,PLEASED,YES
2,FEMALE,76.212963,WIDOWED,955.997763,MOSTLY DISSATISFIED,NO
3,MALE,36.493914,MARRIED,1783.896317,PLEASED,YES
4,FEMALE,54.019538,MARRIED,791.422965,PLEASED,NO


In [12]:
from synthpop.metrics import (
    MetricsReport,
    EfficacyMetrics,
    DisclosureProtection
)

In [13]:
# Step 6: evaluate the synthetic data against the imputed real data.

# Step 6.1: diagnostic report built from the real frame, the synthetic frame
# and the column-type metadata; the generated table is shown in full.
report = MetricsReport(real_df, synthetic_df, metadata)
report_df = report.generate_report()

print("=== Diagnostic Report ===")
display(report_df)

=== Diagnostic Report ===


Unnamed: 0,column,type,missing_value_similarity,range_coverage,boundary_adherence,ks_complement,tv_complement,statistic_similarity,category_coverage,category_adherence
0,sex,categorical,1.0,,,,0.9964,,1.0,1.0
1,age,numerical,1.0,0.885944,1.0,0.9342,,0.966491,,
2,marital,categorical,1.0,,,,0.975,,0.833333,1.0
3,income,numerical,1.0,0.498955,1.0,0.8468,,0.912639,,
4,ls,categorical,1.0,,,,0.9176,,0.857143,1.0
5,smoke,categorical,1.0,,,,0.9746,,1.0,1.0


In [14]:
# Step 6.2: efficacy metrics.

# Step 6.2.1: regression efficacy with "income" as the target column,
# evaluated on the real and synthetic frames.
reg_efficacy = EfficacyMetrics(
    task='regression',
    target_column="income",
)
reg_metrics = reg_efficacy.evaluate(real_df, synthetic_df)

print("=== Regression Efficacy Metrics ===")
print(reg_metrics)

=== Regression Efficacy Metrics ===
{'mse': 1455890.3266689673, 'mae': 810.365268015888, 'r2': -0.04299904229518581}


In [15]:
# Step 6.2.2: classification efficacy with "smoke" as the target column,
# evaluated on the real and synthetic frames.
clf_efficacy = EfficacyMetrics(
    task='classification',
    target_column="smoke",
)
clf_metrics = clf_efficacy.evaluate(real_df, synthetic_df)

print("\n=== Classification Efficacy Metrics ===")
print(clf_metrics)


=== Classification Efficacy Metrics ===
{'accuracy': 0.64, 'f1_score': 0.6461333333333333}


In [16]:
# Step 6.3: privacy check. Build the disclosure-protection evaluator from the
# real and synthetic frames, then collect both its single score and its
# detailed report.
dp = DisclosureProtection(real_df, synthetic_df)
dp_score = dp.score()
dp_report = dp.report()

print("\n=== Disclosure Protection ===")
print(f"Score: {dp_score:.3f}")
print(f"Detailed Report: {dp_report!s}")


=== Disclosure Protection ===
Score: 1.000
Detailed Report: {'threshold': 0.0, 'risk_rate': 0.0, 'disclosure_protection_score': 1.0}
