In [1]:
import pandas as pd
import numpy as np
from datetime import  timedelta
from synthpop import DataProcessor, CARTMethod, MissingDataHandler

In [2]:
# Build a small demo frame with one column per dtype family (numeric,
# categorical, boolean, datetime, timedelta) to exercise the synthpop pipeline.
N_OBS = 100  # number of observations in the demo frame
SEED = 42

# A seeded Generator makes every "Restart Kernel -> Run All" produce the same
# frame; the unseeded global np.random state gave different data on each run.
rng = np.random.default_rng(SEED)

dates = pd.date_range("2023-01-01", periods=N_OBS, freq="D")
bool_values = rng.choice([True, False], size=N_OBS)
# One duration of 1-99 whole days per row (the upper bound of integers() is exclusive).
timedeltas = [timedelta(days=int(i)) for i in rng.integers(1, 100, N_OBS)]

df_custom = pd.DataFrame({
    "numeric_col1": rng.normal(50, 10, N_OBS),
    "numeric_col2": rng.integers(0, 100, N_OBS),
    "categorical_col": rng.choice(["Red", "Green", "Blue"], size=N_OBS),
    "boolean_col": bool_values,
    "datetime_col": dates,
    "timedelta_col": timedeltas,
    "float_col": rng.uniform(0.0, 1.0, N_OBS)
})

# Work on a copy so df_custom stays available as the untouched original.
df = df_custom.copy()
print("Original Data:")
display(df.head())

Original Data:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,59.902619,88,Red,True,2023-01-01,58 days,0.564856
1,64.307371,8,Red,True,2023-01-02,76 days,0.49939
2,33.56628,80,Green,False,2023-01-03,44 days,0.623179
3,34.304724,11,Red,True,2023-01-04,29 days,0.599925
4,48.796185,29,Green,True,2023-01-05,68 days,0.68624


In [3]:
# Derive a {column name: type label} mapping from df. The printed dict is the
# metadata handed to DataProcessor and CARTMethod in the following cells.
metadata = MissingDataHandler.get_column_dtypes(df)
print(metadata)

{'numeric_col1': 'numerical', 'numeric_col2': 'numerical', 'categorical_col': 'categorical', 'boolean_col': 'boolean', 'datetime_col': 'datetime', 'timedelta_col': 'timedelta', 'float_col': 'numerical'}


In [4]:
# Build a DataProcessor from the column-type metadata and encode df into an
# all-numeric frame. In the preview below, categories appear as integer codes,
# booleans as 0/1, and datetimes/timedeltas as seconds.
# `processor` is kept because the same instance is used later to decode the
# synthetic rows back to the original format.
processor = DataProcessor(metadata)
processed_data = processor.preprocess(df)
print("Processed Data:")
display(processed_data.head())

Processed Data:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,59.902619,88.0,2,1,1672531000.0,5011200.0,0.564856
1,64.307371,8.0,2,1,1672618000.0,6566400.0,0.49939
2,33.56628,80.0,1,0,1672704000.0,3801600.0,0.623179
3,34.304724,11.0,2,1,1672790000.0,2505600.0,0.599925
4,48.796185,29.0,1,1,1672877000.0,5875200.0,0.68624


In [5]:
# Configure the CART synthesizer with the same column-type metadata and fit it
# on the encoded (preprocessed) frame.
# NOTE(review): random_state=42 presumably pins the synthesizer's own randomness;
# the exact semantics of smoothing / proper / minibucket are not visible in this
# notebook — confirm against the synthpop documentation before tuning them.
cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
cart.fit(processed_data)

In [6]:
# Draw synthetic rows from the fitted CART model. sample() is given only a row
# count (no input frame), so this is generation rather than prediction on data.
# The count follows the original frame's size instead of a repeated literal.
synthetic_processed = cart.sample(len(df))
print("Synthetic Processed Data:")
# Values are still in the encoded numeric form (codes, 0/1, seconds).
display(synthetic_processed.head())

Synthetic Processed Data:


  y_synth_mode = mode(y_synth)
  y_synth_mode = mode(y_synth)
  y_synth_mode = mode(y_synth)
  y_synth_mode = mode(y_synth)
  y_synth_mode = mode(y_synth)


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,46.623328,102.520877,1,0,1676185000.0,7589583.0,0.462352
1,45.332468,0.0,0,0,1681233000.0,1180002.0,0.729633
2,42.043616,22.092879,0,0,1672887000.0,6234389.0,0.43168
3,46.124408,50.839336,1,0,1678331000.0,7214925.0,0.098846
4,38.49592,75.252838,0,1,1677357000.0,345600.0,0.854832


In [7]:
# Decode the synthetic rows with the same DataProcessor instance that encoded
# the original data, restoring category labels, booleans, datetimes and timedeltas.
# NOTE(review): in the run shown below, numeric_col2 (generated as integers)
# comes back as non-integer floats, and one value lies above the original
# 0-99 range — confirm whether rounding/clipping is wanted for integer columns.
synthetic_data = processor.postprocess(synthetic_processed)
print("Synthetic Data in Original Format:")
display(synthetic_data.head())

Synthetic Data in Original Format:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,46.623328,102.520877,Green,False,2023-02-12 07:02:57.695124865,87 days 20:13:03.003420194,0.462352
1,45.332468,0.0,Blue,False,2023-04-11 17:07:25.414428949,13 days 15:46:41.519494942,0.729633
2,42.043616,22.092879,Blue,False,2023-01-05 02:46:57.093904018,72 days 03:46:28.521553257,0.43168
3,46.124408,50.839336,Green,False,2023-03-09 03:09:57.312693119,83 days 12:08:45.127039962,0.098846
4,38.49592,75.252838,Blue,True,2023-02-25 20:35:55.717335224,4 days 00:00:00,0.854832
