In [1]:
import pandas as pd
import numpy as np
from datetime import  timedelta
from synthpop import DataProcessor

In [2]:
# Build a small demo frame with one column per type declared in `metadata` below.
# All random draws come from a single seeded generator so the sample (and every
# downstream output) is reproducible on Restart Kernel -> Run All.
SEED = 42
N_ROWS = 50
rng = np.random.default_rng(SEED)

dates = pd.date_range("2023-01-01", periods=N_ROWS, freq="D")
bool_values = rng.choice([True, False], size=N_ROWS)
# Generator.integers excludes the upper bound, so durations span 1..99 days.
timedeltas = [timedelta(days=int(i)) for i in rng.integers(1, 100, N_ROWS)]

df_custom = pd.DataFrame({
    "numeric_col1": rng.normal(50, 10, N_ROWS),
    "numeric_col2": rng.integers(0, 100, N_ROWS),
    "categorical_col": rng.choice(["Red", "Green", "Blue"], size=N_ROWS),
    "boolean_col": bool_values,
    "datetime_col": dates,
    "timedelta_col": timedeltas,
    "float_col": rng.uniform(0.0, 1.0, N_ROWS)
})

# Work on a copy so the generated frame stays available unchanged as `df_custom`.
df = df_custom.copy()
print("Original Data:")
display(df.head())

# Column name -> type label handed to DataProcessor (update these types as needed).
# The keys must match the DataFrame's column names.
metadata = {
    "numeric_col1": "numerical",
    "numeric_col2": "numerical",
    "categorical_col": "categorical",
    "boolean_col": "boolean",
    "datetime_col": "datetime",
    "timedelta_col": "timedelta",
    "float_col": "numerical"
}

Original Data:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,42.623609,95,Blue,False,2023-01-01,91 days,0.792334
1,63.017119,55,Blue,False,2023-01-02,9 days,0.372891
2,63.574092,97,Red,True,2023-01-03,81 days,0.324229
3,40.652037,15,Blue,False,2023-01-04,66 days,0.098615
4,58.764146,29,Red,True,2023-01-05,22 days,0.572833


In [3]:
# Instantiate the DataProcessor with the column -> type metadata mapping.
# This same instance is used for both preprocess() and postprocess() in the cells below.
processor = DataProcessor(metadata)

In [4]:
# Preprocess the data: transforms the raw frame into a numerical format.
# In the saved output, categorical and boolean values show up as integer codes, and
# datetime/timedelta values as floats that look like seconds
# (e.g. 91 days -> 7862400.0) -- NOTE(review): confirm units against the synthpop docs.
processed_data = processor.preprocess(df)
print("Processed Data:")
display(processed_data.head())

Processed Data:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,42.623609,95.0,0,0,1672531000.0,7862400.0,0.792334
1,63.017119,55.0,0,0,1672618000.0,777600.0,0.372891
2,63.574092,97.0,2,1,1672704000.0,6998400.0,0.324229
3,40.652037,15.0,0,0,1672790000.0,5702400.0,0.098615
4,58.764146,29.0,2,1,1672877000.0,1900800.0,0.572833


In [5]:
# Simulate synthetic data generation by copying the processed data, so this demo
# exercises only the preprocess -> postprocess round trip.
# (Replace this step with your synthetic data generation method if available.)
synthetic_data = processed_data.copy()

In [6]:
# Postprocess the synthetic data to convert it back to its original data types.
# Postprocessing also returns the columns in their original order.
# NOTE(review): in the saved output, numeric_col2 comes back as float (95.0) while the
# original column held integers (95) -- confirm whether that is intended.
recovered_data = processor.postprocess(synthetic_data)
print("Recovered Data:")
display(recovered_data.head())

Recovered Data:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,42.623609,95.0,Blue,False,2023-01-01,91 days,0.792334
1,63.017119,55.0,Blue,False,2023-01-02,9 days,0.372891
2,63.574092,97.0,Red,True,2023-01-03,81 days,0.324229
3,40.652037,15.0,Blue,False,2023-01-04,66 days,0.098615
4,58.764146,29.0,Red,True,2023-01-05,22 days,0.572833
