In [1]:
import pandas as pd
import numpy as np
from datetime import  timedelta
from synthpop import DataProcessor, GaussianCopulaMethod, MissingDataHandler

In [2]:
# Build a small demo DataFrame with one column of each kind exercised below:
# numeric (float and int), categorical, boolean, datetime and timedelta.
#
# Reproducibility: every random draw in this cell goes through NumPy's legacy
# global generator, so seeding it once makes Restart Kernel -> Run All produce
# the same frame each time. Outputs saved before the seed was introduced came
# from an unseeded run and will not match a fresh execution.
SEED = 42
N_OBS = 100  # number of rows in the demo frame

np.random.seed(SEED)

# One calendar day per row, starting 2023-01-01.
dates = pd.date_range("2023-01-01", periods=N_OBS, freq="D")
bool_values = np.random.choice([True, False], size=N_OBS)
# Whole-day durations in [1, 99]; int() converts the NumPy integer to a plain
# Python int before handing it to datetime.timedelta.
timedeltas = [timedelta(days=int(i)) for i in np.random.randint(1, 100, N_OBS)]

df_custom = pd.DataFrame({
    "numeric_col1": np.random.normal(50, 10, N_OBS),
    "numeric_col2": np.random.randint(0, 100, N_OBS),
    "categorical_col": np.random.choice(["Red", "Green", "Blue"], size=N_OBS),
    "boolean_col": bool_values,
    "datetime_col": dates,
    "timedelta_col": timedeltas,
    "float_col": np.random.uniform(0.0, 1.0, N_OBS)
})

# Work on a copy so df_custom stays available as the untouched reference frame.
df = df_custom.copy()
print("Original Data:")
display(df.head())

Original Data:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,32.920461,10,Green,False,2023-01-01,22 days,0.386749
1,60.329997,35,Red,False,2023-01-02,39 days,0.258195
2,46.026183,88,Red,True,2023-01-03,52 days,0.335551
3,47.49698,38,Red,True,2023-01-04,57 days,0.655373
4,58.866162,76,Blue,False,2023-01-05,58 days,0.189261


In [3]:
# Derive a {column name: type label} mapping from df. The same dict is passed
# to both DataProcessor and GaussianCopulaMethod in the cells below.
# In the recorded output the labels are 'numerical', 'categorical', 'boolean',
# 'datetime' and 'timedelta'.
metadata = MissingDataHandler.get_column_dtypes(df)
print(metadata)

{'numeric_col1': 'numerical', 'numeric_col2': 'numerical', 'categorical_col': 'categorical', 'boolean_col': 'boolean', 'datetime_col': 'datetime', 'timedelta_col': 'timedelta', 'float_col': 'numerical'}


In [4]:
# Create a DataProcessor from the column-type metadata and run its
# preprocessing step on the original frame.
# In the recorded output every column comes back numeric: categories appear as
# integer codes (Blue=0, Green=1, Red=2 in that run), booleans as 0/1, and the
# datetime/timedelta columns as large floats (presumably seconds -- e.g.
# "22 days" shows up as 1900800.0 = 22 * 86400).
# `processor` is kept because the same instance is used later for postprocess().
processor = DataProcessor(metadata)
processed_data = processor.preprocess(df)
print("Processed Data:")
display(processed_data.head())

Processed Data:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,32.920461,10.0,1,0,1672531000.0,1900800.0,0.386749
1,60.329997,35.0,2,0,1672618000.0,3369600.0,0.258195
2,46.026183,88.0,2,1,1672704000.0,4492800.0,0.335551
3,47.49698,38.0,2,1,1672790000.0,4924800.0,0.655373
4,58.866162,76.0,0,0,1672877000.0,5011200.0,0.189261


In [5]:
# Create the Gaussian-copula synthesizer with the same metadata dict that was
# given to DataProcessor.
gaussian_copula = GaussianCopulaMethod(metadata)
# Fit on the preprocessed (all-numeric) frame rather than on the raw df.
# The recorded log shows the fit going through copulas' GaussianMultivariate
# with a BetaUnivariate marginal for every column, including the encoded
# categorical and boolean ones.
gaussian_copula.fit(processed_data)

INFO:copulas.multivariate.gaussian:Fitting GaussianMultivariate(distribution="{'numeric_col1': <class 'copulas.univariate.beta.BetaUnivariate'>, 'numeric_col2': <class 'copulas.univariate.beta.BetaUnivariate'>, 'categorical_col': <class 'copulas.univariate.beta.BetaUnivariate'>, 'boolean_col': <class 'copulas.univariate.beta.BetaUnivariate'>, 'datetime_col': <class 'copulas.univariate.beta.BetaUnivariate'>, 'timedelta_col': <class 'copulas.univariate.beta.BetaUnivariate'>, 'float_col': <class 'copulas.univariate.beta.BetaUnivariate'>}")


In [6]:
# Draw 100 synthetic rows from the fitted model.
# The result is still in the preprocessed numeric encoding: in the recorded
# output the categorical and boolean columns hold fractional values
# (e.g. 1.987888, 0.607119) rather than exact codes, so it must go through
# postprocessing before it resembles the original data.
# NOTE(review): no seed is passed here, so the sampled rows presumably differ
# between runs -- confirm whether sample() or the model accepts a seed.
synthetic_processed = gaussian_copula.sample(100)
print("Synthetic Processed Data:")
display(synthetic_processed.head())

Synthetic Processed Data:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,40.124922,7.104571,0.000353,0.113361,1673414000.0,1535918.0,0.885532
1,46.825093,72.413819,1.987888,0.338933,1674592000.0,2768494.0,0.727952
2,32.821198,60.09336,1.995752,0.607119,1678090000.0,6710158.0,0.865678
3,52.904942,85.644939,1.735633,0.485267,1675706000.0,3350090.0,0.686321
4,70.750467,58.855272,0.601378,0.001026,1679308000.0,4895207.0,0.631459


In [7]:
# Map the synthetic rows back to the original representation, using the same
# processor instance that performed the preprocessing.
# In the recorded output categories are labels again, booleans are True/False,
# and the datetime/timedelta columns are timestamps/durations.
# NOTE(review): numeric_col2 was integer-valued in the original data but stays
# fractional here (e.g. 7.104571) -- round it if integers are required.
# NOTE(review): in the recorded run the categorical code 1.987888 came back as
# "Green" (code 1) rather than "Red" (code 2), which looks like truncation
# instead of rounding and would under-represent the highest code -- confirm
# against DataProcessor.postprocess.
synthetic_data = processor.postprocess(synthetic_processed)
print("Synthetic Data in Original Format:")
display(synthetic_data.head())

Synthetic Data in Original Format:


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,40.124922,7.104571,Blue,False,2023-01-11 05:14:49.736002922,17 days 18:38:38.117568051,0.885532
1,46.825093,72.413819,Green,False,2023-01-24 20:31:30.598692894,32 days 01:01:33.949249402,0.727952
2,32.821198,60.09336,Green,True,2023-03-06 08:09:31.659742117,77 days 15:55:58.228138579,0.865678
3,52.904942,85.644939,Green,False,2023-02-06 17:56:33.571827650,38 days 18:34:50.462733512,0.686321
4,70.750467,58.855272,Blue,False,2023-03-20 10:32:27.117696762,56 days 15:46:47.078806655,0.631459
