In [1]:
import pandas as pd
import numpy as np
from datetime import  timedelta
from synthpop import DataProcessor, GaussianCopulaMethod, MissingDataHandler

In [None]:
# Sample data generation: one column per dtype used in this demo
# (float, integer, categorical string, boolean, datetime, timedelta).
# All random draws come from a single seeded Generator so the data -- and
# therefore every output shown below -- is identical on each
# Restart & Run All.
SEED = 42
N_OBS = 100  # number of observations (rows)
rng = np.random.default_rng(SEED)

dates = pd.date_range("2023-01-01", periods=N_OBS, freq="D")
bool_values = rng.choice([True, False], size=N_OBS)
# Durations of 1-99 whole days (the upper bound of `integers` is exclusive).
# int() turns each NumPy integer into a plain Python int for timedelta().
timedeltas = [timedelta(days=int(i)) for i in rng.integers(1, 100, N_OBS)]

df_custom = pd.DataFrame({
    "numeric_col1": rng.normal(50, 10, N_OBS),
    "numeric_col2": rng.integers(0, 100, N_OBS),
    "categorical_col": rng.choice(["Red", "Green", "Blue"], size=N_OBS),
    "boolean_col": bool_values,
    "datetime_col": dates,
    "timedelta_col": timedeltas,
    "float_col": rng.uniform(0.0, 1.0, N_OBS)
})

# Work on a copy so df_custom keeps the untouched generated data.
df = df_custom.copy()
print("Original Data:")
display(df.head())

In [None]:
# Get the per-column dtype metadata for df.
# The method is called on an instance rather than on the class itself: that
# works whether get_column_dtypes is an instance, static, or class method,
# whereas a class-level call would bind df to `self` if it is an ordinary
# instance method.
# NOTE(review): assumes MissingDataHandler() takes no constructor arguments -- confirm.
metadata = MissingDataHandler().get_column_dtypes(df)
print(metadata)

In [None]:
# Build a DataProcessor from the column metadata and run its preprocessing
# step on df. The processor object is kept because the final cell calls
# processor.postprocess() on the synthetic output.
processor = DataProcessor(metadata)
processed_data = processor.preprocess(df)

# Preview the first rows of the preprocessed frame.
processed_preview = processed_data.head()
print("Processed Data:")
display(processed_preview)

In [None]:
# Create the Gaussian Copula model, configured with the same metadata that
# was passed to the DataProcessor.
gaussian_copula = GaussianCopulaMethod(metadata)

# Fit on the *preprocessed* data (not the raw df). The call is left as the
# cell's last expression, so any value it returns is shown as cell output.
gaussian_copula.fit(processed_data)

In [None]:
# Draw synthetic observations from the fitted model. The result is labelled
# "processed" because it is passed through processor.postprocess() afterwards.
n_synthetic = 100  # number of synthetic observations to generate
synthetic_processed = gaussian_copula.sample(n_synthetic)

# Preview the first rows of the sampled frame.
synthetic_processed_preview = synthetic_processed.head()
print("Synthetic Processed Data:")
display(synthetic_processed_preview)

In [None]:
# Pass the sampled data through the postprocess step of the same
# DataProcessor that preprocessed df, to map it back to the original format.
synthetic_data = processor.postprocess(synthetic_processed)

# Preview the first rows of the final synthetic dataset.
synthetic_preview = synthetic_data.head()
print("Synthetic Data in Original Format:")
display(synthetic_preview)