In [1]:
import pandas as pd
import numpy as np
from datetime import  timedelta
from synthpop import MissingDataHandler

In [None]:
# Seed NumPy's global RNG *before* the first draw so the synthetic frame is
# identical on every Restart & Run All.
np.random.seed(42)

N_ROWS = 50  # number of synthetic rows; every column is built with this length

dates = pd.date_range("2023-01-01", periods=N_ROWS, freq="D")
bool_values = np.random.choice([True, False], size=N_ROWS)
# randint yields NumPy integers; int() converts each one before handing it to timedelta.
timedeltas = [timedelta(days=int(i)) for i in np.random.randint(1, 100, N_ROWS)]

# One column per dtype family: float, integer, string, boolean, datetime, timedelta.
df_custom = pd.DataFrame({
    "numeric_col1": np.random.normal(50, 10, N_ROWS),
    "numeric_col2": np.random.randint(0, 100, N_ROWS),
    "categorical_col": np.random.choice(["Red", "Green", "Blue"], size=N_ROWS),
    "boolean_col": bool_values,
    "datetime_col": dates,
    "timedelta_col": timedeltas,
    "float_col": np.random.uniform(0.0, 1.0, N_ROWS)
})


# Work on a copy so df_custom stays available as the untouched reference frame.
df = df_custom.copy()
df.head()


In [None]:
# Pin NumPy's global RNG so the random index selection that follows is repeatable.
np.random.seed(seed=42)

def introduce_missingness(dataframe, missing_frac=0.1):
    """Return a copy of ``dataframe`` with a fraction of every column blanked out.

    For each column, ``int(len(dataframe) * missing_frac)`` index labels are
    drawn without replacement from NumPy's global RNG and the values at those
    labels are replaced with a missing marker. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to copy and mask.
    missing_frac : float, default 0.1
        Fraction of rows to blank in each column; the product with the row
        count is truncated by ``int``.

    Returns
    -------
    pandas.DataFrame
        Masked copy. Columns whose dtype cannot hold a missing value are
        upcast by ``Series.mask`` (e.g. integer columns come back as float).

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when ``missing_frac`` asks for
        more rows than exist or for a negative count.
    """
    df_with_nans = dataframe.copy()
    # The per-column count depends only on the frame length, so compute it once.
    n_missing = int(len(df_with_nans) * missing_frac)
    for col in df_with_nans.columns:
        # One draw per column, in column order, so a given seed selects the
        # same labels as a plain per-column np.random.choice loop would.
        missing_indices = np.random.choice(df_with_nans.index, n_missing, replace=False)
        hit = df_with_nans.index.isin(missing_indices)
        # Non-inplace mask upcasts int/bool columns explicitly; assigning NaN
        # through .loc into such columns is a deprecated incompatible-dtype set.
        df_with_nans[col] = df_with_nans[col].mask(hit)
    return df_with_nans

# Blank out 20% of the entries in each column, then preview the first ten rows.
df_missing = introduce_missingness(dataframe=df, missing_frac=0.2)
df_missing.head(n=10)

In [None]:
# Build the handler with no constructor arguments, i.e. whatever its defaults are.
md_handler = MissingDataHandler()

# Check the data types
# The handler's own view of each column's type for the masked frame is printed as-is.
# NOTE(review): the exact structure of the returned value is not visible here —
# presumably a column -> dtype mapping; confirm against the synthpop docs.
column_dtypes = md_handler.get_column_dtypes(df_missing)
print("Column Data Types:", column_dtypes)

# Detect missingness
# The result is kept in missingness_dict because the imputation step consumes it.
# NOTE(review): presumably keyed by column name — confirm against the synthpop docs.
missingness_dict = md_handler.detect_missingness(df_missing)
print("Detected Missingness Type:", missingness_dict)

In [None]:
# Print the pre-imputation snapshot *before* running the imputer, so the
# "before" view cannot be affected by anything apply_imputation does to its input.
# NOTE(review): whether apply_imputation modifies df_missing in place is not
# visible from this notebook; printing first makes the output correct either way.
print("Before Imputation:\n", df_missing.head(10))

# Impute using the per-column missingness information detected earlier.
df_imputed = md_handler.apply_imputation(df_missing, missingness_dict)


In [None]:
# Same ten-row window as the "before" view, for a side-by-side comparison.
imputed_preview = df_imputed.head(10)
print("\nAfter Imputation:\n", imputed_preview)