In [1]:
import pandas as pd
import numpy as np
from datetime import  timedelta
from synthpop import MissingDataHandler

In [2]:
dates = pd.date_range("2023-01-01", periods=50, freq="D")
bool_values = np.random.choice([True, False], size=50)
timedeltas = timedeltas = [timedelta(days=int(i)) for i in np.random.randint(1, 100, 50)]
df_custom = pd.DataFrame({
    "numeric_col1": np.random.normal(50, 10, 50),
    "numeric_col2": np.random.randint(0, 100, 50),
    "categorical_col": np.random.choice(["Red", "Green", "Blue"], size=50),
    "boolean_col": bool_values,
    "datetime_col": dates,
    "timedelta_col": timedeltas,
    "float_col": np.random.uniform(0.0, 1.0, 50)
})


df = df_custom.copy()
df.head()


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,45.294008,65,Blue,True,2023-01-01,29 days,0.996676
1,51.141191,46,Red,True,2023-01-02,60 days,0.216778
2,49.396039,61,Blue,False,2023-01-03,31 days,0.147983
3,34.580348,81,Red,True,2023-01-04,11 days,0.70363
4,41.355316,97,Green,True,2023-01-05,10 days,0.898133


In [3]:
np.random.seed(42)  # For reproducibility

def introduce_missingness(dataframe, missing_frac=0.1):
    """Randomly set a fraction of each column's values to NaN."""
    df_with_nans = dataframe.copy()
    rows = len(df_with_nans)
    for col in df_with_nans.columns:
        n_missing = int(rows * missing_frac)
        missing_indices = np.random.choice(df_with_nans.index, n_missing, replace=False)
        df_with_nans.loc[missing_indices, col] = np.nan
    return df_with_nans

df_missing = introduce_missingness(df, missing_frac=0.2)  # 20% missingness
df_missing.head(10)

  df_with_nans.loc[missing_indices, col] = np.nan


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,45.294008,,Blue,True,2023-01-01,29 days,0.996676
1,51.141191,46.0,,True,2023-01-02,60 days,0.216778
2,49.396039,,Blue,False,2023-01-03,31 days,0.147983
3,34.580348,81.0,,,2023-01-04,11 days,0.70363
4,41.355316,97.0,Green,True,2023-01-05,10 days,0.898133
5,62.987176,65.0,,True,2023-01-06,69 days,0.495813
6,50.164584,9.0,Blue,False,NaT,30 days,0.681491
7,61.313888,95.0,Red,True,NaT,11 days,0.497654
8,69.935612,56.0,Blue,True,2023-01-09,71 days,
9,52.043996,47.0,Green,False,NaT,NaT,


In [4]:
md_handler = MissingDataHandler()

# Check the data types
column_dtypes = md_handler.get_column_dtypes(df_missing)
print("Column Data Types:", column_dtypes)

# Detect missingness
missingness_dict = md_handler.detect_missingness(df_missing)
print("Detected Missingness Type:", missingness_dict)

Column Data Types: {'numeric_col1': 'numerical', 'numeric_col2': 'numerical', 'categorical_col': 'categorical', 'boolean_col': 'categorical', 'datetime_col': 'datetime', 'timedelta_col': 'timedelta', 'float_col': 'numerical'}
Detected Missingness Type: {'numeric_col1': 'MCAR', 'numeric_col2': 'MAR', 'categorical_col': 'MAR', 'boolean_col': 'MCAR', 'datetime_col': 'MCAR', 'timedelta_col': 'MAR', 'float_col': 'MCAR'}


In [5]:
df_imputed = md_handler.apply_imputation(df_missing, missingness_dict)

print("Before Imputation:\n", df_missing.head(10))


Before Imputation:
    numeric_col1  numeric_col2 categorical_col boolean_col datetime_col  \
0     45.294008           NaN            Blue        True   2023-01-01   
1     51.141191          46.0             NaN        True   2023-01-02   
2     49.396039           NaN            Blue       False   2023-01-03   
3     34.580348          81.0             NaN         NaN   2023-01-04   
4     41.355316          97.0           Green        True   2023-01-05   
5     62.987176          65.0             NaN        True   2023-01-06   
6     50.164584           9.0            Blue       False          NaT   
7     61.313888          95.0             Red        True          NaT   
8     69.935612          56.0            Blue        True   2023-01-09   
9     52.043996          47.0           Green       False          NaT   

  timedelta_col  float_col  
0       29 days   0.996676  
1       60 days   0.216778  
2       31 days   0.147983  
3       11 days   0.703630  
4       10 days   0.

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [6]:
print("\nAfter Imputation:\n", df_imputed.head(10))


After Imputation:
    numeric_col1  numeric_col2 categorical_col  boolean_col  \
0     45.294008        48.175            Blue         True   
1     51.141191        46.000            Blue         True   
2     49.396039        48.175            Blue        False   
3     34.580348        81.000           Green         True   
4     41.355316        97.000           Green         True   
5     62.987176        65.000            Blue         True   
6     50.164584         9.000            Blue        False   
7     61.313888        95.000             Red         True   
8     69.935612        56.000            Blue         True   
9     52.043996        47.000           Green        False   

         datetime_col    timedelta_col  float_col  
0 2023-01-01 00:00:00 29 days 00:00:00   0.996676  
1 2023-01-02 00:00:00 60 days 00:00:00   0.216778  
2 2023-01-03 00:00:00 31 days 00:00:00   0.147983  
3 2023-01-04 00:00:00 11 days 00:00:00   0.703630  
4 2023-01-05 00:00:00 10 days 00:00:0