In [1]:
import pandas as pd
import numpy as np
from datetime import  timedelta
from synthpop import MissingDataHandler

In [2]:
dates = pd.date_range("2023-01-01", periods=50, freq="D")
bool_values = np.random.choice([True, False], size=50)
timedeltas = timedeltas = [timedelta(days=int(i)) for i in np.random.randint(1, 100, 50)]
df_custom = pd.DataFrame({
    "numeric_col1": np.random.normal(50, 10, 50),
    "numeric_col2": np.random.randint(0, 100, 50),
    "categorical_col": np.random.choice(["Red", "Green", "Blue"], size=50),
    "boolean_col": bool_values,
    "datetime_col": dates,
    "timedelta_col": timedeltas,
    "float_col": np.random.uniform(0.0, 1.0, 50)
})


df = df_custom.copy()
df.head()


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,52.800535,87,Blue,False,2023-01-01,23 days,0.157993
1,47.191847,15,Blue,False,2023-01-02,41 days,0.845319
2,43.776269,66,Red,True,2023-01-03,81 days,0.163621
3,35.839315,36,Red,True,2023-01-04,49 days,0.372961
4,59.9435,76,Green,False,2023-01-05,91 days,0.029968


In [3]:
np.random.seed(42)  # For reproducibility

def introduce_missingness(dataframe, missing_frac=0.1):
    """Randomly set a fraction of each column's values to NaN."""
    df_with_nans = dataframe.copy()
    rows = len(df_with_nans)
    for col in df_with_nans.columns:
        n_missing = int(rows * missing_frac)
        missing_indices = np.random.choice(df_with_nans.index, n_missing, replace=False)
        df_with_nans.loc[missing_indices, col] = np.nan
    return df_with_nans

df_missing = introduce_missingness(df, missing_frac=0.2)  # 20% missingness
df_missing.head(10)

  df_with_nans.loc[missing_indices, col] = np.nan


Unnamed: 0,numeric_col1,numeric_col2,categorical_col,boolean_col,datetime_col,timedelta_col,float_col
0,52.800535,,Blue,False,2023-01-01,23 days,0.157993
1,47.191847,15.0,,False,2023-01-02,41 days,0.845319
2,43.776269,,Red,True,2023-01-03,81 days,0.163621
3,35.839315,36.0,,,2023-01-04,49 days,0.372961
4,59.9435,76.0,Green,False,2023-01-05,91 days,0.029968
5,54.064506,81.0,,True,2023-01-06,41 days,0.098634
6,57.936702,13.0,Red,True,NaT,57 days,0.424857
7,60.488936,60.0,Red,True,NaT,98 days,0.581491
8,61.043441,65.0,Blue,True,2023-01-09,42 days,
9,53.017574,1.0,Red,False,NaT,NaT,


In [4]:
md_handler = MissingDataHandler()

# Check the data types
column_dtypes = md_handler.get_column_dtypes(df_missing)
print("Column Data Types:", column_dtypes)

# Detect missingness
missingness_dict = md_handler.detect_missingness(df_missing)
print("Detected Missingness Type:", missingness_dict)

Column Data Types: {'numeric_col1': 'numerical', 'numeric_col2': 'numerical', 'categorical_col': 'categorical', 'boolean_col': 'categorical', 'datetime_col': 'datetime', 'timedelta_col': 'timedelta', 'float_col': 'numerical'}
Detected Missingness Type: {'numeric_col1': 'MCAR', 'numeric_col2': 'MCAR', 'categorical_col': 'MCAR', 'boolean_col': 'MCAR', 'datetime_col': 'MCAR', 'timedelta_col': 'MAR', 'float_col': 'MCAR'}


In [5]:
df_imputed = md_handler.apply_imputation(df_missing, missingness_dict)

print("Before Imputation:\n", df_missing.head(10))


Before Imputation:
    numeric_col1  numeric_col2 categorical_col boolean_col datetime_col  \
0     52.800535           NaN            Blue       False   2023-01-01   
1     47.191847          15.0             NaN       False   2023-01-02   
2     43.776269           NaN             Red        True   2023-01-03   
3     35.839315          36.0             NaN         NaN   2023-01-04   
4     59.943500          76.0           Green       False   2023-01-05   
5     54.064506          81.0             NaN        True   2023-01-06   
6     57.936702          13.0             Red        True          NaT   
7     60.488936          60.0             Red        True          NaT   
8     61.043441          65.0            Blue        True   2023-01-09   
9     53.017574           1.0             Red       False          NaT   

  timedelta_col  float_col  
0       23 days   0.157993  
1       41 days   0.845319  
2       81 days   0.163621  
3       49 days   0.372961  
4       91 days   0.

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [6]:
print("\nAfter Imputation:\n", df_imputed.head(10))


After Imputation:
    numeric_col1  numeric_col2 categorical_col  boolean_col  \
0     52.800535        46.325            Blue        False   
1     47.191847        15.000            Blue        False   
2     43.776269        46.325             Red         True   
3     35.839315        36.000            Blue         True   
4     59.943500        76.000           Green        False   
5     54.064506        81.000            Blue         True   
6     57.936702        13.000             Red         True   
7     60.488936        60.000             Red         True   
8     61.043441        65.000            Blue         True   
9     53.017574         1.000             Red        False   

         datetime_col    timedelta_col  float_col  
0 2023-01-01 00:00:00 23 days 00:00:00   0.157993  
1 2023-01-02 00:00:00 41 days 00:00:00   0.845319  
2 2023-01-03 00:00:00 81 days 00:00:00   0.163621  
3 2023-01-04 00:00:00 49 days 00:00:00   0.372961  
4 2023-01-05 00:00:00 91 days 00:00:0