### Impute masked UKBB tabular data

In [23]:
import os
from os.path import join
import random
import pandas as pd
import numpy as np
import torch

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [58]:
DVM_column_names = [ 'Color', 'Bodytype', 'Gearbox','Fuel type' ,
               'Wheelbase', 'Height', 'Width', 'Length', 'Adv year', 'Adv month', 'Reg year', 'Runned miles', 'Price', 'Seat num', 'Door num', 'Entry price', 'Engine size',]
CAD_column_names = torch.load('/bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/cardiac_column_names_reordered.pt')
BASE = '/bigdata/siyi/data'
DATA_TRAIN = join(BASE, 'DVM/features/dvm_features_train_noOH_all_views_physical_jittered_50_reordered.csv')
DATA_TEST = join(BASE, 'DVM/features/dvm_features_test_noOH_all_views_physical_jittered_50_reordered.csv')
ratio = 0.1
DATA_MASK = join(BASE, f'DVM/features/missing_mask/dvm_features_test_noOH_all_views_physical_jittered_50_reordered_dvm_value_{ratio}.npy')

In [59]:
df_train = pd.read_csv(DATA_TRAIN, header=None).astype(float)
df_test = pd.read_csv(DATA_TEST, header=None).astype(float)
mask = np.load(DATA_MASK)
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2.0,10.0,1.0,8.0,-0.729668,1.405839,1.386550,-0.239327,0.012811,-0.350005,0.645417,-0.698105,-0.176431,0.135216,0.618330,-0.374743,-1.148269
1,2.0,3.0,0.0,7.0,1.094360,-2.046870,2.279513,0.835403,-2.370628,1.092766,0.645417,-0.804553,2.494028,-1.059812,-2.402588,4.315906,-0.500312
2,2.0,10.0,0.0,8.0,0.117803,2.048587,-0.649921,0.541419,0.012811,-0.830929,1.339829,-1.197420,0.835140,0.135216,0.618330,0.275969,0.147646
3,20.0,7.0,0.0,1.0,0.471384,1.216311,-0.656346,0.642626,-2.370628,2.054614,0.182476,-0.594089,-0.208745,2.525272,0.618330,-0.145804,-0.370720
4,2.0,5.0,0.0,1.0,-0.117917,-0.588327,0.879038,-0.316437,0.012811,-1.311853,0.413946,-0.594089,-0.095455,0.135216,0.618330,-0.353134,0.147646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88202,20.0,10.0,0.0,8.0,-0.224553,0.425236,1.033219,0.129359,0.012811,0.611842,1.339829,-1.085172,0.794302,0.135216,0.618330,0.284448,-0.629903
88203,1.0,10.0,0.0,6.0,-0.044956,1.331676,1.772002,0.604071,0.012811,-0.350005,1.339829,-1.200928,1.360697,0.135216,0.618330,0.549492,0.795603
88204,20.0,11.0,0.0,6.0,1.268344,-1.033307,-0.213076,1.165533,0.012811,-1.792777,0.645417,-0.856070,0.605809,0.135216,-0.388643,0.576844,0.795603
88205,2.0,5.0,0.0,1.0,-0.061793,-0.415280,1.148854,-0.116432,0.012811,-0.830929,0.645417,0.171581,-0.052567,0.135216,0.618330,-0.325782,0.147646


In [60]:
mask.shape

(88207, 17)

In [61]:
mask

array([[False, False, False, ...,  True,  True, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [ True, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False, False]])

In [62]:
# Mask the test 
df_test_masked = df_test.copy()
df_test_masked.values[mask] = np.nan
df_test_masked

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2.0,10.0,1.0,8.0,,1.405839,1.386550,-0.239327,0.012811,-0.350005,0.645417,-0.698105,-0.176431,0.135216,,,-1.148269
1,2.0,3.0,0.0,7.0,1.094360,-2.046870,2.279513,0.835403,-2.370628,,0.645417,-0.804553,2.494028,-1.059812,-2.402588,4.315906,-0.500312
2,2.0,10.0,0.0,8.0,0.117803,2.048587,-0.649921,0.541419,0.012811,-0.830929,1.339829,-1.197420,0.835140,0.135216,0.618330,0.275969,0.147646
3,20.0,7.0,0.0,1.0,0.471384,1.216311,,,-2.370628,2.054614,0.182476,-0.594089,-0.208745,2.525272,0.618330,-0.145804,
4,,5.0,0.0,1.0,-0.117917,-0.588327,0.879038,-0.316437,0.012811,-1.311853,0.413946,-0.594089,,0.135216,0.618330,-0.353134,0.147646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88202,20.0,10.0,0.0,8.0,-0.224553,,1.033219,0.129359,0.012811,0.611842,1.339829,-1.085172,0.794302,0.135216,,0.284448,-0.629903
88203,1.0,,0.0,6.0,-0.044956,1.331676,1.772002,0.604071,0.012811,-0.350005,1.339829,-1.200928,1.360697,0.135216,0.618330,,0.795603
88204,20.0,11.0,0.0,6.0,1.268344,,-0.213076,1.165533,0.012811,-1.792777,0.645417,-0.856070,0.605809,,-0.388643,0.576844,0.795603
88205,,5.0,0.0,1.0,-0.061793,-0.415280,1.148854,-0.116432,0.012811,-0.830929,0.645417,0.171581,-0.052567,0.135216,0.618330,,0.147646


In [63]:
N_test = df_test.shape[0]
N_train = df_train.shape[0]
df_all = pd.concat([df_train, df_test_masked], axis=0).reset_index(drop=True)

In [64]:
num_cat = 4
cat_cols = df_all.columns[:num_cat].tolist()
cont_cols = df_all.columns[num_cat:].tolist()

cat_imputer = IterativeImputer(
    estimator=RandomForestClassifier(),
    max_iter=10,
    random_state=42,
    initial_strategy="most_frequent",
    skip_complete=False
)
cont_imputer = IterativeImputer(
    max_iter=10,
    random_state=42
)

preprocessor = ColumnTransformer([
    ("cat", cat_imputer, cat_cols),
    ("cont", cont_imputer, cont_cols)
])

df_all_filled = preprocessor.fit_transform(df_all)
df_all_filled = pd.DataFrame(df_all_filled, columns=df_all.columns)



In [65]:
df_test_filled = df_all_filled.iloc[N_train:, :].reset_index(drop=True)
df_test_filled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2.0,10.0,1.0,8.0,-0.065525,1.405839,1.386550,-0.239327,0.012811,-0.350005,0.645417,-0.698105,-0.176431,0.135216,0.731965,-0.514454,-1.148269
1,2.0,3.0,0.0,7.0,1.094360,-2.046870,2.279513,0.835403,-2.370628,0.626440,0.645417,-0.804553,2.494028,-1.059812,-2.402588,4.315906,-0.500312
2,2.0,10.0,0.0,8.0,0.117803,2.048587,-0.649921,0.541419,0.012811,-0.830929,1.339829,-1.197420,0.835140,0.135216,0.618330,0.275969,0.147646
3,20.0,7.0,0.0,1.0,0.471384,1.216311,0.283937,0.447182,-2.370628,2.054614,0.182476,-0.594089,-0.208745,2.525272,0.618330,-0.145804,-0.222059
4,1.0,5.0,0.0,1.0,-0.117917,-0.588327,0.879038,-0.316437,0.012811,-1.311853,0.413946,-0.594089,-0.062848,0.135216,0.618330,-0.353134,0.147646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88202,20.0,10.0,0.0,8.0,-0.224553,0.155504,1.033219,0.129359,0.012811,0.611842,1.339829,-1.085172,0.794302,0.135216,0.198481,0.284448,-0.629903
88203,1.0,10.0,0.0,6.0,-0.044956,1.331676,1.772002,0.604071,0.012811,-0.350005,1.339829,-1.200928,1.360697,0.135216,0.618330,0.961157,0.795603
88204,20.0,11.0,0.0,6.0,1.268344,-0.096372,-0.213076,1.165533,0.012811,-1.792777,0.645417,-0.856070,0.605809,0.154729,-0.388643,0.576844,0.795603
88205,1.0,5.0,0.0,1.0,-0.061793,-0.415280,1.148854,-0.116432,0.012811,-0.830929,0.645417,0.171581,-0.052567,0.135216,0.618330,-0.045213,0.147646


In [66]:
# Use accuracy for categorical columns and MSE for continuous columns to evaluate imputation performance (only on the masked entries)
from sklearn.metrics import mean_squared_error, accuracy_score
acc_list = []
for col in cat_cols:
    true = df_test.iloc[:, col]
    pred = df_test_filled.iloc[:, col]
    mask_col = mask[:, col]
    acc = accuracy_score(true[mask_col], pred[mask_col])
    acc_list.append(acc)
print("Categorical columns accuracy:", np.mean(acc_list))
mse_list = []
for col in cont_cols:
    true = df_test.iloc[:, col]
    pred = df_test_filled.iloc[:, col]
    mask_col = mask[:, col]
    mse = mean_squared_error(true[mask_col], pred[mask_col])
    mse_list.append(mse)
print("Continuous columns MSE:", np.mean(mse_list))
results = pd.DataFrame({
    "Column": DVM_column_names,
    "MSE": [0] * (len(cat_cols)) + mse_list,
    "Accuracy": acc_list + [0] * (len(cont_cols)),
})
results

Categorical columns accuracy: 0.5276038445398044
Continuous columns MSE: 0.45439047442662506


Unnamed: 0,Column,MSE,Accuracy
0,Color,0.0,0.195867
1,Bodytype,0.0,0.493584
2,Gearbox,0.0,0.726841
3,Fuel type,0.0,0.694123
4,Wheelbase,0.223307,0.0
5,Height,0.600911,0.0
6,Width,0.635042,0.0
7,Length,0.196025,0.0
8,Adv year,0.951673,0.0
9,Adv month,0.942565,0.0


In [67]:
# Save the imputed test set and results
df_test_filled.to_csv(DATA_TEST[:-4]+f'_mask_{ratio}_filled.csv', index=False, header=False)
results.to_csv(f'DVM_mask_{ratio}_imputation_results.csv', index=False)