## ADNI imputation 

In [6]:
from google.colab import drive
drive.mount('/content/drive')

!pip install miceforest
import numpy as np
import pandas as pd
import random 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import miceforest as mf
from sklearn_pandas import DataFrameMapper
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
def preprocess_data(df):
    features = ['ABETA', 'AGE', 'PTGENDER', 'PTEDUCAT', 'PTETHCAT', 'PTRACCAT'
                , 'PTMARRY', 'APOE4', 'FDG', 'PIB', 'AV45', 'CDRSB', 'ADAS11', 'ADAS13'
                , 'MMSE', 'RAVLT_immediate', 'FAQ', 'MOCA', 'EcogPtMem', 'EcogPtLang'
                , 'EcogPtVisspat', 'mPACCdigit', 'EcogPtPlan', 'EcogPtOrgan', 'EcogPtDivatt'
                , 'EcogSPMem', 'EcogPtTotal', 'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan'
                , 'EcogSPOrgan', 'EcogSPDivatt', 'EcogSPTotal', 'Ventricles', 'Hippocampus'
                , 'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV'
                , 'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting', 'LDELTOTAL'
                , 'DIGITSCOR', 'TRABSCOR', 'mPACCtrailsB', 'ADASQ4', 'PTAU', 'TAU', 'Month_bl']

    df = df[features]  

    # drop low occurring, ambiguous rows
    df = df.drop(df.index[df['ABETA'] == '>1700'])
    df = df.drop(df.index[df['ABETA'] == '<200'])
    df = df.drop(df.index[df['PTAU'] == '>120'])
    df = df.drop(df.index[df['PTAU'] == '<8'])

    # decide on categorical and numerical features
    cols_categorical = ['PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']
    cols_numerical = [feat for feat in df.columns if feat not in cols_categorical]
    xtick_label = cols_categorical + cols_numerical

    # resolve errors by specifying each column into numerical or category
    for c in cols_numerical:
        df[c] = df[c].astype(float)

    for c in cols_categorical:
        df[c] = df[c].astype('category')

    # normalize numerical data and encode categorical data 
    numerical = [([col], StandardScaler()) for col in cols_numerical]
    categorical = [([col], LabelEncoder()) for col in cols_categorical]

    df_test = df.sample(frac=0.2)
    df = df.drop(df_test.index)
    df_val = df.sample(frac=0.2)
    df_train = df.drop(df_val.index)

    x_mapper = DataFrameMapper(categorical + numerical)
    x = x_mapper.fit_transform(df).astype('float32') # fit_transform should be on entire dataset
    x_train = x_mapper.transform(df_train).astype('float32') # fit_transform should be on entire dataset
    x_val = x_mapper.transform(df_val).astype('float32')
    x_test = x_mapper.transform(df_test).astype('float32')
    return x_train, x_val, x_test

def random_mask(data, ratio=0.1):
    # Evaluation is done by masking 10% observed values in validation and test set 
    # np.nan is used for masking
    orig = data.copy()

    not_nan_indices = np.where(~np.isnan(data))  # Find indices where element is not nan
    num_not_nan_indices = len(not_nan_indices[0])  # Count number of not-nan indices
    num_to_choose = int(num_not_nan_indices * ratio)  # Choose 10% of not-nan indices
    tuple_list = list(zip(not_nan_indices[0], not_nan_indices[1]))
    chosen_tuples = random.sample(tuple_list, num_to_choose)

    for i in range(len(chosen_tuples)):
        data[chosen_tuples[i][0],chosen_tuples[i][1]] = np.nan

    return orig, data, chosen_tuples

# testing on 51 features 
df = pd.read_csv('/content/drive/MyDrive/survival/ADNIMERGE.csv')
x_train, x_val, x_test = preprocess_data(df)
orig_val, x_val, masked_indices_val = random_mask(x_val, ratio=0.1)
orig_test, x_test, masked_indices_test = random_mask(x_test, ratio=0.1)
'''
# MICEFOREST
kds = mf.ImputationKernel(
    x_train,
    save_all_iterations=True,
    random_state=0
)

kds.mice()
miceforest_imputed_train = kds.complete_data(x_train)
miceforest_imputed_val = kds.impute_new_data(x_val)
miceforest_imputed_test = kds.impute_new_data(x_test)
'''
# MICE
mice_imputer = IterativeImputer(max_iter=10, random_state=0)
mice_imputed_train = mice_imputer.fit_transform(x_train)
mice_imputed_val = mice_imputer.transform(x_val)
mice_imputed_test = mice_imputer.transform(x_test)

# KNN
knn_imputer = KNNImputer()
knn_imputed_train = knn_imputer.fit_transform(x_train)
knn_imputed_val = knn_imputer.transform(x_val)
knn_imputed_test = knn_imputer.transform(x_test)

# Evaluate MSE for x_val, x_test
mse_mice = 0.0
mse_knn = 0.0
mse_miceforest = 0.0
for tup in masked_indices_val:
    row, col = tup
    mse_mice += (mice_imputed_val[row,col] - orig_val[row,col])**2
    mse_knn += (knn_imputed_val[row,col] - orig_val[row,col])**2

print(f'----MSE for validation----')
print(f'mice: {mse_mice/len(masked_indices_val)}')
print(f'knn: {mse_knn/len(masked_indices_val)}')

mse_mice = 0.0
mse_knn = 0.0
mse_miceforest = 0.0
for tup in masked_indices_test:
    row, col = tup
    mse_mice += (mice_imputed_test[row,col] - orig_test[row,col])**2
    mse_knn += (knn_imputed_test[row,col] - orig_test[row,col])**2

print(f'----MSE for test----')
print(f'mice: {mse_mice/len(masked_indices_test)}')
print(f'knn: {mse_knn/len(masked_indices_test)}')


  df = pd.read_csv('/content/drive/MyDrive/survival/ADNIMERGE.csv')
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


----MSE for validation----
mice: 9293.957890635918
knn: 0.41283268640947435
----MSE for test----
mice: 4908.220177351426
knn: 0.43768168699428595
