In [1]:
import numpy as np
import pandas as pd
import os

import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import pyampute
import pickle 
import time

from scipy.stats import mstats
from scipy.stats.mstats import winsorize
from scipy import stats
from xgboost import XGBRegressor
from sklearn import tree
from pyampute.ampute import MultivariateAmputation
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from lifelines import CoxPHFitter, WeibullFitter, WeibullAFTFitter
from datetime import datetime, date, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from os.path import isfile, join
from sklearn.metrics import mean_absolute_error, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from statsmodels.gam.tests.test_penalized import df_autos
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import norm
from scipy.spatial import distance
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
import warnings 
warnings.filterwarnings('ignore')

from pyampute.ampute import MultivariateAmputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from app_hyperparameters import init_parameters_decision_tree, init_parameters_xgboost 
from app_hyperparameters import init_parameters_bayesian_ridge, init_parameters_random_forest
from app_stopping_criteria import stop_iteration
from app_uncertainty import uncertainty_sampling, multi_argmax, imputation_uncertainty
from app_uncertainty import EI, remove_outliers
from app_init import init_truncation, init_variable_schema, init_imputation_columns

import miceforest as mf 
import random

# Switch the working directory to the project folder on the shared drive.
os.chdir('H:/Shared drives/CKD_Progression/')

# Project locations, all derived from the drive letter below.
drive = 'H'
main_path = f'{drive}:/Shared drives/CKD_Progression/save/qoc_cohort_ver002.csv'
data_path = f'{drive}:/Shared drives/CKD_Progression/data/'
docs_path = f'{drive}:/Shared drives/CKD_Progression/docs/'
save_path = f'{drive}:/Shared drives/CKD_Progression/save/'

# Auxiliary files that live inside the docs folder.
covariates_path = f'{docs_path}covariates.csv'
removecols_path = f'{docs_path}remove_columns.csv'

In [2]:
# Names of the columns that take part in the imputation (project helper).
columns_impute = init_imputation_columns()

# Read the imputation input, rename `height` to `HIGH`, then keep only the
# identifier, the `modulo_365` column and the columns to impute.
main_data = (
    pd.read_csv(docs_path + 'CKD_TVC_IMPUTATION_14December2024.csv')
    .rename(columns = {'height': 'HIGH'})
)
main_data = main_data[['ENC_HN', 'modulo_365'] + columns_impute]

# Outlier handling is delegated to the project helper: BMI first, then HIGH.
main_data = remove_outliers(remove_outliers(main_data, 'BMI'), 'HIGH')

# Drop every row whose `modulo_365` equals 14.
main_data = main_data[main_data['modulo_365'] != 14]

# Rows with `modulo_365 == 13` get their creatinine replaced by draws from a
# normal distribution using the column's current mean and standard deviation.
# NOTE(review): no random seed is set in the visible code, so these draws
# differ between runs.
mask = main_data['modulo_365'] == 13
guess = np.random.normal(
    loc   = main_data['Renal_Serum_creatinine'].mean(),
    scale = main_data['Renal_Serum_creatinine'].std(),
    size  = int(mask.sum()),
)
main_data.loc[mask, 'Renal_Serum_creatinine'] = guess

In [3]:
def iteration_metrics(data, plot_type, log=False, show=True, export_path=False):
    """Draw one line per row of ``data`` on the current figure, then save/show it.

    Parameters
    ----------
    data : pandas.DataFrame
        Each row is plotted as its own line. Column labels must be convertible
        to ``int``; they are shown 1-based as x tick labels.
    plot_type : str
        Used as the y-axis label and, with spaces replaced by underscores, as
        the exported file name.
    log : bool
        When true, the y axis is switched to a log scale.
    show : bool
        When true the figure is shown; otherwise the current figure is cleared.
    export_path : str or False
        Directory to save ``<plot_type>.png`` into; nothing is saved when falsy.
    """
    for row_position in range(len(data)):
        axis = sns.lineplot(data.iloc[row_position, :])

        # One tick per column, labelled with the 1-based iteration number.
        tick_labels = [str(int(label) + 1) for label in data.columns]
        axis.set_xticks(range(len(data.columns)))
        axis.set_xticklabels(tick_labels)

        axis.set_xlabel("number of iterations")
        axis.set_ylabel(plot_type)
        if log:
            plt.yscale("log")

    if export_path:
        file_stem = "_".join(plot_type.split(" "))
        plt.savefig(export_path + "/" + file_stem + ".png")

    if not show:
        plt.clf()
    else:
        plt.show()

def conv_metric(col, func, convergence_data):
    """Build a table of per-cell statistics from a sequence of imputed frames.

    For every frame in ``convergence_data`` and every row of that frame, one
    output row is produced containing the chosen statistic of each cell in
    columns ``1..end`` (the first column is skipped).

    Parameters
    ----------
    col : str
        Accepted for interface compatibility; it is not used to select data.
        NOTE(review): the per-feature selection appears to have been lost when
        this function was rewritten - confirm the intended behaviour.
    func : {"mean", "std"}
        Statistic applied to each cell via the cell's own ``.mean()`` /
        ``.std()`` method. Previously this argument was ignored and ``"mean"``
        was always used.
    convergence_data : iterable of pandas.DataFrame
        Frames whose cells (from the second column onward) expose ``.mean()``
        and ``.std()`` (e.g. numpy scalars).

    Returns
    -------
    pandas.DataFrame
        One row per (frame, row) pair, one column per input column after the
        first.

    Raises
    ------
    ValueError
        If ``func`` is not ``"mean"`` or ``"std"``.
    """
    reducers = {
        "mean": lambda value: value.mean(),
        "std": lambda value: value.std(),
    }
    # Guard with isinstance first: a list-valued func is unhashable.
    if not isinstance(func, str) or func not in reducers:
        raise ValueError(
            "func must be 'mean' or 'std', got {!r}".format(func)
        )
    reduce_value = reducers[func]

    rows = [
        [reduce_value(imp_data.iloc[row_idx, col_idx])
         for col_idx in range(1, imp_data.shape[1])]
        for imp_data in convergence_data
        for row_idx in range(imp_data.shape[0])
    ]
    return pd.DataFrame(rows)
    
def convergence_plot(features, convergence_data, func = ["mean", "std"], show=False, export_path=False):
    """Plot convergence curves for each feature and each requested statistic.

    Parameters
    ----------
    features : iterable of str
        Feature names; each one is used in the plot label / exported file name.
    convergence_data : iterable of pandas.DataFrame
        Passed through to ``conv_metric``.
    func : str or sequence of str
        A single statistic name (e.g. ``"mean"``) or several. The default list
        used to fail with ``TypeError`` when concatenated into the plot label;
        each entry is now plotted separately.
    show, export_path
        Forwarded unchanged to ``iteration_metrics``.
    """
    # Normalise to a list so a single name and a sequence share one code path.
    statistics = [func] if isinstance(func, str) else list(func)
    for col in features:
        for statistic in statistics:
            temp = conv_metric(col=col, func=statistic, convergence_data=convergence_data)
            iteration_metrics(
                temp,
                plot_type=col + " " + statistic + " convergence",
                show=show,
                export_path=export_path,
            )

def delta_metric(col, delta_change):
    """Gather the rows labelled ``col`` from every frame in ``delta_change``.

    The matching rows are concatenated in input order and the index of the
    result is reset to ``0..n-1``.
    """
    matching_rows = []
    for frame in delta_change:
        is_requested = frame.index == col
        matching_rows.append(frame[is_requested])
    combined = pd.concat(matching_rows)
    return combined.reset_index(drop=True)
    
def delta_plot(features, delta_change, log=False, show=True, export_path=False):
    """Plot the per-iteration delta rows of each feature.

    For every name in ``features`` the matching rows are collected with
    ``delta_metric`` and drawn by ``iteration_metrics`` under the label
    ``"<feature> delta"``; ``log``, ``show`` and ``export_path`` are forwarded.
    """
    for feature in features:
        feature_deltas = delta_metric(col=feature, delta_change=delta_change)
        label = feature + " delta"
        iteration_metrics(
            feature_deltas,
            plot_type=label,
            log=log,
            show=show,
            export_path=export_path,
        )

def density_plot(imputed, features, amputed=False, reference=False, show=False, export_path=False):
    """Overlay kernel-density curves of imputed datasets, one figure per feature.

    Parameters
    ----------
    imputed : sequence
        Indexable collection of datasets; each is indexed by feature name.
        Only the first curve carries the "Imputed data" legend label.
    features : iterable of str
        Columns to plot.
    amputed, reference : dataset or False
        Optional extra curves; skipped when the argument is exactly ``False``.
    show : bool
        Show the figure when true, otherwise clear the current figure.
    export_path : str or False
        Directory that receives ``<feature>.png``; nothing is saved when falsy.
    """
    for col in features:
        if reference is not False:
            sns.kdeplot(reference[col], color="g", fill=True, linewidth=0, alpha=0.8, label="Reference data")
        if amputed is not False:
            sns.kdeplot(amputed[col], color="b", label = "Amputated data")

        for position in range(len(imputed)):
            # Label just the first curve so the legend shows a single entry.
            legend_kwargs = {"label": "Imputed data"} if position == 0 else {}
            sns.kdeplot(imputed[position][col], color="r", alpha=0.5, **legend_kwargs)

        plt.legend()
        if export_path:
            plt.savefig(export_path + "/" + col + ".png")

        if not show:
            plt.clf()
        else:
            plt.show()

In [4]:
def load_imputed(model, convergence, path):
    """Load the pickled result stored at ``<path>/<model>_<convergence>.pickle``.

    Returns whatever object the pickle file contains.
    """
    file_name = "_".join((model, convergence)) + ".pickle"
    pickle_path = "/".join((path, file_name))
    # NOTE: pickle.load can execute arbitrary code - only open trusted files.
    with open(pickle_path, "rb") as stream:
        return pickle.load(stream)

def calculate_summary_statistics(df):
    """Summarise every column of ``df`` by mean, standard deviation, median and IQR.

    Returns a DataFrame indexed by the columns of ``df`` with the columns
    ``Mean``, ``Standard Deviation``, ``Median`` and ``IQR`` (75th minus 25th
    percentile).
    """
    column_mean = df.mean()
    column_std = df.std()
    column_median = df.median()
    upper_quartile = df.quantile(0.75)
    lower_quartile = df.quantile(0.25)

    return pd.DataFrame({
        'Mean': column_mean,
        'Standard Deviation': column_std,
        'Median': column_median,
        'IQR': upper_quartile - lower_quartile,
    })

def export_imputed(model, convergence, features, path, verbose=True):
    """Load a pickled imputation result and export averaged data plus summaries.

    The pickle ``<path>/<model>_<convergence>.pickle`` is loaded through
    ``load_imputed``. Its ``'imputed_data'`` entry (a list of DataFrames) is
    averaged cell-wise across datasets, missing BMI values in the average are
    recomputed from ``BW`` and ``HIGH``, and several CSV files are written to
    ``<path>/<model>_<convergence>/``:

    * ``mean_distribution.csv``   - per-dataset [mean, std] plus their averages and ``rmse``
    * ``median_distribution.csv`` - per-dataset [50%, 25%, 75%] plus their averages and ``rmse``
    * ``summary_statistics.csv``  - summary of the averaged data (columns from the third onward)
    * ``primary_statistics.csv``  - summary of the global ``main_data`` (columns from the second onward)
    * ``IMPUTED_LinearReg.csv``   - the averaged data itself

    Parameters
    ----------
    model, convergence : str
        Identify the pickle file and the output folder.
    features : list of str or None
        Columns to summarise. Previously this argument was ignored and the
        global ``columns_impute`` was always used; ``None`` still falls back to
        that global.
    path : str
        Folder containing the pickle; outputs go to a sub-folder of it.
    verbose : bool
        Print a progress message when true.

    Returns
    -------
    list of pandas.DataFrame
        The ``'imputed_data'`` list from the pickle.

    Notes
    -----
    Relies on the module-level ``main_data`` frame as the reference data.
    """
    print('Exporting for {} model with {} convergence'.format(model, convergence))
    name = model + '_' + convergence
    dst_dir = path + '/' + name + '/'
    if verbose:
        print('Exporting datasets', end = ', ')
    # Creates dst_dir as well as its `datasets/` sub-folder; the CSV files
    # below are written directly into dst_dir.
    os.makedirs(os.path.dirname(dst_dir + 'datasets/'), exist_ok = True)
    imputed_data = load_imputed(model = model, convergence = convergence, path = path)
    df_list = imputed_data['imputed_data']

    columns = list(features) if features is not None else list(columns_impute)

    # --- Cell-wise average across all imputed datasets -----------------------
    # Axis 2 indexes the dataset. The first column (identifier) is taken from
    # the first dataset; all remaining columns are averaged ignoring NaNs.
    stacked = np.stack([df.to_numpy() for df in df_list], axis = 2)
    identifiers = stacked[:, 0, 0]
    mean_values = np.nanmean(stacked[:, 1:, :].astype(float), axis = 2)

    averaged_df = pd.DataFrame(mean_values, columns = df_list[0].columns[1:])
    averaged_df.insert(0, 'ENC_HN', identifiers)

    # Where the averaged BMI is still missing, derive it from weight and height
    # (HIGH is divided by 100 before squaring).
    missing_bmi_mask = averaged_df['BMI'].isna()
    calculated_bmi = (averaged_df.loc[missing_bmi_mask, 'BW']
                      / (averaged_df.loc[missing_bmi_mask, 'HIGH'] / 100) ** 2)
    averaged_df.loc[missing_bmi_mask, 'BMI'] = calculated_bmi

    summary_statistics = calculate_summary_statistics(averaged_df.iloc[:, 2:])
    maindat_statistics = calculate_summary_statistics(main_data.iloc[:, 1:])

    # --- Per-dataset distribution summaries ----------------------------------
    def _rounded(row):
        # Pack one row of statistics into a list rounded to two decimals.
        return [round(value, 2) for value in row]

    mean_summary = []
    median_summary = []
    for complete_baseline in df_list:
        # describe() is computed once and reused for both summaries.
        described = complete_baseline.describe()[columns].T
        mean_summary.append(described[['mean', 'std']].apply(_rounded, axis = 1))
        median_summary.append(described[['50%', '25%', '75%']].apply(_rounded, axis = 1))

    # NOTE(review): this squares the per-dataset *mean* difference to main_data
    # and then averages over datasets; it does not average squared differences.
    # Confirm that this is the intended definition of "rmse".
    squared_mean_diff = pd.concat(
        [((data[columns] - main_data[columns]).mean()) ** 2 for data in df_list],
        axis = 'columns')
    rmse = np.sqrt(squared_mean_diff.sum(1) / len(df_list))

    n_datasets = len(df_list)

    def _average_position(table, position):
        # Average the `position`-th entry of each dataset's list, row by row.
        return table.iloc[:, :n_datasets].apply(
            lambda row: np.mean([entry[position] for entry in row]), axis = 1)

    mean_summary = pd.concat(mean_summary, axis = 'columns')
    mean_summary['mean'] = _average_position(mean_summary, 0)
    mean_summary['sd']   = _average_position(mean_summary, 1)
    mean_summary['rmse'] = rmse

    median_summary = pd.concat(median_summary, axis = 'columns')
    median_summary['median'] = _average_position(median_summary, 0)
    median_summary['25']     = _average_position(median_summary, 1)
    median_summary['75']     = _average_position(median_summary, 2)
    median_summary['rmse']   = rmse

    mean_summary  .to_csv(dst_dir + 'mean_distribution.csv')
    median_summary.to_csv(dst_dir + 'median_distribution.csv')

    summary_statistics.to_csv(dst_dir + 'summary_statistics.csv')
    maindat_statistics.to_csv(dst_dir + 'primary_statistics.csv')
    # NOTE(review): the file name is fixed and does not reflect `model`.
    averaged_df.to_csv(dst_dir + 'IMPUTED_LinearReg.csv', index = False)

    # Plot exports are currently disabled; kept for reference.
    # if verbose:
    #     print('Convergence plots', end = ', ')
    # os.makedirs(os.path.dirname(dst_dir + 'convergence_plot/'), exist_ok = True)
    # convergence_plot(columns, convergence_data = df_list, func = 'mean', show = False, export_path = dst_dir + 'convergence_plot/')

    # if verbose:
    #     print('Density plots', end = ', ')
    # os.makedirs(os.path.dirname(dst_dir + "density_plot/"), exist_ok = True)
    # density_plot(df_list, columns, amputed = main_data, reference = main_data, show = False, export_path = dst_dir + "density_plot/")

    # if convergence != 'maxit':
    #     if verbose:
    #         print('delta plots')
    #     convergence_data = imputed_data['convergence_data'][0][0]
    #     delta_change = imputed_data['iteration_delta'][0]
    #     os.makedirs(os.path.dirname(dst_dir + 'delta_plot/'), exist_ok = True)
    #     delta_plot(columns, delta_change, log = False, show = False, export_path = dst_dir + 'delta_plot/')
    # else:
    #     print('')
    return df_list

In [5]:
# Export the results of the linear model run with the
# 'early_stop_100D_10I' convergence setting.
path = 'H:/Shared drives/CKD_Progression/result/tvc_imputation'
df_list = export_imputed(
    model = 'linear',
    convergence = 'early_stop_100D_10I',
    features = columns_impute,
    path = path,
    verbose = True,
)

Exporting for linear model with early_stop_100D_10I convergence
Exporting datasets, 