# Missing Data Analysis

### Loading necessary packages & data

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load the full dataset into `data_full`. Later cells add missingness
# indicator columns to this frame and filter it into subsets.
# NOTE(review): the path shown here is a placeholder ("...") — point it at the real workbook.
data_full = pd.read_excel(r"...")

### Data MCAR: Little's test

In [None]:
import pandas as pd
import numpy as np
from pyampute.exploration.mcar_statistical_tests import MCARTest

# Read the dataset that Little's test is run on.
data = pd.read_excel(r"Data")

# Run Little's MCAR test and show whatever the test returns.
mt = MCARTest(method="little")
little_result = mt.little_mcar_test(data)
print(little_result)

### Association of missingness in EQ-5D with observed variables

In [None]:
# Indicator column on the full frame: 1 where B_Index is absent, 0 where it is observed.
data_full['B_Index_missing'] = data_full['B_Index'].isna().astype(int)

# Keep only rows with an observed KPS value and report how many rows remain.
df_complete_KPSAGE = data_full.dropna(subset=['KPS'])
print(f"Original data: {len(data_full)} rows")
print(f"Filtered data: {len(df_complete_KPSAGE)} rows")

# Observed variables used to explain missingness of B_Index.
predictor_cols = [
    "Age",
    "Gender",
    "CCI_YN",
    "KPS",
    "Katagiri_Group",
    "Tumor C-level",
    "Tumor T-level",
    "Tumor L-level",
    "Tumor S-level",
    "Visceral",
    "Brain",
]

# Design matrix with an intercept column, and the binary missingness outcome.
X = sm.add_constant(df_complete_KPSAGE[predictor_cols])
y = df_complete_KPSAGE['B_Index_missing']

# Logistic regression of the missingness indicator on the observed variables.
model = sm.Logit(y, X).fit()

# Coefficient table with p-values and fit statistics.
print(model.summary())

In [None]:
# Flag missing M3_Index values on the full frame, the same way B_Index_missing
# is created. A later cell filters data_full on 'M3_Index_missing', so the
# column has to exist on data_full and not only on the KPS-complete subset.
data_full['M3_Index_missing'] = data_full['M3_Index'].isnull().astype(int)

# Re-derive the KPS-complete subset as an explicit copy so it carries the new
# indicator column; assigning the column onto the existing derived frame can
# trigger pandas' SettingWithCopyWarning.
df_complete_KPSAGE = data_full.dropna(subset=['KPS']).copy()

# Same observed predictors as in the B_Index missingness model.
# NOTE(review): only KPS is filtered for completeness here — confirm the other
# predictors contain no missing values before interpreting the fit.
X = df_complete_KPSAGE[["Age", "Gender", "CCI_YN", "KPS", "Katagiri_Group","Tumor C-level", "Tumor T-level", "Tumor L-level", "Tumor S-level", "Visceral", "Brain"]]
X = sm.add_constant(X)  # add the intercept column
y = df_complete_KPSAGE['M3_Index_missing']

# Fit logistic regression model
model = sm.Logit(y, X).fit()

# Print summary (includes coefficients, p-values, etc.)
print(model.summary())

### Sensitivity analysis: complete-case results vs. imputed results

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, brier_score_loss, make_scorer, get_scorer
from sklearn.model_selection import cross_val_score

# Load the imputed dataset used as the reference for the sensitivity analysis.
# NOTE(review): the path shown here is a placeholder ("...").
data_imputed = pd.read_excel(r"...")

# Binary outcome: 1 when M3_Index exceeds B_Index by at least 0.08, else 0.
# NOTE(review): the difference is a float subtraction, so a change of exactly
# 0.08 may land just below the threshold — confirm this is acceptable.
data_imputed["MCID_Result"] = (data_imputed["M3_Index"] - data_imputed["B_Index"] >= 0.08).astype(int)

# Subsets with an observed B_Index / M3_Index. Filtering on the index columns
# themselves avoids depending on the *_missing indicator columns, which may
# not be present on data_full (the M3 indicator in particular).
data_B = data_full[data_full["B_Index"].notna()]
data_M3 = data_full[data_full["M3_Index"].notna()]

# Column names of the model features and of the outcome (used to index data_imputed).
X = ["B_Index", "Katagiri_Group", "KPS", "Brain", "Opioid"]
y = "MCID_Result"

# Probability-based scorers for Brier score and AUC. The built-in scorers
# replace make_scorer(..., needs_proba=True), whose `needs_proba` argument is
# deprecated in recent scikit-learn releases. "neg_brier_score" returns the
# negated Brier score (higher is better), so callers negate the mean again.
brier_scorer = get_scorer("neg_brier_score")
auc_scorer = get_scorer("roc_auc")

### Performance of full (imputed) dataset

In [None]:
# Random forest classifier evaluated on the imputed dataset.
model_imputed = RandomForestClassifier(random_state=0)

# Select the feature columns and outcome column once (X and y hold column names here).
features_imputed = data_imputed[X]
target_imputed = data_imputed[y]

# 5-fold cross-validated AUC and (negated) Brier score.
auc_scores = cross_val_score(
    model_imputed, features_imputed, target_imputed, scoring=auc_scorer, cv=5
)
brier_scores = cross_val_score(
    model_imputed, features_imputed, target_imputed, scoring=brier_scorer, cv=5
)

# Fit on all rows so feature importances can be reported.
model_imputed.fit(features_imputed, target_imputed)

# Results: the Brier scorer yields negated values, so flip the sign of the mean.
print("Imputed Data Feature Importances:", model_imputed.feature_importances_)
print("Cross-Validated AUC:", np.mean(auc_scores))
print("Cross-Validated Brier Score:", -np.mean(brier_scores))

### Performance on complete baseline EQ-5D-3L

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# MissForest-style imputer: an IterativeImputer whose per-column estimator is a
# random forest regressor.
rf_estimator = RandomForestRegressor(n_estimators=100, random_state=0)
missforest_imputer = IterativeImputer(estimator=rf_estimator, max_iter=10, random_state=0)

# Columns passed through the imputer (model features plus M3_Index).
impute_cols = ["B_Index", "Katagiri_Group", "KPS", "Brain", "Opioid", "M3_Index"]

# Impute the remaining gaps in the rows with an observed B_Index.
B_imputed = pd.DataFrame(
    missforest_imputer.fit_transform(data_B[impute_cols]),
    columns=impute_cols,
)

# The imputer returns floats; round Brain and Opioid back to integers.
for col in ("Brain", "Opioid"):
    B_imputed[col] = B_imputed[col].round().astype(int)

# Features are every imputed column except M3_Index; the outcome is 1 when
# M3_Index exceeds B_Index by at least 0.08.
X = B_imputed.drop(columns="M3_Index")
y = B_imputed["M3_Index"].sub(B_imputed["B_Index"]).ge(0.08).astype(int)

# Random forest classifier for this subset.
model_B = RandomForestClassifier(random_state=0)

# 5-fold cross-validated AUC and (negated) Brier score, then a fit on all rows.
B_auc_scores = cross_val_score(model_B, X, y, scoring=auc_scorer, cv=5)
B_brier_scores = cross_val_score(model_B, X, y, scoring=brier_scorer, cv=5)
model_B.fit(X, y)

# Results: the Brier scorer yields negated values, so flip the sign of the mean.
print("B Complete N = ", len(B_imputed))
print("Feature Importances (Imputed Data):", model_B.feature_importances_)
print("Cross-Validated AUC:", np.mean(B_auc_scores))
print("Cross-Validated Brier Score:", -np.mean(B_brier_scores))

### Performance on complete 3-months EQ-5D-3L

In [None]:
# Columns passed through the imputer (model features plus M3_Index).
m3_impute_cols = ["B_Index", "Katagiri_Group", "KPS", "Brain", "Opioid", "M3_Index"]

# Re-fit the imputer on the rows with an observed M3_Index and fill the remaining gaps.
M3_imputed = pd.DataFrame(
    missforest_imputer.fit_transform(data_M3[m3_impute_cols]),
    columns=m3_impute_cols,
)

# The imputer returns floats; round Brain and Opioid back to integers.
for col in ("Brain", "Opioid"):
    M3_imputed[col] = M3_imputed[col].round().astype(int)

# Features are every imputed column except M3_Index; the outcome is 1 when
# M3_Index exceeds B_Index by at least 0.08.
X = M3_imputed.drop(columns="M3_Index")
y = M3_imputed["M3_Index"].sub(M3_imputed["B_Index"]).ge(0.08).astype(int)

# Random forest classifier for this subset.
model_M3 = RandomForestClassifier(random_state=0)

# 5-fold cross-validated AUC and (negated) Brier score, then a fit on all rows.
M3_auc_scores = cross_val_score(model_M3, X, y, scoring=auc_scorer, cv=5)
M3_brier_scores = cross_val_score(model_M3, X, y, scoring=brier_scorer, cv=5)
model_M3.fit(X, y)

# Results: the Brier scorer yields negated values, so flip the sign of the mean.
print("M3 Complete N = ", len(M3_imputed))
print("Imputed Data Feature Importances:", model_M3.feature_importances_)
print("Cross-Validated AUC:", np.mean(M3_auc_scores))
print("Cross-Validated Brier Score:", -np.mean(M3_brier_scores))