In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [3]:
# Load the demographic table. Later cells read the columns PtID, Sex, age_group,
# treatment_group, a1c_mean_group, AgeAsofEnrollDt and hba1c from it.
demographic_csv_path = "Lynch2022_demographic.csv"
df = pd.read_csv(demographic_csv_path)

In [4]:
# Digit contributed by each recognised category, listed from the most- to the
# least-significant position of the resulting label.
_UNKNOWN_DIGIT = 9
_LABEL_DIGITS = (
    ("Sex", {"F": 1, "M": 2}),
    ("age_group", {"<10": 0, "<20": 1, "20-65": 2, "≥65": 3}),
    ("treatment_group", {"MDI": 0, "Basic Pump": 1, "SAP": 2, "CLP": 3, "BP": 4}),
    ("a1c_mean_group", {"<7": 0, "<8.5": 1, "≥8.5": 2}),
)


def change_to_label(rows):
    """Encode one row's demographic categories as a four-digit integer label.

    The digits are, from left to right: Sex, age_group, treatment_group and
    a1c_mean_group. A missing value (as judged by ``pd.isna``) or a value that
    is not one of the recognised categories contributes the digit 9.

    Parameters
    ----------
    rows : mapping-like (e.g. a DataFrame row)
        Must support ``rows[key]`` for the four column names above.

    Returns
    -------
    int
        For example ``1000`` for ("F", "<10", "MDI", "<7") and ``9999`` when
        all four fields are missing.
    """
    label = 0
    for column, digits in _LABEL_DIGITS:
        value = rows[column]
        if pd.isna(value):
            digit = _UNKNOWN_DIGIT
        else:
            digit = digits.get(value, _UNKNOWN_DIGIT)
        # Shift the digits collected so far one place left, then append this one.
        label = label * 10 + digit
    return label

In [5]:
# Label every row from the raw values, i.e. before any imputation is applied.
df["non_imputed_label"] = df.apply(change_to_label, axis="columns")

# Imputation: the cells below fill in missing Sex / treatment_group values.

In [6]:
# Work on a copy holding only the identifier and the features used for imputation.
imputation_columns = ["PtID", "AgeAsofEnrollDt", "Sex", "treatment_group", "hba1c"]
df_needed = df[imputation_columns].copy().reset_index(drop=True)

# Treat the combined "Basic Pump/SAP" value as missing so that it is imputed
# along with the genuinely missing entries.
combined_group_as_missing = {"Basic Pump/SAP": pd.NA}
df_needed["treatment_group"] = df_needed["treatment_group"].replace(
    combined_group_as_missing
)

# Turn every pd.NA in the frame into np.nan.
df_needed = df_needed.replace({pd.NA: np.nan})

In [7]:
# Standardise the numeric features before they are combined with the encoded
# categorical columns and handed to the KNN imputer.
# (StandardScaler is imported in the notebook's import cell.)
numeric_df = df_needed[["AgeAsofEnrollDt", "hba1c"]]

# Initialize the scaler
scaler = StandardScaler()

# Normalize the data
normalized_data = scaler.fit_transform(numeric_df)

# fit_transform returns a bare array, so restore the column names and keep the
# source index: later cells combine frames by index.
normalized_df = pd.DataFrame(
    normalized_data, columns=numeric_df.columns, index=numeric_df.index
)

In [8]:
# Encode categorical features.
# The encoder is created with default settings here; it is fitted on the
# Sex / treatment_group columns in the next cell and reused later to decode
# the imputed codes back to category names.
encoder = OrdinalEncoder()

In [9]:
# Fit the encoder on the two categorical columns and keep the numeric codes in
# a frame with the same column names.
df_category = df_needed[["Sex", "treatment_group"]]
encoded_values = encoder.fit_transform(df_category)
df_category_encoded = pd.DataFrame(encoded_values, columns=df_category.columns)

In [10]:
# Show the categories learned per column (Sex first, then treatment_group);
# a category's position in its array is its ordinal code.
encoder.categories_

[array(['F', 'M'], dtype=object),
 array(['BP', 'Basic Pump', 'CLP', 'MDI', 'SAP', nan], dtype=object)]

In [11]:
# Put the scaled numeric features and the encoded categorical features side by
# side, matching rows on the index.
df_other = normalized_df.merge(
    df_category_encoded,
    left_index=True,
    right_index=True,
)

In [12]:
# Each missing entry is filled from its 3 nearest rows.
n_neighbors = 3
imputer = KNNImputer(n_neighbors=n_neighbors)

In [13]:
# Run the imputation on the combined feature matrix, then restore the column
# names that the returned array does not carry.
imputed_values = imputer.fit_transform(df_other)
df_imputed = pd.DataFrame(imputed_values, columns=df_other.columns)

In [14]:
# Decode back to original categories.
# KNNImputer fills a gap with the mean of the neighbours' ordinal codes, which
# is usually fractional. OrdinalEncoder.inverse_transform casts the codes to
# int, which truncates toward zero, so e.g. 0.67 would decode as category 0
# although most neighbours are category 1. Round to the nearest code first;
# codes that were observed (not imputed) are already whole numbers and are
# unaffected.
# NOTE(review): treatment_group is nominal, so an averaged code can still land
# on a category none of the neighbours has; a mode-based imputation would
# avoid that — TODO confirm whether this matters for the analysis.
df_imputed_category = df_imputed[["Sex", "treatment_group"]].round()


df_final_category = pd.DataFrame(
    encoder.inverse_transform(df_imputed_category), columns=df_imputed_category.columns
)

In [None]:
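# Disabled: the imputed numeric columns are not written back to the final table.
# (The scaled column in df_imputed is named "hba1c", not "avg_hba1c".)
#df_imputed_numeric = df_imputed[["AgeAsofEnrollDt", "hba1c"]]
# Reverse normalization
#denormalized_data = scaler.inverse_transform(df_imputed_numeric)

# Convert back to DataFrame
#denormalized_df = pd.DataFrame(denormalized_data, columns=df_imputed_numeric.columns)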

In [15]:
# Re-attach the participant identifier to the decoded categorical columns,
# matching rows on the index.
df_id = df_needed[["PtID"]]
df_final = df_id.merge(
    df_final_category,
    left_index=True,
    right_index=True,
)

In [16]:
# Bring the imputed columns into the main table. Columns present on both sides
# keep their plain name for the imputed version, while the original version
# receives an "_old" suffix.
df = df.merge(df_final, how="left", on="PtID", suffixes=("_old", ""))

In [17]:
# Names of the pre-imputation copies created by the merge: every df_final
# column except the first one (PtID), with the "_old" suffix appended.
df_old_columns = [f"{column}_old" for column in df_final.columns[1:]]

In [18]:
# Discard the pre-imputation copies. Rebinding the result instead of using
# inplace=True keeps the step explicit and chainable.
df = df.drop(columns=df_old_columns)

In [20]:
# Recompute the label, this time from the imputed Sex / treatment_group values.
df["label"] = df.apply(change_to_label, axis="columns")

In [21]:
# Load the per-subject segment summary.
segment_summary_csv_path = "lynch2022_subject_segment_summary.csv"
df_segment = pd.read_csv(segment_summary_csv_path)

In [22]:
# Keep only the subject identifier and the ">=7_days" flag.
df_segment = df_segment.loc[:, ["SubjectID", ">=7_days"]]

In [23]:
# Attach the ">=7_days" flag to each participant. This is a left join, so
# participants without a segment row are kept, with a missing flag.
df_merged = pd.merge(
    df,
    df_segment,
    how="left",
    left_on="PtID",
    right_on="SubjectID",
)

In [24]:
# Keep only the rows whose ">=7_days" flag equals 1; rows with a missing flag
# are dropped as well.
keep_mask = df_merged[">=7_days"].eq(1)
df_merged = df_merged.loc[keep_mask]

In [25]:
# Write the imputed, labelled and filtered table, without the index column.
output_csv_path = "lynch_demographic_imputed_labeled.csv"
df_merged.to_csv(output_csv_path, index=False)