In [None]:
# Cargamos los datos en pandas
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Directory holding the CSV files; every table is read from here.
data_path = "datos"
# Read the four tables in a fixed order and unpack them into their own names.
supplemental, patient, peptides, proteins = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        "supplemental_clinical_data.csv",
        "train_clinical_data.csv",
        "train_peptides.csv",
        "train_proteins.csv",
    )
)

In [None]:
# Stack the train and supplemental clinical visits into a single table.
# `ignore_index=True` gives every row a unique label; both inputs come from
# `read_csv` with the default 0..n-1 index, so their labels would otherwise overlap.
scaled_patient = pd.concat([patient, supplemental], ignore_index=True)
# Divisor applied to each UPDRS column (presumably the maximum attainable score
# of parts 1-4 -- TODO confirm against the scale definition).
updrs_ranges = [52, 52, 132, 24]
updrs_cols = [f"updrs_{i}" for i in range(1, 5)]
for updrs_range, col in zip(updrs_ranges, updrs_cols):
    scaled_patient[col] = scaled_patient[col] / updrs_range

def log2_min_max_scale(frame, value_col, group_cols):
    """Log2-transform ``value_col`` and min-max scale it within each group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing ``value_col`` and every column in ``group_cols``.
        It is not modified.
    value_col : str
        Name of the numeric column to transform.
    group_cols : str or list of str
        Column(s) defining the groups whose min and max set the scale.

    Returns
    -------
    pandas.DataFrame
        The rows of ``frame`` indexed by ``group_cols``, with ``value_col``
        replaced by ``(log2(x) - group_min) / (group_max - group_min)``.
        The helper ``min`` / ``max`` columns are removed before returning.
    """
    logged = frame.copy()
    logged[value_col] = np.log2(frame[value_col])
    bounds = logged.groupby(group_cols)[value_col].agg(["min", "max"])
    # Join the log-transformed rows, not the raw ones: the bounds are in log2
    # units, so the values being scaled must be in log2 units as well.
    scaled = bounds.join(logged.set_index(group_cols))
    span = scaled["max"] - scaled["min"]
    # A group whose values are all equal has span 0; dividing by 1 instead maps
    # those rows to 0 rather than producing NaN from 0/0.
    span = span.where(span != 0, 1.0)
    scaled[value_col] = (scaled[value_col] - scaled["min"]) / span
    # Drop the helper columns from the frame itself (not from the denominator).
    return scaled.drop(columns=["min", "max"])


# Protein expression scaled per protein.
scaled_protein = log2_min_max_scale(proteins, "NPX", "UniProt")

# Peptide abundance scaled per (protein, peptide) pair.
scaled_peptide = log2_min_max_scale(
    peptides, "PeptideAbundance", ["UniProt", "Peptide"]
)

In [None]:

scaled_patient = scaled_patient.rename(
    columns={"upd23b_clinical_state_on_medication": "on_medication"}
)
# Encode the medication state numerically: "On" -> 1, "Off" -> -1, missing -> 0.
# `case_when` leaves non-matching entries untouched, so re-running this cell on
# already-encoded values keeps them as they are.
medication_state = scaled_patient["on_medication"]
scaled_patient["on_medication"] = (
    medication_state.case_when(
        [
            (medication_state.eq("On"), 1),
            (medication_state.eq("Off"), -1),
        ]
    )
    # Cast before filling so the column is numeric and the fill value is the
    # number 0, not the string "0".
    .astype("float")
    .fillna(0)
)

In [None]:

from itertools import product

def safe_get(patient_id, visit_month, target_col, source=None):
    """Look up one value for a (patient, visit month) pair, or NaN if absent.

    Parameters
    ----------
    patient_id, visit_month
        Key into a frame indexed by ``(patient_id, visit_month)``.
    target_col : str
        Column to read.
    source : pandas.DataFrame, optional
        Frame to search. Defaults to the module-level
        ``indexed_scaled_patient``, which is resolved at call time.

    Returns
    -------
    scalar
        The stored value, the first one when the key occurs more than once,
        or ``np.nan`` when the key is not in the index.
    """
    frame = indexed_scaled_patient if source is None else source
    try:
        value = frame.loc[(patient_id, visit_month), target_col]
    except KeyError:
        return np.nan
    # A duplicated key makes `.loc` return a Series; always hand back a scalar.
    if isinstance(value, pd.Series):
        return value.iloc[0]
    return value

# Work on a copy so that adding the lead columns does not also mutate
# `scaled_patient` (a plain assignment would only alias the same frame).
with_leads = scaled_patient.copy()
indexed_scaled_patient = scaled_patient.set_index(["patient_id", "visit_month"])
# Reindexing needs unique keys; keep the first row per (patient_id, visit_month).
unique_visits = indexed_scaled_patient[
    ~indexed_scaled_patient.index.duplicated(keep="first")
]

# For every horizon, look up each patient's scores `plus_months` later in one
# vectorised pass. Visits with no matching future visit get NaN.
for plus_months in [6, 12, 24]:
    future_visits = pd.MultiIndex.from_arrays(
        [with_leads["patient_id"], with_leads["visit_month"] + plus_months]
    )
    for target_col in ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]:
        # `.to_numpy()` assigns positionally: `future_visits` is built row by
        # row from `with_leads`, so the order already matches.
        with_leads[f"{target_col}_plus_{plus_months}"] = (
            unique_visits[target_col].reindex(future_visits).to_numpy()
        )

# Keep only the visits that have a 6-month-ahead updrs_1 value.
with_leads = with_leads[with_leads["updrs_1_plus_6"].notna()]

In [None]:
# Count the NPX measurements recorded for each (patient, visit).
visit_keys = ["patient_id", "visit_month"]
npx_counts = proteins[visit_keys + ["NPX"]].groupby(visit_keys).count()

# The left join keeps every clinical visit; visits absent from `proteins` get NaN.
with_leads = (
    with_leads.set_index(visit_keys).join(npx_counts, how="left").reset_index()
)

# Flag visits that have at least one measurement (1), everything else 0.
has_measurements = with_leads["NPX"] > 0
with_leads["did_test"] = (
    with_leads["NPX"].case_when([(has_measurements, 1)]).fillna(0)
)
# The raw count was only needed to derive the flag.
with_leads = with_leads.drop(columns=["NPX"])

In [None]:
# Derive both per-patient features from the same chronological ordering, so
# `visit_count` agrees with `last_visit` even when rows are not stored in visit
# order. Results are aligned back to `with_leads` through the row index.
visits_by_patient = with_leads.sort_values(by=["patient_id", "visit_month"]).groupby(
    "patient_id"
)
# Month of the previous retained visit; 0 for a patient's first visit.
with_leads["last_visit"] = visits_by_patient["visit_month"].shift(1).fillna(0)
# Months elapsed since that previous visit.
with_leads["visit_diff"] = with_leads["visit_month"] - with_leads["last_visit"]
# Zero-based position of the visit within the patient's chronological history.
with_leads["visit_count"] = visits_by_patient.cumcount()

In [None]:
# One column per peptide holding the summed scaled abundance for each
# (patient, visit); combinations without a measurement become 0.
peptide_pivot = scaled_peptide.pivot_table(
    values="PeptideAbundance",
    index=["patient_id", "visit_month"],
    columns=["Peptide"],
    aggfunc="sum",
).fillna(0)

# `DataFrame.join` defaults to a left join, so every visit in `with_leads` is kept.
with_leads = (
    with_leads.set_index(["patient_id", "visit_month"])
    .join(peptide_pivot)
    .reset_index()
)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import KFold

# Replace every remaining missing value (features and targets alike) with 0.
no_na = with_leads.fillna(0)

# Model inputs: the clinical columns first, then one column per peptide.
clinical_features = [
    "visit_month",
    "did_test",
    "on_medication",
    "updrs_1",
    "updrs_2",
    "updrs_3",
    "visit_count",
    "visit_diff",
    "updrs_4",
]
peptide_names = list(scaled_peptide.reset_index()["Peptide"].unique())
feature_cols = clinical_features + peptide_names

# Model outputs: every UPDRS part at every horizon, horizon-major order.
target_cols = [
    f"{score}_plus_{horizon}"
    for horizon in [6, 12, 24]
    for score in ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
]

X = no_na[feature_cols].to_numpy(dtype="float")
y = no_na[target_cols].to_numpy(dtype="float")

# Define SMAPE as a custom loss function
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, averaged over all elements.

    Computes ``mean(2 * |y_true - y_pred| / (|y_true| + |y_pred| + 1e-10))``.
    The small constant keeps the denominator non-zero when both values are 0.
    """
    stabilizer = 1e-10
    abs_error = tf.abs(y_true - y_pred)
    scale = tf.abs(y_true) + tf.abs(y_pred) + stabilizer
    return tf.reduce_mean(2 * abs_error / scale)

# Define the model
def create_model():
    """Build and compile a fresh feed-forward regression network.

    The network has four ReLU hidden layers (256, 128, 64 and 32 units), each
    followed by 10% dropout, and a 12-unit linear output layer. It is compiled
    with the Adam optimizer, the ``smape`` loss and MAE as a metric.
    """
    stack = []
    for units in (256, 128, 64, 32):
        stack.append(Dense(units, activation="relu"))
        stack.append(Dropout(0.1))
    stack.append(Dense(12, activation="linear"))

    model = Sequential(stack)
    model.compile(optimizer="adam", loss=smape, metrics=["mae"])
    return model


# Cross-validation setup: 5 shuffled folds with a fixed split seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fit_options = dict(epochs=50, batch_size=16, verbose=1)

# Cross-validation loop: train a fresh model per fold and record its validation
# loss. `model` and `val_idx` keep the values of the last fold after the loop,
# and the plotting cell below reads them.
cv_results = []
for train_idx, val_idx in kf.split(X):
    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    X_val_fold, y_val_fold = X[val_idx], y[val_idx]

    model = create_model()
    history = model.fit(
        X_train_fold,
        y_train_fold,
        validation_data=(X_val_fold, y_val_fold),
        **fit_options,
    )

    # `evaluate` returns the loss first, followed by the compiled metrics.
    val_loss, *_ = model.evaluate(X_val_fold, y_val_fold, verbose=0)
    cv_results.append(val_loss)

mean_loss, loss_spread = np.mean(cv_results), np.std(cv_results)
print(f"Cross-Validation SMAPE Loss: {mean_loss:.4f} ± {loss_spread:.4f}")


In [None]:

def predict(real):
    """Predict the future UPDRS scores for a single visit record.

    Parameters
    ----------
    real : dict
        One row as a mapping; must contain every name in ``feature_cols``.

    Returns
    -------
    dict
        The model output keyed by ``target_cols`` (as Python floats), plus the
        record's own ``updrs_1`` .. ``updrs_4`` values copied from ``real``.
    """
    # Force a numeric array: without an explicit dtype a single non-numeric
    # entry would turn the whole row into an array of strings.
    features = np.array([[real[col] for col in feature_cols]], dtype="float32")
    # Invoke the model itself rather than `model.call`, with dropout disabled.
    outputs = model(features, training=False).numpy()[0]
    forecast = dict(zip(target_cols, map(float, outputs)))

    current_cols = {f"updrs_{i}" for i in range(1, 5)}
    observed = {key: value for key, value in real.items() if key in current_cols}
    return forecast | observed

def to_ys(data):
    """Arrange a record's UPDRS values as one series per UPDRS part.

    Returns a list of four lists (parts 1 to 4). Each inner list holds the
    values at months 0, 6, 12 and 24, read from the keys ``updrs_<i>``,
    ``updrs_<i>_plus_6``, ``updrs_<i>_plus_12`` and ``updrs_<i>_plus_24``.
    Raises ``KeyError`` if any of those keys is missing from ``data``.
    """
    series = []
    for part in range(1, 5):
        base = f"updrs_{part}"
        # Month 0 is the bare column; later months carry a `_plus_<month>` suffix.
        keys = [base] + [f"{base}_plus_{horizon}" for horizon in (6, 12, 24)]
        series.append([data[key] for key in keys])
    return series

def plot(real, colors = ["#61bbb6", "#c3f2f0","#ad56cd", "#4a3b85"], filename = "a.png"):
    """Save a chart comparing a record's real and predicted UPDRS trajectories.

    Parameters
    ----------
    real : dict
        One visit record; passed to ``predict`` and ``to_ys``.
    colors : list of str
        One colour per UPDRS part, shared by its real and predicted line.
    filename : str
        Path the figure is written to.
    """
    months = [0, 6, 12, 24]
    predicted = predict(real)
    curves = zip(to_ys(real), to_ys(predicted), colors)
    for part, (actual, forecast, shade) in enumerate(curves, start=1):
        # Solid line for the recorded values, dash-dot for the model output.
        plt.plot(months, actual, color=shade, label=f"Real updrs_{part}")
        plt.plot(months, forecast, '-.', color=shade, label=f"Predicted updrs_{part}")
    plt.legend()
    plt.gcf().set_size_inches((18, 6))
    plt.savefig(filename)
    # Clear the current figure so the next call starts from an empty canvas.
    plt.clf()

# Plot up to the first 8 records of the last validation fold. Only those rows
# are converted to records, once, and the slice keeps a fold with fewer than
# 8 rows from raising IndexError.
sample_records = no_na.iloc[val_idx[:8]].to_dict(orient="records")
for i, real in enumerate(sample_records):
    plot(real, filename=f"nn-model-results-{i}.png")


