In [None]:
# Cargamos los datos en pandas
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Directory that holds the CSV files, relative to the notebook location.
data_path = "../datos"

# Read the four tables in one pass; the order of the file names matches the
# order of the variables on the left-hand side.
supplemental, patient, peptides, proteins = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        "supplemental_clinical_data.csv",
        "train_clinical_data.csv",
        "train_peptides.csv",
        "train_proteins.csv",
    )
)



In [None]:
# Cardinality summary: distinct proteins, peptides, protein/peptide pairs,
# patients and visits in the main and supplemental tables.
n_proteins = proteins["UniProt"].nunique(dropna=False)
n_peptides = peptides["Peptide"].nunique(dropna=False)
n_pairs = peptides[["Peptide", "UniProt"]].drop_duplicates().shape[0]
n_patients = patient["patient_id"].nunique(dropna=False)
n_supp_patients = supplemental["patient_id"].nunique(dropna=False)

summary_lines = [
    "Cardinalidad de los datos:",
    f"Hay {n_proteins} proteinas únicas",
    f"Hay {n_peptides} péptidos únicos",
    f"Hay {n_pairs} pares de proteina-peptido únicos.",
    f"Hay {n_patients} pacientes",
    f"Hay {len(patient)} visitas",
    f"Hay {n_supp_patients} pacientes (suplementario)",
    f"Hay {len(supplemental)} visitas (suplementario)",
    "",  # keeps the trailing newline of the original message
]
print("\n".join(summary_lines))



In [None]:

# Key-integrity checks: visit_id uniqueness in the clinical tables and the
# relationships between the clinical, protein and peptide tables.


def _report_leftover(keys):
    """Print True when ``keys`` is empty; otherwise print False and its size."""
    print(not keys)
    if keys:
        print(len(keys))


# Check that visit_id really behaves as a primary key.
print("¿Hay algún visit_id duplicado en clinical?")
print(patient["visit_id"].duplicated().any())
print("¿Hay algún visit_id duplicado en supplemental?")
print(supplemental["visit_id"].duplicated().any())

# Relationship checks between tables.
patient_visits = set(patient["visit_id"])
protein_visits = set(proteins["visit_id"])

# One-to-one check: compare the visit_id sets in both directions.
print("¿Están todos los visit_id de proteinas en 'clinical'?")
difference = protein_visits - patient_visits
_report_leftover(difference)

print("¿Están todos los visit_id de 'clinical' en proteinas?")
difference = patient_visits - protein_visits
_report_leftover(difference)

print(
    "¿Es cierto que los datos de 'supplemental' no tienen datos de proteinas asociados?"
)
intersection = set(supplemental["visit_id"]) & protein_visits
_report_leftover(intersection)

# One-to-many check: every (visit_id, UniProt) pair seen in the peptide table
# must also exist in the protein table.
# The pairs are built with zip over the two columns instead of a row-wise
# DataFrame.apply(tuple, axis=1), which calls Python once per row and is very
# slow on large tables; the resulting sets of tuples are the same.
print("¿Estan todas las mediciones de peptidos asociadas a una medicion de proteinas?")
peptide_keys = set(zip(peptides["visit_id"], peptides["UniProt"]))
protein_keys = set(zip(proteins["visit_id"], proteins["UniProt"]))
difference = peptide_keys - protein_keys
_report_leftover(difference)



In [None]:
# Distribution analysis of the protein data (NPX), raw and on a log2 scale.

sns.set_theme(rc={"figure.figsize": (11.7, 8.27)})

# Keep the log2-transformed values as an extra column next to the raw NPX.
proteins["logNPX"] = np.log2(proteins["NPX"])

# Restrict the plots to one protein: the second distinct UniProt identifier.
unique_proteins = proteins["UniProt"].unique()
protein = unique_proteins[1]
is_selected = proteins["UniProt"] == protein
some_proteins = proteins.loc[is_selected]

# Log-scale violin: save it, then close the figure so the next plot starts
# on a fresh one instead of drawing over this one.
sns_plot = sns.violinplot(some_proteins, x="logNPX", y="UniProt")
log_figure = sns_plot.get_figure()
log_figure.savefig("log_npx_violin_plot.png")
plt.close()

# Raw-scale violin: saved and left open so it is shown as the cell output.
sns_plot = sns.violinplot(some_proteins, x="NPX", y="UniProt")
raw_figure = sns_plot.get_figure()
raw_figure.savefig("npx_violin_plot.png")
sns_plot




In [None]:

# Distribution analysis of the UPDRS scores (raw values).

# Column names updrs_1 .. updrs_4.
updrs_cols = [f"updrs_{part}" for part in (1, 2, 3, 4)]

# Stack the UPDRS columns of the main and supplemental clinical tables.
updrs = pd.concat([table[updrs_cols] for table in (patient, supplemental)])

# One violin per UPDRS column; the figure is also written to disk.
sns_plot = sns.violinplot(updrs)
updrs_figure = sns_plot.get_figure()
updrs_figure.savefig("updrs_violins.png")
sns_plot




In [None]:
# Distribution analysis of the UPDRS scores (normalised).

# Divisor applied to updrs_1 .. updrs_4, in that order.
# NOTE(review): presumably the maximum attainable score of each part - confirm.
updrs_ranges = [52, 52, 132, 24]

# Rebuild the normalised frame from the raw tables instead of dividing
# ``updrs`` in place: the in-place version divided the already-normalised
# values again every time this cell was re-run. ``updrs`` still ends up
# holding the normalised scores, as before.
updrs = pd.concat([patient[updrs_cols], supplemental[updrs_cols]]).div(
    pd.Series(updrs_ranges, index=updrs_cols), axis="columns"
)

# One violin per normalised UPDRS column; the figure is also written to disk.
sns_plot = sns.violinplot(updrs)
sns_plot.get_figure().savefig("norm_updrs_violins.png")
sns_plot




In [None]:
# Missing values in the UPDRS data: per column, print the number of nulls,
# the total number of rows and the percentage of nulls.

total_rows = len(updrs)
for col in updrs_cols:
    missing = int(updrs[col].isna().sum())
    print(f"{col}: {missing} / {total_rows} ({missing * 100/ total_rows:.2f}%)")




In [None]:
# Check that no peptide is associated with more than one protein.
# (The previous header of this cell described a null-value count, which is
# not what the code does.)
# ``counted`` holds, per peptide, the number of distinct UniProt ids it
# appears with; the cell evaluates to True when none of them exceeds one.
counted = peptides[["UniProt", "Peptide"]].drop_duplicates().groupby("Peptide").count()
not (counted["UniProt"] > 1).any()




In [None]:
# Distribution of the visit month across the main and supplemental tables.
visit_months = pd.concat([table["visit_month"] for table in (patient, supplemental)])

# One bin per month, with a tick on every month value present in the data.
sns_plot = sns.histplot(visit_months, binwidth=1)
month_ticks = visit_months.unique()
sns_plot.set_xticks(month_ticks)

hist_figure = sns_plot.get_figure()
hist_figure.savefig("visit_month_hist.png")
sns_plot



In [None]:

# Mean evolution of each UPDRS score over the visit months, with a 95%
# confidence interval; one PNG file per score.

# Built once: the combined table does not depend on the column being plotted,
# so rebuilding it inside the loop only repeated the concat + fillna work.
# NOTE(review): fillna(0) turns missing scores into zeros, which then enter
# the plotted mean - confirm this is the intended treatment of missing data.
visits_filled = pd.concat([patient, supplemental], axis=0, ignore_index=True).fillna(0)

for col in updrs_cols:
    sns_plot = sns.lineplot(
        visits_filled,
        x="visit_month",
        y=col,
        estimator="mean",
        errorbar=("ci", 95),
    )
    sns_plot.get_figure().savefig(f"evolution_{col}.png")
    # Close the figure so the next score is drawn on a fresh one.
    plt.close()


In [None]:

# Correlation matrix between the clinical variables, shown as a heatmap and
# saved to correlation.png.

# Combine both clinical tables; every missing value is replaced by 0.
# NOTE(review): zero-filling missing UPDRS scores affects the correlations -
# confirm this is the intended treatment of missing data.
df = pd.concat([patient, supplemental], axis=0, ignore_index=True).fillna(0)
df = df.rename(columns={"upd23b_clinical_state_on_medication": "on_medication"})

# Encode the medication state as an integer column:
# "On" -> 1, "Off" -> -1, anything else (including the zero-filled gaps) -> 0.
# The previous version filled leftovers with the string "0" and left the
# column with object dtype, so corr() depended on implicit coercion.
df["on_medication"] = (
    df["on_medication"].map({"On": 1, "Off": -1}).fillna(0).astype(int)
)

# Identifiers carry no signal for a correlation analysis.
df = df.drop(columns=["patient_id", "visit_id"])
cross_corr_matrix = df.corr()

plt.figure(figsize=(20, 16))
sns.heatmap(
    cross_corr_matrix,
    annot=False,
    cmap="coolwarm",
    cbar=True,
    square=True,
    xticklabels=True,
    yticklabels=True,
    linewidths=0.1,
)

plt.title("Matriz de correlación cruzada", fontsize=16)
plt.xticks(fontsize=20, rotation=90)
plt.yticks(fontsize=20)
plt.tight_layout()
plt.savefig("correlation.png")
plt.show()



In [None]:
# Pairwise scatter plots / marginal distributions of the UPDRS columns.
pair_grid = sns.pairplot(df[updrs_cols])
# Save through the grid's own figure, which is the current figure right
# after the pairplot call.
pair_grid.figure.savefig("pairplot.png")



In [None]:

# Show the correlation matrix (``cross_corr_matrix``) as a table in the cell output.
cross_corr_matrix
