In [1]:
import os
import numpy as np
import pandas as pd



In [2]:
# Folder that holds the input CSV files, relative to the working directory.
data_path = "../../datos"


def _read_table(file_name):
    """Load one CSV file located in ``data_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, file_name))


supplemental = _read_table("supplemental_clinical_data.csv")
patient = _read_table("train_clinical_data.csv")
peptides = _read_table("train_peptides.csv")
proteins = _read_table("train_proteins.csv")

# Supplemental visits recorded at month 5 are moved to month 6.
_month_five = supplemental["visit_month"] == 5
supplemental.loc[_month_five, "visit_month"] = 6



In [3]:
def _log_minmax_scale(frame, value_col, group_cols):
    """Log2-transform ``value_col`` and min-max scale it within each group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Long-format table containing ``value_col`` and every column listed in
        ``group_cols``. It is not modified.
    value_col : str
        Name of the numeric column to transform.
    group_cols : list[str]
        Columns defining the groups; min and max are computed per group.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``group_cols``, holding the remaining columns of ``frame``
        with ``value_col`` replaced by
        ``(log2(x) - group_min) / (group_max - group_min)``.
        Groups whose min equals max produce NaN (0 / 0).
    """
    logged = frame.copy()
    logged[value_col] = np.log2(frame[value_col])

    # Per-group bounds, computed on the log-transformed values.
    bounds = logged[group_cols + [value_col]].groupby(group_cols)[value_col].agg(["min", "max"])

    # Join the *log-transformed* rows (not the raw ones) so values and bounds
    # are on the same scale.
    scaled = bounds.join(logged.set_index(group_cols))
    scaled[value_col] = (scaled[value_col] - scaled["min"]) / (scaled["max"] - scaled["min"])

    # Drop the helper columns from the DataFrame itself.
    return scaled.drop(columns=["min", "max"])


# Clinical data: stack both clinical tables and divide each UPDRS column by a
# fixed constant (presumably the maximum attainable score of that part --
# TODO confirm).
scaled_patient = pd.concat([patient, supplemental])
updrs_ranges = [52, 52, 132, 24]
updrs_cols = [f"updrs_{i}" for i in range(1, 5)]
for updrs_range, col in zip(updrs_ranges, updrs_cols):
    scaled_patient[col] /= updrs_range

# Protein expression, scaled per protein; result is indexed by UniProt.
scaled_protein = _log_minmax_scale(proteins, "NPX", ["UniProt"])

# Peptide abundance, scaled per (protein, peptide) pair; result is indexed by
# (UniProt, Peptide).
scaled_peptide = _log_minmax_scale(peptides, "PeptideAbundance", ["UniProt", "Peptide"])



In [4]:

# Shorter name for the medication-state column.
scaled_patient = scaled_patient.rename(
    columns={"upd23b_clinical_state_on_medication": "on_medication"}
)
# Encode the medication state as a number: "On" -> 1, "Off" -> -1 and any
# other value (including missing) -> 0. The result is a purely numeric column
# instead of a mix of integers and the string "0".
scaled_patient["on_medication"] = (
    scaled_patient["on_medication"].map({"On": 1, "Off": -1}).fillna(0).astype(int)
)

# One row per visit is identified by these two columns.
visit_keys = ["patient_id", "visit_month"]

# Wide tables: one row per visit, one column per peptide / protein.
# Entries absent for a visit are filled with 0.
peptide_wide = scaled_peptide.pivot_table(
    values="PeptideAbundance",
    index=visit_keys,
    columns=["Peptide"],
    aggfunc="sum",
).fillna(0)
protein_wide = scaled_protein.pivot_table(
    values="NPX",
    index=visit_keys,
    columns=["UniProt"],
    aggfunc="sum",
).fillna(0)

# Attach both wide tables to the clinical rows; visits without measurements
# (and any other missing value) end up as 0.
scaled_patient = (
    scaled_patient.set_index(visit_keys)
    .join(peptide_wide)
    .join(protein_wide)
    .reset_index()
    .fillna(0)
)

# scale visit month
max_month = scaled_patient.visit_month.max()  # keep max to deescale later
scaled_patient["visit_month"] = scaled_patient["visit_month"] / max_month

# Names of the feature columns added above. The peptide name is the second
# level of the (UniProt, Peptide) index; taking the unique values of that
# level avoids listing the same column twice.
protein_cols = list(scaled_protein.index.unique())
peptide_cols = list(scaled_peptide.index.get_level_values(1).unique())



In [5]:
import sys

# Directory added to the import search path (presumably where the project's
# helper modules imported by later cells live -- TODO confirm it resolves from
# the notebook's working directory).
_seq2seq_dir = "./codigo/modelo_seq2seq"

# Guard against duplicates so re-running this cell does not keep growing
# sys.path.
if _seq2seq_dir not in sys.path:
    sys.path.append(_seq2seq_dir)



In [28]:
from format_seqs import format_data
from aux import train_val_split

# Selects the encoder feature set: "PROTEIN", "PEPTIDE", or anything else for
# the base set.
input_feautures_name = "BASE"

# Candidate encoder inputs: the clinical columns alone, or extended with the
# protein / peptide columns.
base_encoder_input_features = ["visit_month", "updrs_1", "updrs_2", "updrs_3", "updrs_4"]
protein_encoder_input_features = base_encoder_input_features + protein_cols
peptide_encoder_input_features = base_encoder_input_features + peptide_cols

if input_feautures_name == "PROTEIN":
    encoder_input_features = protein_encoder_input_features
elif input_feautures_name == "PEPTIDE":
    encoder_input_features = peptide_encoder_input_features
else:
    encoder_input_features = base_encoder_input_features

decoder_input_features = ["visit_month", "updrs_1", "updrs_2", "updrs_3", "updrs_4"]
# Everything the decoder receives except visit_month is also a target.
output_features = decoder_input_features[1:]

# Options passed to format_data: rows are partitioned by patient and ordered
# by visit month, with input and output sequence lengths of 3.
_sequence_options = dict(
    partition_key="patient_id",
    order_key="visit_month",
    encoder_input_features=encoder_input_features,
    decoder_input_features=decoder_input_features,
    output_features=output_features,
    input_seq_length=3,
    output_seq_length=3,
)
data, target_indices = format_data(scaled_patient, **_sequence_options)



In [42]:
from lstm import Encoder as LSTMEncoder, DecoderWithAttention as LSTMDecoder
from seq2seq import Encoder as GRUEncoder, DecoderWithAttention as GRUDecoder, Seq2Seq

# Recurrent cell family: "LSTM" picks the lstm module classes, any other value
# picks the ones from the seq2seq module.
rnn_type = "LSTM"
Encoder, DecoderWithAttention = (
    (LSTMEncoder, LSTMDecoder) if rnn_type == "LSTM" else (GRUEncoder, GRUDecoder)
)

# Sizes derived from the selected feature lists.
enc_feature_size = len(encoder_input_features)
dec_feature_size = len(decoder_input_features)
dec_target_size = len(output_features)

# Network hyperparameters.
hidden_size = 16
num_layers = 1
dropout = 0.1

# Training hyperparameters.
device = "cpu"
lr = 0.00025
grad_clip = 1
batch_size = 64
num_epochs = 100
decay = 5  # Lower means faster decay


In [43]:
from aux import get_best_model
from aux import evaluate
import json

# Train `n` models, each on a fresh train/validation split, and collect the
# value returned by `evaluate` for every run (presumably a scalar validation
# metric -- it is averaged below).
results = []
n = 5
for i in range(n):
    train_data, val_data = train_val_split(data, p=0.8)  # 80% of data to train
    encoder = Encoder(enc_feature_size, hidden_size, num_layers, dropout)
    decoder_args = (dec_feature_size, dec_target_size, hidden_size, num_layers, target_indices, dropout, device)
    decoder = DecoderWithAttention(*decoder_args)
    seq2seq = Seq2Seq(encoder, decoder, lr, grad_clip).to(device)
    best_model = get_best_model(seq2seq, train_data, val_data, batch_size, num_epochs, decay)
    results.append(evaluate(best_model, val_data, batch_size))

# Record the configuration explicitly instead of scraping it out of locals():
# the logged keys are then visible in the code and do not depend on which
# scope this cell is executed in.
run_summary = {
    "input_feautures_name": input_feautures_name,
    "rnn_type": rnn_type,
    "hidden_size": hidden_size,
    "num_layers": num_layers,
    "dropout": dropout,
    "lr": lr,
    "grad_clip": grad_clip,
    "batch_size": batch_size,
    "num_epochs": num_epochs,
    "decay": decay,
    "n": n,
    "val_mean": float(np.mean(results)),
    "val_std": float(np.std(results)),
}

# Append one JSON object per line to the results file.
with open("../../results.txt", "a", encoding="utf8") as f:
    json.dump(run_summary, f)
    f.write("\n")



Epoch 1 => Train loss: 159.86044, Val: 158.64383, Teach: 0.83, Took 0.1 s      (NEW BEST)
Epoch 2 => Train loss: 152.56536, Val: 149.88038, Teach: 0.80, Took 0.1 s      (NEW BEST)
Epoch 3 => Train loss: 139.80156, Val: 138.83539, Teach: 0.77, Took 0.1 s      (NEW BEST)
Epoch 4 => Train loss: 129.58438, Val: 130.32601, Teach: 0.73, Took 0.1 s      (NEW BEST)
Epoch 5 => Train loss: 122.71402, Val: 123.98438, Teach: 0.69, Took 0.1 s      (NEW BEST)
Epoch 6 => Train loss: 120.68081, Val: 123.25453, Teach: 0.65, Took 0.1 s      (NEW BEST)
Epoch 7 => Train loss: 120.19856, Val: 122.98842, Teach: 0.60, Took 0.1 s      (NEW BEST)
Epoch 8 => Train loss: 119.32075, Val: 118.44588, Teach: 0.55, Took 0.1 s      (NEW BEST)
Epoch 9 => Train loss: 114.70169, Val: 104.08895, Teach: 0.50, Took 0.1 s      (NEW BEST)
Epoch 10 => Train loss: 99.76326, Val: 90.04103, Teach: 0.45, Took 0.1 s      (NEW BEST)
Epoch 11 => Train loss: 89.39515, Val: 86.00287, Teach: 0.40, Took 0.1 s      (NEW BEST)
Epoch 12 => 

In [None]:
# Counter used as the plot index and in the saved figure's file name.
i = 1


In [None]:
from plot import plot
import matplotlib.pyplot as plt

# Draw plot number `i` for the most recently trained model on its validation
# split; max_month is passed so the helper can undo the visit-month scaling
# (presumably -- see the plot module).
plot(i, best_model, val_data, max_month)

# Save the current figure under a name carrying the counter, then advance the
# counter so the next run of this cell writes a new file.
_figure_path = f"seq2seq-model-results-{i}.png"
plt.savefig(_figure_path)
i = i + 1

