In [1]:


import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from torch.utils.data import DataLoader, TensorDataset

# Raw string without embedded double quotes: in a normal string literal the
# backslash-U sequence starts a unicode escape (a SyntaxError here), and the
# extra quote characters would have become part of the path itself.
file_path = r'C:\Users\sanma\Downloads\PORSCH\GP.xlsx'


In [5]:
file_path = r"C:\Users\sanma\Downloads\PORSCH\GP.xlsx"  # raw string keeps the backslashes literal
# Define helper functions
def load_data(file_path, **read_kwargs):
    """Load a tabular file and return it as a pandas DataFrame.

    The reader is chosen from the file name: paths ending in ``.csv``
    (case-insensitive) are read with ``pd.read_csv``; everything else is read
    with ``pd.read_excel``, which was the only reader used before.

    Args:
        file_path: Path of the file to read. Anything whose ``str()`` does not
            end in ``.csv`` is handed to ``pd.read_excel`` unchanged.
        **read_kwargs: Optional keyword arguments forwarded as-is to the pandas
            reader, e.g. ``sep=';'`` for semicolon-separated exports.

    Returns:
        pandas.DataFrame with the file contents.
    """
    # Excel readers cannot parse plain-text CSV files, so route those separately.
    if str(file_path).lower().endswith('.csv'):
        return pd.read_csv(file_path, **read_kwargs)
    return pd.read_excel(file_path, **read_kwargs)
def select_variables(dd):
    """Return the names of the data-dictionary variables flagged for use.

    A row is selected when its 'Meenemen ...' column equals 1 and its
    '0=input, 1=output, 2=onbekend' column is anything other than 2.

    Args:
        dd: Data-dictionary DataFrame containing the two flag columns and a
            'Variable name' column.

    Returns:
        list of the 'Variable name' values of the selected rows, in row order.
    """
    include_flag = dd['Meenemen (0=nee, 1= ja, 2=onbekend, 3=computatie)']
    role_flag = dd['0=input, 1=output, 2=onbekend']
    wanted = include_flag.eq(1) & role_flag.ne(2)
    return dd.loc[wanted, 'Variable name'].tolist()


In [7]:
# Define the PORSCH2DataModule class
class PORSCH2DataModule(pl.LightningDataModule):
    """Load, filter, preprocess and batch the PORSCH tables for Lightning.

    Args:
        data_files: Iterable of file paths; each one is read with
            ``load_data`` and the resulting frames are stacked row-wise.
        dd_path: Path of the data dictionary; it is read with ``load_data``
            and passed to ``select_variables`` to decide which columns to keep.
        batch_size: Batch size used by both dataloaders.
        missing_threshold: Columns whose percentage of missing values is at or
            above this number are dropped in ``prepare_data``.
        target_column: Name of the label column. The default is the original
            placeholder ``'target_column'``; pass the real column name.
    """

    def __init__(self, data_files, dd_path, batch_size=32, missing_threshold=50,
                 target_column='target_column'):
        super().__init__()
        self.data_files = data_files
        self.dd_path = dd_path
        self.batch_size = batch_size
        self.missing_threshold = missing_threshold
        self.target_column = target_column
        self.data = None
        self.X_train = None
        self.X_val = None
        self.y_train = None
        self.y_val = None

    def prepare_data(self):
        """Read all files, keep the selected variables, drop sparse columns.

        The result is stored in ``self.data`` so it can be inspected before
        ``setup`` is called.
        """
        # Load the data dictionary and derive the list of variables to keep
        dd = load_data(self.dd_path)
        variables = select_variables(dd)

        # Load every file and stack the frames row-wise.
        # NOTE(review): the inputs look like different exports; confirm that
        # row-wise stacking (rather than a merge on a shared key) is intended.
        dfs = [load_data(file) for file in self.data_files]
        self.data = pd.concat(dfs, ignore_index=True)

        # Keep only the relevant variables
        self.data = self.data[variables]

        # Drop variables with too many missing values (percentage per column)
        missing_percentages = self.data.isnull().mean() * 100
        self.data = self.data.loc[:, missing_percentages < self.missing_threshold]

    @staticmethod
    def _to_float_tensor(matrix):
        """Convert a dense array or a sparse matrix to a float32 tensor."""
        dense = matrix.toarray() if hasattr(matrix, "toarray") else matrix
        return torch.tensor(dense, dtype=torch.float32)

    def setup(self, stage=None):
        """Split into train/validation sets and build feature/label tensors.

        Numeric columns are median-imputed and standardised; object columns
        get a constant 'missing' category and are one-hot encoded. All
        preprocessing is fitted on the training split only.

        Args:
            stage: Accepted for Lightning compatibility; not used.

        Raises:
            KeyError: If ``target_column`` is not among the prepared columns.
        """
        # In multi-process training Lightning calls prepare_data() on a single
        # process only, so self.data may still be empty here.
        if self.data is None:
            self.prepare_data()

        if self.target_column not in self.data.columns:
            raise KeyError(
                f"Target column {self.target_column!r} not found in the prepared data; "
                "pass target_column=... to PORSCH2DataModule."
            )

        # Rows without a label cannot be trained on and would make the loss NaN.
        labelled = self.data.dropna(subset=[self.target_column])

        # Separate features and target
        X = labelled.drop(columns=[self.target_column])
        y = labelled[self.target_column]

        # Split data (fixed seed so repeated setup() calls give the same split)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        # Preprocessing
        numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = X.select_dtypes(include=['object']).columns

        # Columns with missing values survive prepare_data(), so impute before
        # scaling/encoding; otherwise NaNs end up in the tensors.
        numeric_pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='median')),
            ('scale', StandardScaler()),
        ])
        categorical_pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encode', OneHotEncoder(handle_unknown='ignore')),
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_pipeline, numeric_features),
                ('cat', categorical_pipeline, categorical_features)
            ])

        X_train = preprocessor.fit_transform(X_train)
        X_val = preprocessor.transform(X_val)

        # Convert to tensors (the transformer may return a sparse matrix)
        self.X_train = self._to_float_tensor(X_train)
        self.X_val = self._to_float_tensor(X_val)
        self.y_train = torch.tensor(y_train.values, dtype=torch.float32)
        self.y_val = torch.tensor(y_val.values, dtype=torch.float32)

    def train_dataloader(self):
        """Return a shuffled DataLoader over the training tensors."""
        train_dataset = TensorDataset(self.X_train, self.y_train)
        return DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return an unshuffled DataLoader over the validation tensors."""
        val_dataset = TensorDataset(self.X_val, self.y_val)
        return DataLoader(val_dataset, batch_size=self.batch_size)


In [8]:
# Initialise the DataModule
# Folder that holds all exports; every entry below is resolved against it.
DATA_DIR = r"C:\Users\sanma\Downloads\PORSCH\PORSCH pancreas map"

# The data dictionary is metadata, not data, so it is kept out of
# DATA_FILE_NAMES (it used to be concatenated into the dataset).
# NOTE(review): confirm this workbook is the dictionary that carries the
# 'Meenemen ...' selection columns; if that is a different (annotated) file,
# point dd_path at it instead.
DATA_DICTIONARY_NAME = "PORSCH_Datadictionary_CASTOR_23052018.xlsx"

DATA_FILE_NAMES = [
    "PORSCH_Postop_CT_scans_export_20201128.csv",
    "PORSCH_Postoperatieve_biochemie_export_20201128.csv",
    "PORSCH_Postoperatieve_klinische_parameters_(per_29-08-2019)_export_20201128.csv",
    "PORSCH_Postoperatieve_klinische_parameters_(per_29-08-2019_alleen_1.1_en_1.5_invullen)_export_20201207.csv",
    "PORSCH_PTC_drainage_export_20210310.csv",
    "PORSCH_Relaparotomie_export_20201128.csv",
    "PORSCH_Transgastrische_drainage_export_20210310.csv",
    "PORSCH_Voedingssondeplaatsing_export_20210310.csv",
    "cRdGG - porsch lab.csv",
    "cSTAZ - ab.csv",
    "cSTAZ - bloeddruk.csv",
    "cSTAZ - lab.csv",
    "cUMCU - Lab PORSCH 28 jan 2019.csv",
    "cAMC - ab.csv",
    "cAMC - biochemie.csv",
    "cAMC - bloeddruk.csv",
    "cAMC - Sat AHF FiO2.csv",
    "cAMC - sleutelbestand.csv",
    "cAmphia - ab.csv",
    "cAmphia - biochemie.csv",
    "cCatharina - ab.csv",
    "cCatharina - bloeddruk.csv",
    "cCatharina - PORSCH lab.csv",
    "cJBZ - 2020-046 Porsch - LAB uitslagen.csv",
    "cJBZ - ab.csv",
    "cJBZ - bloeddruk.csv",
    "cLUMC - ab.csv",
    "cLUMC - biochemie.csv",
    "cMUMC - Participanten MUMC Porsch MdD_LAB.csv",
    "cOLVG - ab.csv",
    "cOLVG - PORSCH lab.csv",
    "cox.time.to.adjuvant.csv",
    "cRadboud - 12_Lab_Porsch.csv",
    "cRadboud - ab.csv",
    "cRadboud - bloeddruk.csv",
    "cRdGG - ab.csv",
    "data.full.DCPAincluded.csv",
    "data.full.DCPAincluded_aangepast.csv",
    "ENDPOINTS AFTER ADJUDICATION DEFINITIVE.csv",
    "PORSCH_Endoscopische_interventie_bloeding_export_20201128.csv",
    "PORSCH_Endovasculaire_(angiografische)_interventie_export_20201128.csv",
    "PORSCH_export_20210329.csv",
    "PORSCH_Minimaal_invasieve_percutane_drainage_export_20201128.csv",
]  # Add further file names here

# Explicit backslash join keeps the resulting strings identical to the
# previously hard-coded Windows paths.
data_files = [DATA_DIR + "\\" + file_name for file_name in DATA_FILE_NAMES]
dd_path = DATA_DIR + "\\" + DATA_DICTIONARY_NAME


data_module = PORSCH2DataModule(data_files, dd_path)

In [None]:
# Run data preparation (fills data_module.data)
data_module.prepare_data()

In [None]:
# Check missing values (optional): percentage of nulls per column, worst first
missing_data = data_module.data.isnull().sum().div(len(data_module.data)).mul(100)
print("Percentage missende waarden per variabele:")
print(missing_data.sort_values(ascending=False))

In [None]:
# Set up the data for training (split, preprocess, convert to tensors)
data_module.setup()

In [None]:
# Check the shape of the training data (optional)
print("Vorm van trainingsdata:", data_module.X_train.shape)
print("Vorm van trainingslabels:", data_module.y_train.shape)

In [None]:
# Define the model (example)
class PORSCH2Model(pl.LightningModule):
    """Three-layer fully connected binary classifier with sigmoid output.

    Args:
        input_size: Number of input features per sample.
        lr: Learning rate handed to Adam in ``configure_optimizers``.
    """

    def __init__(self, input_size, lr=0.02):
        super().__init__()
        self.lr = lr
        self.layer1 = torch.nn.Linear(input_size, 64)
        self.layer2 = torch.nn.Linear(64, 32)
        self.layer3 = torch.nn.Linear(32, 1)

    def forward(self, x):
        """Return per-sample probabilities; the last dimension has size 1."""
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        return torch.sigmoid(self.layer3(x))

    def _bce_loss(self, batch):
        """Compute binary cross-entropy for one ``(features, labels)`` batch."""
        x, y = batch
        y_hat = self(x)
        # binary_cross_entropy requires input and target of identical size.
        # forward() yields a trailing dimension of 1, while labels from a 1-D
        # label tensor arrive without it, so align the target to the prediction.
        y = y.reshape(y_hat.shape)
        return torch.nn.functional.binary_cross_entropy(y_hat, y)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._bce_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._bce_loss(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
# Initialise the model; the input size is the number of columns in X_train
model = PORSCH2Model(input_size=data_module.X_train.shape[1])

In [None]:
# Train the model for at most 10 epochs using the DataModule's loaders
trainer = pl.Trainer(max_epochs=10)
trainer.fit(model, data_module)

In [None]:
# Evaluate the model (example)
model.eval()  # put the module in evaluation mode
with torch.no_grad():  # no gradient tracking needed for inference
    val_predictions = model(data_module.X_val)
    # Add evaluation metrics here