In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import os
import re

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be read through ordinary filesystem paths. Requires the google.colab
# package, i.e. this cell is expected to fail outside a Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Dataset locations on the mounted Google Drive. Both paths live under the
# same unzipped-archive folder, so the common prefix is spelled out once.
PHYSIONET_ROOT = "/content/drive/MyDrive/PhysioNet.zip (Unzipped Files)"

# Directory holding one .txt file per patient record.
data_dir = PHYSIONET_ROOT + "/set-a/set-a"
# Outcomes table (read later with pd.read_csv and indexed by RecordID).
outcomes_dir = PHYSIONET_ROOT + "/Outcomes-a.txt"

In [None]:
# Load the outcomes table and key it by RecordID in a single expression
# (no in-place mutation, so re-running the cell is idempotent).
outcomes_df = pd.read_csv(outcomes_dir).set_index("RecordID")

In [None]:
# Parameter groups; each group becomes one "channel" (axis 1) of the tensor.
CONSTANT_PARAMS = ['Age', 'Gender', 'Height', 'Weight', 'ICUType']
VITAL_PARAMS = ['HR', 'NIDiasABP', 'NIMAP', 'NISysABP', 'DiasABP', 'MAP', 'SysABP', 'Temp', 'GCS', 'FiO2', 'MechVent', 'Urine']
LAB_PARAMS = ['BUN', 'Creatinine', 'Glucose', 'HCO3', 'HCT', 'Mg', 'Platelets', 'K', 'Na', 'WBC', 'pH', 'PaCO2', 'PaO2']

# Maximum number of distinct timestamps kept per patient and channel.
MAX_TIME_STEPS = 100

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of matrix_4d (and therefore any seeded train/test split built on it)
# reproducible across runs and machines.
patient_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.txt'))
n_patients = len(patient_files)

n_constant = len(CONSTANT_PARAMS)
n_vital = len(VITAL_PARAMS)
n_lab = len(LAB_PARAMS)

# Shape: (patient, channel, time step, parameter). The last axis is as wide as
# the largest parameter group; narrower groups leave trailing columns at zero.
# Channel 0 = constants, 1 = vitals, 2 = labs.
matrix_4d = np.zeros((n_patients, 3, MAX_TIME_STEPS, max(n_constant, n_vital, n_lab)))


def _pivot_parameters(records, params):
    """Return a (n_timestamps, len(params)) array of the requested parameters.

    Args:
        records: long-format DataFrame with 'Time', 'Parameter' and 'Value'
            columns (one patient file).
        params: parameter names to extract; also fixes the column order.

    Returns:
        2-D NumPy array with one row per distinct 'Time' value (in the sorted
        order produced by pivot) and one column per entry of ``params``.
        Repeated measurements of a parameter at the same time are averaged;
        parameters not measured at a given time are filled with 0.
    """
    subset = records[records['Parameter'].isin(params)]
    # Averaging first guarantees unique (Time, Parameter) pairs, which pivot
    # requires (it raises ValueError on duplicates).
    averaged = subset.groupby(['Time', 'Parameter'])['Value'].mean().reset_index()
    wide = averaged.pivot(index='Time', columns='Parameter', values='Value')
    return wide.reindex(columns=params).fillna(0).values


# (channel index, parameter list) for the two time-series channels.
_SERIES_CHANNELS = ((1, VITAL_PARAMS), (2, LAB_PARAMS))

for i, file in enumerate(patient_files):
    df = pd.read_csv(os.path.join(data_dir, file))

    # Constants: only the earliest timestamp is used and it is stored at time
    # step 0; the remaining time steps of channel 0 stay zero.
    constant_values = _pivot_parameters(df, CONSTANT_PARAMS)
    if len(constant_values) > 0:
        matrix_4d[i, 0, 0, :n_constant] = constant_values[0]

    # Time series: keep the first MAX_TIME_STEPS timestamps. Shorter series
    # need no explicit padding because matrix_4d is already zero-initialised.
    # NOTE(review): row order relies on pivot sorting the 'Time' strings;
    # presumably they are zero-padded "HH:MM" so that this is chronological.
    for channel, params in _SERIES_CHANNELS:
        series = _pivot_parameters(df, params)[:MAX_TIME_STEPS]
        matrix_4d[i, channel, :len(series), :len(params)] = series

print("Форма 4D-матрицы:", matrix_4d.shape)

Форма 4D-матрицы: (4000, 3, 100, 13)


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Concatenate, Input
from sklearn.model_selection import train_test_split

In [None]:
# Build the label vector in exactly the row order of matrix_4d, i.e. the order
# of patient_files, then split features and labels together.
outcomes = pd.read_csv(outcomes_dir)

# The record id is the first run of digits in the file name (extension removed).
record_ids = [int(re.search(r'\d+', f.split('.')[0]).group()) for f in patient_files]
labels = outcomes.set_index('RecordID')['In-hospital_death'].reindex(record_ids)

# reindex silently inserts NaN for ids absent from the outcomes file; training
# on NaN targets would fail late or corrupt the loss, so stop here instead.
missing_ids = labels.index[labels.isna()].tolist()
if missing_ids:
    raise ValueError(
        f"{len(missing_ids)} patient file(s) have no outcome label, e.g. RecordID {missing_ids[:10]}"
    )

y = labels.values

# Stratify so the train and test parts keep the same death rate; the recorded
# validation accuracy (~0.86 from the first epoch) suggests the classes are
# imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    matrix_4d, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Three parallel Conv1D -> MaxPooling1D -> LSTM branches, one per channel of
# matrix_4d (0 = constants, 1 = vitals, 2 = labs), merged into a dense head
# with a single sigmoid output.

# Every branch receives a (time steps, parameters) slice of identical shape.
branch_shape = (MAX_TIME_STEPS, matrix_4d.shape[3])

input_const = Input(shape=branch_shape)
input_vital = Input(shape=branch_shape)
input_lab = Input(shape=branch_shape)

# Constants branch (smaller: 16 filters, 16 LSTM units).
conv_const = Conv1D(filters=16, kernel_size=3, activation='relu')(input_const)
pool_const = MaxPooling1D(pool_size=2)(conv_const)
lstm_const = LSTM(units=16, return_sequences=False)(pool_const)

# Vitals branch.
conv_vital = Conv1D(filters=32, kernel_size=3, activation='relu')(input_vital)
pool_vital = MaxPooling1D(pool_size=2)(conv_vital)
lstm_vital = LSTM(units=64, return_sequences=False)(pool_vital)

# Labs branch.
conv_lab = Conv1D(filters=32, kernel_size=3, activation='relu')(input_lab)
pool_lab = MaxPooling1D(pool_size=2)(conv_lab)
lstm_lab = LSTM(units=64, return_sequences=False)(pool_lab)

# Merge the per-branch summaries and classify.
branch_summaries = [lstm_const, lstm_vital, lstm_lab]
concat = Concatenate()(branch_summaries)
dense = Dense(units=64, activation='relu')(concat)
output = Dense(units=1, activation='sigmoid')(dense)

model = Model(inputs=[input_const, input_vital, input_lab], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


def _split_channels(batch):
    """Split a (patients, 3, time, params) array into the three per-branch inputs."""
    return [batch[:, channel] for channel in range(3)]


# The held-out split doubles as validation data here, so the val_* metrics
# are computed on X_test / y_test.
model.fit(
    x=_split_channels(X_train),
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(_split_channels(X_test), y_test),
)

Epoch 1/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 89ms/step - accuracy: 0.8196 - loss: 0.4742 - val_accuracy: 0.8600 - val_loss: 0.3800
Epoch 2/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 70ms/step - accuracy: 0.8583 - loss: 0.3696 - val_accuracy: 0.8600 - val_loss: 0.3825
Epoch 3/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 61ms/step - accuracy: 0.8568 - loss: 0.3706 - val_accuracy: 0.8662 - val_loss: 0.3624
Epoch 4/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 79ms/step - accuracy: 0.8797 - loss: 0.3400 - val_accuracy: 0.8625 - val_loss: 0.3694
Epoch 5/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 62ms/step - accuracy: 0.8716 - loss: 0.3306 - val_accuracy: 0.8650 - val_loss: 0.3651
Epoch 6/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 62ms/step - accuracy: 0.8572 - loss: 0.3563 - val_accuracy: 0.8712 - val_loss: 0.3544
Epoch 7/20
[1m100/1

<keras.src.callbacks.history.History at 0x7ddb02643150>

In [None]:
# Persist the trained model to the mounted Google Drive.
# NOTE(review): the ".h5" suffix presumably selects Keras' HDF5 format, which
# newer Keras releases describe as legacy in favour of ".keras" — confirm
# before changing, since other code may load the model from this exact path.
model.save('/content/drive/MyDrive/PhysioNet_model.h5')



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Baseline: random forest on the same train/test split as the neural network.
# scikit-learn estimators accept at most 2-D input (n_samples, n_features);
# passing the 4-D tensor raises "Found array with dim 4". Flatten each
# patient's (channel, time, parameter) block into a single feature row.
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

# Fixed seed so the baseline score is reproducible between runs.
rf = RandomForestClassifier(random_state=42).fit(X_train_flat, y_train)
preds_rf = rf.predict(X_test_flat)

print(accuracy_score(y_test, preds_rf))

ValueError: Found array with dim 4. RandomForestClassifier expected <= 2.