In [25]:
import pandas as pd
import os
from pathlib import Path
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Data preparing

In [29]:
folder_path = Path('../data/HUPA-UCM-data')
patients_info_path = Path('../data/HUPA-data_patients/patients_info.csv')

# Check folder and file existence
if not os.path.exists(folder_path):
    raise FileNotFoundError(f"Directory {folder_path} does not exist.")
if not os.path.exists(patients_info_path):
    raise FileNotFoundError(f"File {patients_info_path} does not exist.")

# Load patients info and check duplicates
patients_info = pd.read_csv(patients_info_path)
if patients_info['person_id'].duplicated().any():
    raise ValueError("Duplicate person_id found in patients_info.csv!")
print("Unique person_ids in patients_info:", patients_info['person_id'].nunique())

# Load time series HUPA-UCM-data
all_files = [folder_path / f for f in os.listdir(folder_path) if f.endswith('.csv')]
data_list = []
for file in all_files:
    temp_data = pd.read_csv(file, delimiter=';')
    temp_data['person_id'] = file.stem
    data_list.append(temp_data)

# Combine and preprocess HUPA-UCM-data
data = pd.concat(data_list, ignore_index=True).drop_duplicates()

# Convert time and handle errors
data['time'] = pd.to_datetime(data['time'], errors='coerce')
data['minute_treat'] = data['time'].dt.minute
data['hour_of_day_treat'] = data['time'].dt.hour
data['month_treat'] = data['time'].dt.month
data['year_treat'] = data['time'].dt.year
data['day_treat'] = data['time'].dt.day.clip(1, data['time'].dt.days_in_month)
data = data.drop(['time'], axis=1)

# Fill missing values safely
for col in ['glucose', 'calories', 'heart_rate']:
    if data[col].isna().all():
        data[col] = 0
    else:
        data[col] = data[col].fillna(data[col].mean())
data = data.fillna(0)

# Merge with patients_info
data = data.merge(patients_info, on='person_id', how='left')
# Convert to bmi
data['bmi'] = data['weight'] / pow(data['height'], 2)
data = data.drop(['weight'], axis=1)
data = data.drop(['height'], axis=1)

# Anomaly checks
# Heart rate check
data = data[(data['heart_rate'] > 40) & (data['heart_rate'] < 200)]
# Glucose range check
data = data[(data['glucose'] >= 0) & (data['glucose'] <= 500)]
# Minute range check
data = data[(data['minute_treat'] >= 0) & (data['minute_treat'] <= 59)]
# Hour of day range check
data = data[(data['hour_of_day_treat'] >= 0) & (data['hour_of_day_treat'] <= 23)]
# Month range check
data = data[(data['month_treat'] >= 1) & (data['month_treat'] <= 12)]
# Age range check
data = data[(data['age'] >= 0) & (data['age'] <= 120)]
# BMI range check
data = data[(data['bmi'] >= 15) & (data['bmi'] <= 40)]

# Check target variable
if data['basal_rate'].isna().any():
    raise ValueError("Target variable 'basal_rate' contains missing values!")

# One-Hot Encoding
data = pd.get_dummies(data, columns=['gender', 'treatment'], drop_first=True)

# Define features
features = [
    'glucose',
    'calories',
    'heart_rate',
    'steps',
    'bolus_volume_delivered',
    'carb_input',
    'HbA1c',
    'age',
    'dx_time',
    'bmi',
    'minute_treat',
    'hour_of_day_treat',
    'month_treat',
    'day_treat',
]
features += [col for col in data.columns if col.startswith(('gender_', 'treatment_'))]

data.to_csv('final_prepared_data.csv')
X = data[features]
y = data['basal_rate']

Unique person_ids in patients_info: 25


# test model

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

model = DecisionTreeRegressor(max_depth=22, min_samples_leaf=2, min_samples_split=2, random_state=42)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

mse = round(mean_squared_error(y_test, y_pred), 6)
r2 = round(r2_score(y_test, y_pred), 6)
mae = round(mean_absolute_error(y_test, y_pred), 6)

print(f'Mean Squared Error: {mse}')
print(f'R²: {r2}')
print(f'Mean Absolute Error: {mae}')

Mean Squared Error: 0.000168
R²: 0.870357
Mean Absolute Error: 0.003919
