In [1]:
import sys
from pathlib import Path

# Go two levels up from the current working directory (normally the notebook's folder) to the project root, then append it to sys.path
sys.path.append(str(Path().resolve().parents[1]))

from utils.data_loading import load_datasets
from utils.data_preparation import get_spo2_to_po2_interpolator, add_shift_raw_column
from utils.evaluation import evaluate_macro_patient_level, print_evaluation


# Load the train/test frames and the ODC table through the project loader.
train_df, test_df, odc = load_datasets()

# Interpolator built from the ODC table (presumably SpO2 -> PO2, per its name).
spo2_to_po2 = get_spo2_to_po2_interpolator(odc)

# Apply the same helper, with the same interpolator, to both splits
# (train first, then test). The 'shift_raw' feature used further down is
# presumably the column this helper adds.
train_df, test_df = [
    add_shift_raw_column(split_df, spo2_to_po2) for split_df in (train_df, test_df)
]

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [5]:
# Keep only test rows with SpO2 below 95 %. The .copy() yields an independent
# frame, so the column added below is not written into a slice of the old one.
low_spo2_rows = test_df['SpO2(%)'] < 95
test_df = test_df.loc[low_spo2_rows].copy()

# Create log feature: natural log of the inspired-O2 column, on both splits.
for split_df in (train_df, test_df):
    split_df['log_PiO2'] = np.log(split_df['Insp.O2(kPa)'])

# ===============================================================
# Modeling functions
# ===============================================================
def expand_features(df, features):
    """Return the selected feature columns of ``df`` as a NumPy array.

    Args:
        df: DataFrame that contains every label listed in ``features``.
        features: Column labels to extract; their order fixes the column
            order of the result.

    Returns:
        The ``.values`` array of ``df[features]`` (one row per row of ``df``).
    """
    selected_columns = df[features]
    return selected_columns.values

def fit_cv_model(df, features, k=10, seed=42):
    """Fit one linear regression per K-fold training subset and average them.

    The rows of ``df`` are split into ``k`` shuffled folds. For every fold a
    ``LinearRegression`` is fitted on the training part only, with the
    ``'shift'`` column as target. The held-out part of each fold is never
    used; the folds only serve to produce ``k`` overlapping training subsets.

    Args:
        df: Training DataFrame containing ``features`` and a ``'shift'`` column.
        features: Column labels used as regressors.
        k: Number of folds (``n_splits`` of ``KFold``).
        seed: ``random_state`` passed to ``KFold`` for a reproducible shuffle.

    Returns:
        Tuple ``(coef, intercept)``: the element-wise mean of the per-fold
        coefficient vectors and the mean of the per-fold intercepts.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=seed)

    fold_models = []
    for fit_rows, _held_out in splitter.split(df):
        fold_frame = df.iloc[fit_rows]
        regressors = expand_features(fold_frame, features)
        target = fold_frame['shift'].values
        fold_models.append(LinearRegression().fit(regressors, target))

    mean_coef = np.mean([m.coef_ for m in fold_models], axis=0)
    mean_intercept = np.mean([m.intercept_ for m in fold_models])
    return mean_coef, mean_intercept

def predict_model(df, coef, intercept, features):
    """Evaluate the linear model ``X @ coef + intercept`` on ``df``.

    Args:
        df: DataFrame containing the columns listed in ``features``.
        coef: Coefficient vector, ordered like ``features``.
        intercept: Scalar intercept added to every prediction.
        features: Column labels that form the design matrix.

    Returns:
        Array with one prediction per row of ``df``.
    """
    design_matrix = expand_features(df, features)
    linear_term = design_matrix @ coef
    return linear_term + intercept



# ===============================================================
# Run: Train + Evaluate (test set restricted to SpO₂ < 95%)
# ===============================================================
# Regressor columns; the printed coefficients follow this order.
features = ['shift_raw', 'log_PiO2', 'SpO2(%)']

# Fold-averaged linear fit on the training frame.
coef_lin, intercept_lin = fit_cv_model(df=train_df, features=features)
print(f"\nLinear model coefficients: {coef_lin.round(4)}\nIntercept: {intercept_lin:.4f}")

# Store the test-set predictions in the column passed below as y_pred_col.
test_df['y_pred'] = predict_model(
    df=test_df,
    coef=coef_lin,
    intercept=intercept_lin,
    features=features,
)

# Metrics come from the project helper; the captured output shows
# macro-averaged per-patient MAE / MSE / RMSE / bias.
summary = evaluate_macro_patient_level(test_df, y_true_col='shift', y_pred_col='y_pred')
print_evaluation(summary)


Linear model coefficients: [ 0.1939 11.7562 -0.2411]
Intercept: -3.7055
Macro-averaged per-patient metrics:
MAE  = 0.804
MSE  = 1.404
RMSE = 0.948
Bias = 0.047


In [None]:
# ===============================================================
# Imports & Paths
# ===============================================================
import os
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.interpolate import PchipInterpolator
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

# File paths
# Every input lives under a single data root. The default is the original
# absolute location; set the THESIS_DATA_ROOT environment variable to run the
# notebook on another machine without editing this cell. An unset or empty
# variable falls back to the default.
DATA_ROOT = Path(os.environ.get("THESIS_DATA_ROOT") or "/Users/sarah/ML/master_thesis")
SHIFT_DIR = DATA_ROOT / "Datasets" / "Shift Calculation"

TRAIN_PATH = SHIFT_DIR / "train_219_with_shift_shunt.csv"
TEST_PATH  = SHIFT_DIR / "test_219_with_shift_shunt.csv"
ODC_PATH   = DATA_ROOT / "ODC" / "Neonatal_ODC_Table.csv"

# ===============================================================
# Load & preprocess data
# ===============================================================
# Read the train and test measurement tables (train first, then test).
train_df, test_df = [pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)]

# ODC table, sorted by saturation and reduced to one row per distinct
# saturation value, so the interpolator's x values are increasing and unique.
odc = (
    pd.read_csv(ODC_PATH)
    .sort_values(by='SO2 (%)')
    .drop_duplicates(subset='SO2 (%)')
)

# SpO₂ → PO₂ mapping
spo2_to_po2 = PchipInterpolator(odc['SO2 (%)'], odc['PO2 (kPa)'])


def to_PcO2(spo2):
    """Evaluate the SpO2 -> PO2 interpolator on ``spo2`` (scalar or array-like)."""
    return spo2_to_po2(np.asarray(spo2))

def compute_shift_raw(row):
    """Return the row's inspired O2 minus the PO2 interpolated from its SpO2.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``'SpO2(%)'`` and ``'Insp.O2(kPa)'``.

    Returns:
        ``row['Insp.O2(kPa)'] - to_PcO2([row['SpO2(%)']])[0]``.
    """
    # to_PcO2 works on array-likes: wrap the single value, then unwrap the result.
    po2_from_spo2 = to_PcO2([row['SpO2(%)']])[0]
    inspired_o2 = row['Insp.O2(kPa)']
    return inspired_o2 - po2_from_spo2

# shift_raw = inspired O2 minus the PO2 that to_PcO2 interpolates for the
# measured SpO2. Computed column-wise: one interpolator call per frame instead
# of one Python-level call per row through DataFrame.apply(axis=1).
train_df['shift_raw'] = train_df['Insp.O2(kPa)'] - to_PcO2(train_df['SpO2(%)'])
test_df['shift_raw'] = test_df['Insp.O2(kPa)'] - to_PcO2(test_df['SpO2(%)'])

# Index both frames by patient ID; the per-patient evaluation groups on the index.
train_df = train_df.rename(columns={'Anon.Patient_ID': 'Patient_ID'}).set_index('Patient_ID')
test_df  = test_df.rename(columns={'Anon.Patient_ID': 'Patient_ID'}).set_index('Patient_ID')

# Filter the test set to SpO₂ < 95% (the training set is not filtered here).
# .copy() makes the filtered frame independent before a column is added to it.
test_df  = test_df[test_df['SpO2(%)'] < 95].copy()

# Create log feature
train_df['log_PiO2'] = np.log(train_df['Insp.O2(kPa)'])
test_df['log_PiO2'] = np.log(test_df['Insp.O2(kPa)'])

# ===============================================================
# Modeling functions
# ===============================================================
def expand_features(df, features):
    """Extract the ``features`` columns of ``df`` as a plain NumPy array.

    Args:
        df: DataFrame holding all requested columns.
        features: Column labels, in the order the output columns should have.

    Returns:
        ``df[features].values``.
    """
    feature_frame = df[features]
    feature_matrix = feature_frame.values
    return feature_matrix

def fit_cv_model(df, features, k=10, seed=42):
    """Average the linear-regression fits obtained on K-fold training subsets.

    ``KFold`` (shuffled, seeded) yields ``k`` train/held-out index pairs over
    the rows of ``df``. A ``LinearRegression`` with target ``'shift'`` is
    fitted on each training part; the held-out indices are discarded.

    Args:
        df: Training DataFrame with the ``features`` columns and ``'shift'``.
        features: Column labels used as regressors.
        k: Number of folds.
        seed: Seed for the fold shuffle.

    Returns:
        ``(coef, intercept)``: mean coefficient vector across folds and mean
        intercept across folds.
    """
    def _fit_on(row_positions):
        # Fit a fresh model on the rows at the given positions.
        subset = df.iloc[row_positions]
        return LinearRegression().fit(
            expand_features(subset, features), subset['shift'].values
        )

    folds = KFold(n_splits=k, shuffle=True, random_state=seed)
    fitted = [_fit_on(fit_rows) for fit_rows, _unused in folds.split(df)]

    coef_stack = [model.coef_ for model in fitted]
    intercept_stack = [model.intercept_ for model in fitted]
    return np.mean(coef_stack, axis=0), np.mean(intercept_stack)

def predict_model(df, coef, intercept, features):
    """Return linear-model predictions ``X @ coef + intercept`` for ``df``.

    Args:
        df: DataFrame with the columns named in ``features``.
        coef: Coefficient vector aligned with ``features``.
        intercept: Scalar offset.
        features: Column labels used to build ``X``.

    Returns:
        One prediction per row of ``df``.
    """
    weighted_sum = expand_features(df, features) @ coef
    return weighted_sum + intercept

def evaluate_macro_patient_level(df, coef, intercept, features):
    """Print macro-averaged per-patient error metrics for the linear model.

    Predictions are computed on a copy of ``df``, rows are grouped by the
    frame's index value, MAE / MSE / RMSE / bias are computed inside each
    group, and the unweighted mean over groups is printed for each metric.

    NOTE(review): this definition has the same name as the
    ``evaluate_macro_patient_level`` imported from ``utils.evaluation`` in the
    first cell, which is called there with ``y_true_col`` / ``y_pred_col``
    keyword arguments. Running this cell rebinds the name, so the earlier
    cell's call no longer matches this signature until the import is re-run.

    Args:
        df: DataFrame indexed by patient, with ``features`` and ``'shift'``.
        coef: Coefficient vector aligned with ``features``.
        intercept: Scalar intercept.
        features: Column labels used as regressors.

    Returns:
        None. The metrics are printed only.
    """
    scored = df.copy()  # leave the caller's frame untouched
    scored['y_true'] = scored['shift']
    scored['y_pred'] = predict_model(scored, coef, intercept, features)

    def _mae(group):
        return mean_absolute_error(group['y_true'], group['y_pred'])

    def _mse(group):
        return mean_squared_error(group['y_true'], group['y_pred'])

    def _bias(group):
        # Signed mean error: positive when the model over-predicts.
        return np.mean(group['y_pred'] - group['y_true'])

    by_patient = scored.groupby(scored.index)
    mae_per_patient = by_patient.apply(_mae)
    mse_per_patient = by_patient.apply(_mse)
    # RMSE is taken per patient first and averaged afterwards.
    rmse_per_patient = np.sqrt(mse_per_patient)
    bias_per_patient = by_patient.apply(_bias)

    mae_macro = mae_per_patient.mean()
    mse_macro = mse_per_patient.mean()
    rmse_macro = rmse_per_patient.mean()
    bias_macro = bias_per_patient.mean()

    print("Macro-averaged per-patient metrics:")
    print(f"MAE  = {mae_macro:.3f}")
    print(f"MSE  = {mse_macro:.3f}")
    print(f"RMSE = {rmse_macro:.3f}")
    print(f"Bias = {bias_macro:.3f}")

# ===============================================================
# Run: Train + Evaluate (test set restricted to SpO₂ < 95%)
# ===============================================================
# Regressor columns; the printed coefficients follow this order.
features = ['shift_raw', 'log_PiO2', 'SpO2(%)']

# Fold-averaged linear fit on the training frame.
coef_lin, intercept_lin = fit_cv_model(df=train_df, features=features)
print(f"\nLinear model coefficients: {coef_lin.round(4)}\nIntercept: {intercept_lin:.4f}")

# Print the macro-averaged per-patient metrics on the filtered test frame.
evaluate_macro_patient_level(
    df=test_df,
    coef=coef_lin,
    intercept=intercept_lin,
    features=features,
)



Linear model coefficients: [ 0.1939 11.7562 -0.2411]
Intercept: -3.7055
Macro-averaged per-patient metrics:
MAE  = 0.804
MSE  = 1.404
RMSE = 0.948
Bias = 0.047
