# Baseline Linear Regression

In [2]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Project root is two directory levels above the current working directory
# (assumes the kernel was started in the notebook's own folder — TODO confirm).
PROJECT_ROOT = Path().resolve().parents[1]
RESULTS_PATH = PROJECT_ROOT / "results"

# Make the project's `utils` package importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))


from utils.data_loading import load_datasets
from utils.data_preparation import get_spo2_to_po2_interpolator, add_shift_raw_column
from utils.evaluation import evaluate_macro_patient_level, print_evaluation, bland_altman_plots
from utils.modeling import fit_cv_models, predict_cv_ensemble
from utils.logging import log_run_json


# Load the three data splits plus the `odc` object, and build the
# SpO2 -> PO2 interpolator from `odc`.
train_df, test_df, val_df, odc = load_datasets()
spo2_to_po2 = get_spo2_to_po2_interpolator(odc)

# Pass each split through add_shift_raw_column, in train / test / val order.
train_df, test_df, val_df = (
    add_shift_raw_column(split, spo2_to_po2)
    for split in (train_df, test_df, val_df)
)

# Effect of Different Feature Sets

## With Fixed Shift_raw

In [3]:
import itertools
import pandas as pd
from sklearn.linear_model import LinearRegression


def add_engineered_features(df):
    """Return a copy of ``df`` with the engineered regression features added.

    The input frame is not modified: all new columns are written to a copy, so
    the raw ``train_df`` / ``test_df`` splits keep their original columns and
    the calling cell can be re-run without side effects.

    Columns added (derived from 'PiO2(kPa)', 'SpO2(%)' and 'Hb'):
    log_PiO2, log_SpO2, SpO2_over_PiO2, SpO2_squared, Hb_SpO2,
    saturation_deficit, CaO2_estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PiO2(kPa)', 'SpO2(%)' and 'Hb'.

    Returns
    -------
    pandas.DataFrame
        The augmented copy after it has been passed through
        ``add_shift_raw_column`` with the notebook-level ``spo2_to_po2``.
    """
    df = df.copy()  # never write engineered columns into the caller's frame

    # np.log gives -inf for 0 and NaN for negative inputs; the inputs are
    # assumed to be strictly positive here.
    df['log_PiO2'] = np.log(df['PiO2(kPa)'])
    df['log_SpO2'] = np.log(df['SpO2(%)'])
    df['SpO2_over_PiO2'] = df['SpO2(%)'] / df['PiO2(kPa)']
    df['SpO2_squared'] = df['SpO2(%)'] ** 2
    df['Hb_SpO2'] = df['Hb'] * df['SpO2(%)']
    df['saturation_deficit'] = 100 - df['SpO2(%)']
    # NOTE(review): looks like an oxygen-content style estimate; confirm that
    # the 1.34 and 0.0031 constants match the units of Hb and PiO2(kPa).
    df['CaO2_estimate'] = (1.34 * df['Hb'] * df['SpO2(%)'] / 100) + (0.0031 * df['PiO2(kPa)'])
    df = add_shift_raw_column(df, spo2_to_po2)  # if not already present
    return df


# Engineered versions of the train/test splits used by the subset search below.
train_df_engineered = add_engineered_features(train_df)
test_df_engineered = add_engineered_features(test_df)

# Optional candidates; 'shift_raw' is prepended to every subset built from them.
# Some candidates are exact linear functions of others as computed in
# add_engineered_features (saturation_deficit = 100 - SpO2(%);
# CaO2_estimate = 0.0134 * Hb_SpO2 + 0.0031 * PiO2(kPa)), so subsets that differ
# only by such swaps can yield identical metrics when an intercept is fitted.
base_features = ['PiO2(kPa)', 'SpO2(%)', 'Hb', 'log_PiO2', 'log_SpO2', 'SpO2_over_PiO2', 
                 'SpO2_squared', 'Hb_SpO2', 'saturation_deficit', 'CaO2_estimate']

mandatory_feature = ['shift_raw']

# Generate all combinations of additional features (1 to N), always adding shift_raw
feature_combinations = [
    mandatory_feature + list(combo)
    for r in range(1, len(base_features) + 1)
    for combo in itertools.combinations(base_features, r)
]


# Evaluate each feature set
# NOTE(review): subsets are scored on test_df, while val_df is loaded earlier but
# not used in this cell. Ranking on the test split makes the best score
# optimistic — confirm this is intended.
results = []

for features in feature_combinations:
    features = list(features)
    # fit_cv_models is handed the estimator class itself (presumably it builds one
    # model per fold with k=10 — confirm in utils.modeling), so no instance is
    # created here.
    cv_models = fit_cv_models(train_df_engineered, features=features, target_col='shift', model_class=LinearRegression, k=10)
    # The prediction column is overwritten on every iteration.
    test_df_engineered['y_pred'] = predict_cv_ensemble(test_df_engineered, features, cv_models)

    summary = evaluate_macro_patient_level(test_df_engineered, y_true_col='shift', y_pred_col='y_pred')
    results.append({
        'features': features,
        'MAE': summary['MAE'],
        'MSE': summary['MSE'],
        'RMSE': summary['RMSE'],
        'Bias': summary['Mean Bias Error']
    })

# Convert results to DataFrame for inspection (lowest MAE first)
results_df = pd.DataFrame(results).sort_values(by='MAE')


In [4]:
results_df

Unnamed: 0,features,MAE,MSE,RMSE,Bias
281,"[shift_raw, SpO2(%), log_PiO2, log_SpO2, SpO2_...",0.890621,1.733317,1.081507,0.024642
551,"[shift_raw, SpO2(%), log_PiO2, log_SpO2, SpO2_...",0.890621,1.733317,1.081507,0.024642
355,"[shift_raw, log_PiO2, log_SpO2, SpO2_squared, ...",0.890621,1.733317,1.081507,0.024642
150,"[shift_raw, log_PiO2, SpO2_squared, saturation...",0.891493,1.736509,1.082282,0.024563
100,"[shift_raw, SpO2(%), log_PiO2, SpO2_squared]",0.891493,1.736509,1.082282,0.024563
...,...,...,...,...,...
16,"[shift_raw, PiO2(kPa), Hb_SpO2]",1.123795,3.710883,1.312705,-0.074786
18,"[shift_raw, PiO2(kPa), CaO2_estimate]",1.123795,3.710883,1.312705,-0.074786
89,"[shift_raw, PiO2(kPa), Hb_SpO2, CaO2_estimate]",1.123795,3.710883,1.312705,-0.074786
53,"[shift_raw, Hb_SpO2, CaO2_estimate]",1.123795,3.710883,1.312705,-0.074786


In [6]:
# Round every metric column to 3 decimals (the 'features' column is left as-is)
# and export the first 10 rows of results_df to CSV.
results_df_rounded = results_df.copy()

for col in results_df.columns:
    if col != 'features':
        # errors='coerce' turns any non-numeric entry into NaN instead of raising
        results_df_rounded[col] = pd.to_numeric(results_df[col], errors='coerce').round(3)

# to_csv does not create missing directories, so make sure the target exists.
RESULTS_PATH.mkdir(parents=True, exist_ok=True)
results_df_rounded.head(10).to_csv(RESULTS_PATH / "linear_regression_subsets.csv", index=False)

In [39]:
from collections import Counter

# Tally how often each feature occurs among the top_n lowest-MAE subsets.
top_n = 10
top_features = results_df.nsmallest(top_n, 'MAE')['features']

flat_features = list(itertools.chain.from_iterable(top_features))
freq_counter = Counter(flat_features)

# Features ordered from most to least frequent in the top-N models
freq_counter.most_common()


[('shift_raw', 10),
 ('log_PiO2', 7),
 ('log_SpO2', 7),
 ('SpO2_squared', 7),
 ('SpO2(%)', 6),
 ('saturation_deficit', 6),
 ('SpO2_over_PiO2', 3)]

## Without fixed Shift_raw

In [40]:
import itertools
import pandas as pd
from sklearn.linear_model import LinearRegression

# Add all engineered features (log, ratio, squared and interaction terms) plus shift_raw
def add_engineered_features(df):
    """Return a copy of ``df`` with the engineered regression features added.

    NOTE(review): this body is the same as the definition in the
    "With Fixed Shift_raw" cell; consider defining the function once.

    The caller's frame is left unchanged: new columns are written to a copy,
    so re-running this cell does not alter ``train_df`` / ``test_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PiO2(kPa)', 'SpO2(%)' and 'Hb'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with log_PiO2, log_SpO2, SpO2_over_PiO2, SpO2_squared,
        Hb_SpO2, saturation_deficit and CaO2_estimate added, after being passed
        through ``add_shift_raw_column`` with the notebook-level ``spo2_to_po2``.
    """
    df = df.copy()  # avoid mutating the frame owned by the caller

    # Log transforms assume strictly positive inputs (np.log(0) is -inf).
    df['log_PiO2'] = np.log(df['PiO2(kPa)'])
    df['log_SpO2'] = np.log(df['SpO2(%)'])
    df['SpO2_over_PiO2'] = df['SpO2(%)'] / df['PiO2(kPa)']
    df['SpO2_squared'] = df['SpO2(%)'] ** 2
    df['Hb_SpO2'] = df['Hb'] * df['SpO2(%)']
    df['saturation_deficit'] = 100 - df['SpO2(%)']
    # NOTE(review): confirm the 1.34 / 0.0031 constants against the units of
    # Hb and PiO2(kPa).
    df['CaO2_estimate'] = (1.34 * df['Hb'] * df['SpO2(%)'] / 100) + (0.0031 * df['PiO2(kPa)'])
    df = add_shift_raw_column(df, spo2_to_po2)  # if not already present
    return df


# Engineered versions of the train/test splits used by the subset search below.
train_df_engineered = add_engineered_features(train_df)
test_df_engineered = add_engineered_features(test_df)

# Here 'shift_raw' is an ordinary candidate: it is NOT forced into every subset.
base_features = ['shift_raw', 'PiO2(kPa)', 'SpO2(%)', 'Hb', 'log_PiO2', 'log_SpO2', 'SpO2_over_PiO2', 
                 'SpO2_squared', 'Hb_SpO2', 'saturation_deficit', 'CaO2_estimate']

# Every non-empty subset of base_features, as tuples of column names.
feature_combinations = list(itertools.chain.from_iterable(
    itertools.combinations(base_features, r) for r in range(1, len(base_features) + 1)
))


# Evaluate each feature set
# NOTE(review): subsets are scored on test_df, while val_df is loaded earlier but
# not used in this cell. Ranking on the test split makes the best score
# optimistic — confirm this is intended.
results = []

for features in feature_combinations:
    features = list(features)  # combinations() yields tuples; downstream gets a list
    # fit_cv_models takes the estimator class, so no model instance is built here.
    cv_models = fit_cv_models(train_df_engineered, features=features, target_col='shift', model_class=LinearRegression, k=10)
    # The prediction column is overwritten on every iteration.
    test_df_engineered['y_pred'] = predict_cv_ensemble(test_df_engineered, features, cv_models)

    summary = evaluate_macro_patient_level(test_df_engineered, y_true_col='shift', y_pred_col='y_pred')
    results.append({
        'features': features,
        'MAE': summary['MAE'],
        'MSE': summary['MSE'],
        'RMSE': summary['RMSE'],
        'Bias': summary['Mean Bias Error']
    })

# Convert results to DataFrame for inspection (lowest MAE first)
results_df = pd.DataFrame(results).sort_values(by='MAE')


In [41]:
results_df

Unnamed: 0,features,MAE,MSE,RMSE,Bias
531,"[log_PiO2, log_SpO2, SpO2_squared, saturation_...",0.885012,1.667355,1.078825,0.018545
457,"[SpO2(%), log_PiO2, log_SpO2, SpO2_squared]",0.885012,1.667355,1.078825,0.018545
937,"[SpO2(%), log_PiO2, log_SpO2, SpO2_squared, sa...",0.885012,1.667355,1.078825,0.018545
667,"[shift_raw, SpO2(%), log_PiO2, log_SpO2, SpO2_...",0.890621,1.733317,1.081507,0.024642
1189,"[shift_raw, SpO2(%), log_PiO2, log_SpO2, SpO2_...",0.890621,1.733317,1.081507,0.024642
...,...,...,...,...,...
37,"[SpO2(%), CaO2_estimate]",2.794557,19.905546,2.861645,0.851007
55,"[log_SpO2, CaO2_estimate]",2.794646,19.867128,2.859377,0.848745
42,"[Hb, Hb_SpO2]",2.803907,21.171579,2.877010,0.866026
44,"[Hb, CaO2_estimate]",2.815700,21.306687,2.871326,0.870762


In [42]:
from collections import Counter

# Count feature occurrences across the top_n subsets with the smallest MAE.
top_n = 10
top_features = results_df.nsmallest(top_n, 'MAE')['features']

flat_features = list(itertools.chain.from_iterable(top_features))
freq_counter = Counter(flat_features)

# Features ordered from most to least frequent in the top-N models
freq_counter.most_common()


[('log_PiO2', 10),
 ('SpO2_squared', 10),
 ('log_SpO2', 7),
 ('shift_raw', 7),
 ('saturation_deficit', 6),
 ('SpO2(%)', 6)]

## Simplistic Feature Selection

In [43]:
import itertools
import pandas as pd
from sklearn.linear_model import LinearRegression


# Ensure shift_raw, log_PiO2 and log_SpO2 are added
def add_engineered_features(df):
    """Return a copy of ``df`` with the two log-transformed inputs added.

    Reduced variant for this section: only ``log_PiO2`` and ``log_SpO2`` are
    computed. It reuses the name of the fuller definitions in earlier cells
    and therefore replaces them once this cell has run.

    The caller's frame is left unchanged; new columns go into a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PiO2(kPa)' and 'SpO2(%)'.

    Returns
    -------
    pandas.DataFrame
        The augmented copy after it has been passed through
        ``add_shift_raw_column`` with the notebook-level ``spo2_to_po2``.
    """
    df = df.copy()  # avoid mutating the frame owned by the caller

    # Log transforms assume strictly positive inputs (np.log(0) is -inf).
    df['log_PiO2'] = np.log(df['PiO2(kPa)'])
    df['log_SpO2'] = np.log(df['SpO2(%)'])
    df = add_shift_raw_column(df, spo2_to_po2)
    return df

# Engineered versions of the train/test splits used by the subset search below.
train_df_engineered = add_engineered_features(train_df)
test_df_engineered = add_engineered_features(test_df)

# Feature sets to evaluate: every non-empty subset of these five columns.
base_features = ['PiO2(kPa)', 'SpO2(%)', 'Hb', 'shift_raw', 'log_PiO2']
feature_combinations = list(itertools.chain.from_iterable(
    itertools.combinations(base_features, r) for r in range(1, len(base_features) + 1)
))

# Evaluate each feature set
# NOTE(review): subsets are scored on test_df, while val_df is loaded earlier but
# not used in this cell. Ranking on the test split makes the best score
# optimistic — confirm this is intended.
results = []

for features in feature_combinations:
    features = list(features)  # combinations() yields tuples; downstream gets a list
    # fit_cv_models takes the estimator class, so no model instance is built here.
    cv_models = fit_cv_models(train_df_engineered, features=features, target_col='shift', model_class=LinearRegression, k=10)
    # The prediction column is overwritten on every iteration.
    test_df_engineered['y_pred'] = predict_cv_ensemble(test_df_engineered, features, cv_models)

    summary = evaluate_macro_patient_level(test_df_engineered, y_true_col='shift', y_pred_col='y_pred')
    # Unlike the two exhaustive searches above, this table omits MSE and keeps
    # the full 'Mean Bias Error' key as its column name.
    results.append({
        'features': features,
        'MAE': summary['MAE'],
        'RMSE': summary['RMSE'],
        'Mean Bias Error': summary['Mean Bias Error']
    })


# Lowest MAE first
results_df = pd.DataFrame(results).sort_values(by='MAE')


In [44]:
results_df

Unnamed: 0,features,MAE,RMSE,Mean Bias Error
23,"[SpO2(%), shift_raw, log_PiO2]",0.92478,1.118792,0.034316
27,"[PiO2(kPa), SpO2(%), shift_raw, log_PiO2]",0.966574,1.163226,0.025776
14,"[shift_raw, log_PiO2]",0.972359,1.175547,0.070456
11,"[SpO2(%), log_PiO2]",0.972649,1.173108,0.003464
20,"[PiO2(kPa), shift_raw, log_PiO2]",0.987294,1.186322,0.068623
17,"[PiO2(kPa), SpO2(%), log_PiO2]",1.010212,1.214646,0.006205
29,"[SpO2(%), Hb, shift_raw, log_PiO2]",1.011853,1.210073,-0.086738
24,"[Hb, shift_raw, log_PiO2]",1.012803,1.216901,0.013944
7,"[PiO2(kPa), shift_raw]",1.015418,1.206206,0.068723
16,"[PiO2(kPa), SpO2(%), shift_raw]",1.016671,1.212357,0.028073
