In [None]:
import pandas as pd

# STEP 1: Load diagnoses_icd
diag = pd.read_csv("path_to_mimic/mimic-iv-2.1/hosp/diagnoses_icd.csv", usecols=["subject_id", "icd_code"])

# STEP 2: Function to identify sepsis ICD codes
def is_sepsis(code):
    code = str(code)
    return code.startswith("A41") or code.startswith("R65")

diag["sepsis_label"] = diag["icd_code"].apply(is_sepsis)

# STEP 3: Aggregate to one row per subject
sepsis_labels = diag.groupby("subject_id")["sepsis_label"].max().reset_index()
print(sepsis_labels["sepsis_label"].value_counts())


sepsis_label
False    174236
True       6415
Name: count, dtype: int64


In [None]:
sepsis_labels.to_csv("H:/path_to_mimic/sepsis_labels.csv", index=False)


In [None]:
d_items = pd.read_csv("H:/path_to_mimic/archive/mimic-iv-2.1/icu/d_items.csv")

keywords = ["heart rate", "blood pressure", "spo2", "temperature", "respiratory rate"]
for key in keywords:
    print(f"\n--- Matching for: {key} ---")
    print(d_items[d_items["label"].str.lower().str.contains(key)])



--- Matching for: heart rate ---
   itemid                    label     abbreviation      linksto  \
2  220045               Heart Rate               HR  chartevents   
3  220046  Heart rate Alarm - High  HR Alarm - High  chartevents   
4  220047   Heart Rate Alarm - Low   HR Alarm - Low  chartevents   

              category unitname param_type  lownormalvalue  highnormalvalue  
2  Routine Vital Signs      bpm    Numeric             NaN              NaN  
3               Alarms      bpm    Numeric             NaN              NaN  
4               Alarms      bpm    Numeric             NaN              NaN  

--- Matching for: blood pressure ---
      itemid                                     label         abbreviation  \
6     220050          Arterial Blood Pressure systolic                 ABPs   
7     220051         Arterial Blood Pressure diastolic                 ABPd   
8     220052              Arterial Blood Pressure mean                 ABPm   
9     220056       Arterial

In [None]:
import pandas as pd

# Define all selected itemids
vital_itemids = [
    220045,  # Heart Rate
    220050, 220179,  # Systolic
    220051, 220180,  # Diastolic
    220052, 220181,  # Mean BP
    226253,          # SpO2
    223761, 223762,  # Temp
    220210, 224690   # Resp Rate
]

# Define path
chartevents_path = "H:/path_to_mimic/archive/mimic-iv-2.1/icu/chartevents.csv"

# Create storage
feature_rows = []

# Chunked reading
chunks = pd.read_csv(chartevents_path, usecols=["subject_id", "itemid", "valuenum"], chunksize=1000000)

for chunk in chunks:
    chunk = chunk[chunk["itemid"].isin(vital_itemids)]
    # Simple aggregation
    stats = chunk.groupby(["subject_id", "itemid"])["valuenum"].agg(["mean", "min", "max"]).reset_index()
    feature_rows.append(stats)

# Combine all chunks
vital_stats = pd.concat(feature_rows)

# … everything up to vital_stats = pd.concat(feature_rows) stays the same …

# Now build the wide table with automatic aggregation:
features = (
    vital_stats
    .pivot_table(
        index="subject_id",
        columns="itemid",
        values=["mean", "min", "max"],
        aggfunc="mean"      # when duplicates exist, take the mean of means/mins/maxs
    )
)

# Flatten the MultiIndex columns:
features.columns = [f"{stat}_{itemid}" for stat, itemid in features.columns]
features = features.reset_index()

# (Optional) save to CSV
features.to_csv("H:/path_to_mimic/vitals_features.csv", index=False)



In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ─── 1) Load features & labels ──────────────────────────────────────────────────
features = pd.read_csv("H:/path_to_mimic/vitals_features.csv")    # subject_id + feature cols
labels   = pd.read_csv("H:/path_to_mimic/sepsis_labels.csv")      # subject_id + sepsis_label

# List your fixed 10 test patients
TEST_IDS = [
    10009628, 10024982, 10007058, 10011668, 10014729,
    10019350, 10002443, 10017285, 10018328, 10008077
]

# ─── 2) Split labels into train/test label sets ────────────────────────────────
labels_train = labels[~labels["subject_id"].isin(TEST_IDS)]
labels_test  = labels[ labels["subject_id"].isin(TEST_IDS)]

print(f"Labels: {labels.shape[0]} total, {labels_train.shape[0]} train, {labels_test.shape[0]} test")

# ─── 3) Merge features INTO each set ────────────────────────────────────────────
#   - use left join so every label keeps its row, even if features are missing
train = labels_train.merge(features, on="subject_id", how="left")
test  = labels_test .merge(features, on="subject_id", how="left")

# Identify feature columns
feat_cols = [c for c in train.columns if c not in ("subject_id","sepsis_label")]

# ─── 4) Impute missing feature values ───────────────────────────────────────────
# Compute medians on training data
medians = train[feat_cols].median()

# Fill NaNs in both train & test using training medians
train[feat_cols] = train[feat_cols].fillna(medians)
test[feat_cols]  = test[feat_cols].fillna(medians)

# (Optional) if any remain, fill with zero
train[feat_cols] = train[feat_cols].fillna(0)
test[feat_cols]  = test[feat_cols].fillna(0)

print(f"After merge+impute → train shape {train.shape}, test shape {test.shape}")

# ─── 5) Train & Evaluate ───────────────────────────────────────────────────────
X_train, y_train = train[feat_cols], train["sepsis_label"]
X_test,  y_test  = test [feat_cols],  test ["sepsis_label"]

clf = LogisticRegression(max_iter=500, class_weight="balanced")
clf.fit(X_train, y_train)

print("\nTest classification report on your 10 patients:")
print(classification_report(y_test, clf.predict(X_test)))


Labels: 180651 total, 180641 train, 10 test
After merge+impute → train shape (180641, 38), test shape (10, 38)

Test classification report on your 10 patients:
              precision    recall  f1-score   support

       False       1.00      0.40      0.57        10
        True       0.00      0.00      0.00         0

    accuracy                           0.40        10
   macro avg       0.50      0.20      0.29        10
weighted avg       1.00      0.40      0.57        10



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
import joblib

# assume `log_pipeline` is the Pipeline you fitted:
# Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(...))])
joblib.dump(log_pipeline, "sepsis_model.pkl")
print("Model saved to sepsis_model.pkl")


Model saved to sepsis_model.pkl


In [None]:
import pandas as pd
import joblib

# ─── 1) Load your precomputed features and pipeline ─────────────────────────────
features = pd.read_csv("H:/path_to_mimic/vitals_features.csv")
pipeline = joblib.load("sepsis_model.pkl")   # your saved sklearn Pipeline

# ─── 2) Define your test patient IDs ────────────────────────────────────────────
test_ids = [
    10009628, 10024982, 10007058, 10011668, 10014729,
    10019350, 10002443, 10017285, 10018328, 10008077
]

# ─── 3) Filter to just those patients ────────────────────────────────────────────
test_df = features[features["subject_id"].isin(test_ids)].copy()

# ─── 4) Prepare feature matrix ──────────────────────────────────────────────────
feat_cols = [c for c in test_df.columns if c != "subject_id"]
X_test = test_df[feat_cols]

# If you used median imputation during training, repeat it here:
# medians = pd.Series(...)  # compute or hard‐code your training medians
# X_test = X_test.fillna(medians).fillna(0)

# Or simply:
X_test = X_test.fillna(0)

# ─── 5) Compute risk probabilities ───────────────────────────────────────────────
test_df["risk_score"] = pipeline.predict_proba(X_test)[:,1]

# ─── 6) Display the results ──────────────────────────────────────────────────────
print(test_df[["subject_id", "risk_score"]])


     subject_id  risk_score
11     10002443    0.511466
43     10007058    0.276585
49     10008077    0.518878
56     10009628    0.393179
66     10011668    0.438867
86     10014729    0.374077
94     10017285    0.776982
102    10018328    0.071844
110    10019350    0.221950
148    10024982    0.967730


In [16]:
import joblib

# Load model
model = joblib.load('sepsis_model.pkl')

# If it was trained using a pipeline with a feature selector or transformer
if hasattr(model, 'named_steps'):
    print(model.named_steps)

# Most reliable: check the feature names from training
# This only works if you saved feature names during training
print(model.feature_names_in_)  # Available in scikit-learn 1.0+


{'scaler': StandardScaler(), 'clf': LogisticRegression(class_weight='balanced', max_iter=2000, random_state=42,
                   solver='saga')}
['max_220045' 'max_220050' 'max_220051' 'max_220052' 'max_220179'
 'max_220180' 'max_220181' 'max_220210' 'max_223761' 'max_223762'
 'max_224690' 'max_226253' 'mean_220045' 'mean_220050' 'mean_220051'
 'mean_220052' 'mean_220179' 'mean_220180' 'mean_220181' 'mean_220210'
 'mean_223761' 'mean_223762' 'mean_224690' 'mean_226253' 'min_220045'
 'min_220050' 'min_220051' 'min_220052' 'min_220179' 'min_220180'
 'min_220181' 'min_220210' 'min_223761' 'min_223762' 'min_224690'
 'min_226253']


In [17]:
import joblib, numpy as np, pandas as pd

# 1) Load your pipeline
pipeline = joblib.load("sepsis_model.pkl")
scaler   = pipeline.named_steps["scaler"]
clf      = pipeline.named_steps["clf"]

# 2) Compute raw weights
feat_names      = scaler.feature_names_in_
means, scales   = scaler.mean_, scaler.scale_
coefs_scaled    = clf.coef_[0]
intercept_scaled= clf.intercept_[0]

coefs_raw     = coefs_scaled / scales
intercept_raw = intercept_scaled - np.sum(coefs_scaled * (means / scales))

# 3) Build a DataFrame
df = pd.DataFrame({
    "feature" : ["intercept"] + list(feat_names),
    "weight"  : [intercept_raw] + list(coefs_raw)
})

# 4) Save to CSV
df.to_csv("lr_weights.csv", index=False)
print("Saved lr_weights.csv")


Saved lr_weights.csv


In [6]:
print(test["sepsis_label"].value_counts())


sepsis_label
False    10
Name: count, dtype: int64


Test label balance:
 sepsis_label
True     4
False    1
Name: count, dtype: int64
Train size: 50929 Test size: 5


In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# ─── 1) Load features & labels ─────────────────────────────────────────────────
features = pd.read_csv("H:/path_to_mimic/vitals_features.csv")
labels   = pd.read_csv("H:/path_to_mimic/sepsis_labels.csv")

# ─── 2) Build stratified 10-patient test set (5 septic + 5 non-septic) ──────────
septic_ids    = labels[ labels.sepsis_label ==  True]["subject_id"].unique()
nonseptic_ids = labels[ labels.sepsis_label == False]["subject_id"].unique()
rng = np.random.default_rng(42)
test_septic    = rng.choice(septic_ids,    size=5, replace=False)
test_nonseptic = rng.choice(nonseptic_ids, size=5, replace=False)
TEST_IDS = np.concatenate([test_septic, test_nonseptic])

# ─── 3) Split labels & merge features ───────────────────────────────────────────
labels_train = labels[~labels.subject_id.isin(TEST_IDS)]
labels_test  = labels[ labels.subject_id.isin(TEST_IDS)]

train_df = labels_train.merge(features, on="subject_id", how="left")
test_df  = labels_test .merge(features, on="subject_id", how="left")

# ─── 4) Identify feature columns ────────────────────────────────────────────────
feat_cols = [c for c in train_df.columns if c not in ("subject_id","sepsis_label")]

# ─── 5) Drop columns with no data in training set ──────────────────────────────
medians    = train_df[feat_cols].median()
empty_cols = medians[medians.isna()].index.tolist()
if empty_cols:
    print("Dropping empty cols:", empty_cols)
    train_df.drop(columns=empty_cols, inplace=True)
    test_df.drop(columns=empty_cols,  inplace=True)
    feat_cols = [c for c in feat_cols if c not in empty_cols]
    medians   = medians.drop(index=empty_cols)

# ─── 6) Impute remaining missing with train-set medians ─────────────────────────
train_df[feat_cols] = train_df[feat_cols].fillna(medians)
test_df [feat_cols] = test_df [feat_cols].fillna(medians)

# (Optional) fill any stragglers with zero
train_df[feat_cols] = train_df[feat_cols].fillna(0)
test_df [feat_cols] = test_df [feat_cols].fillna(0)

# ─── 7) Prepare X/y ─────────────────────────────────────────────────────────────
X_train, y_train = train_df[feat_cols], train_df["sepsis_label"]
X_test,  y_test  = test_df [feat_cols],  test_df ["sepsis_label"]

print(f"Train size: {X_train.shape[0]}   Test size: {X_test.shape[0]}")
print("Test label distribution:\n", y_test.value_counts(), "\n")

# ─── 8) Logistic Regression with scaling ────────────────────────────────────────
log_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf",    LogisticRegression(
                  solver="saga",
                  max_iter=2000,
                  class_weight="balanced",
                  random_state=42
              ))
])
log_pipeline.fit(X_train, y_train)
print("Logistic Regression (scaled) results:")
print(classification_report(y_test, log_pipeline.predict(X_test)))

# ─── 9) Random Forest baseline ─────────────────────────────────────────────────
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42
)
rf.fit(X_train, y_train)
print("Random Forest results:")
print(classification_report(y_test, rf.predict(X_test)))


Train size: 180641   Test size: 10
Test label distribution:
 sepsis_label
True     5
False    5
Name: count, dtype: int64 





Logistic Regression (scaled) results:
              precision    recall  f1-score   support

       False       0.71      1.00      0.83         5
        True       1.00      0.60      0.75         5

    accuracy                           0.80        10
   macro avg       0.86      0.80      0.79        10
weighted avg       0.86      0.80      0.79        10

Random Forest results:
              precision    recall  f1-score   support

       False       0.50      1.00      0.67         5
        True       0.00      0.00      0.00         5

    accuracy                           0.50        10
   macro avg       0.25      0.50      0.33        10
weighted avg       0.25      0.50      0.33        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
