In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df=pd.read_csv('data/processed_data.csv')
X=df.drop(columns='preeclampsia')
y=df['preeclampsia']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("train shape:", X_train.shape)
print("test shape:", X_test.shape)

train shape: (2277, 8)
test shape: (570, 8)


In [11]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced'),
    'SVM': SVC(kernel='rbf', probability=True, class_weight='balanced'),
    'XGBoost': XGBClassifier(eval_metric='logloss')
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"trained {name}")

trained LogisticRegression
trained RandomForest
trained SVM
trained XGBoost


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(name, model, X_test, y_test):
    y_pred=model.predict(X_test)
    y_prob=model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    acc=accuracy_score(y_test, y_pred)
    prec=precision_score(y_test, y_pred)
    rec=recall_score(y_test, y_pred)
    f1=f1_score(y_test, y_pred)
    auc=roc_auc_score(y_test, y_prob) if y_prob is not None else "N/A"

    print(f"\n{name} Performance:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"AUC-ROC:   {auc}")

for name, model in models.items():
    evaluate_model(name, model, X_test, y_test)


LogisticRegression Performance:
Accuracy:  0.8825
Precision: 0.3366
Recall:    1.0000
F1 Score:  0.5037
AUC-ROC:   0.9530838454784899

RandomForest Performance:
Accuracy:  0.9632
Precision: 0.7097
Recall:    0.6471
F1 Score:  0.6769
AUC-ROC:   0.9823858647936787

SVM Performance:
Accuracy:  0.9053
Precision: 0.3810
Recall:    0.9412
F1 Score:  0.5424
AUC-ROC:   0.9739354697102722

XGBoost Performance:
Accuracy:  0.9544
Precision: 0.6250
Recall:    0.5882
F1 Score:  0.6061
AUC-ROC:   0.9787642669007901


In [14]:
import joblib
import os

os.makedirs('models/', exist_ok=True)

for name, model in models.items():
    joblib.dump(model, f'models/{name}.pkl')
    print(f"Saved {name}")


Saved LogisticRegression
Saved RandomForest
Saved SVM
Saved XGBoost


In [15]:
print(df['preeclampsia'].value_counts(normalize=True))


preeclampsia
0    0.939937
1    0.060063
Name: proportion, dtype: float64


In [16]:
df.columns


Index(['preeclampsia', 'systolic_bp', 'mean_arterial_pressure', 'diastolic_bp',
       'pulse_pressure', 'serum_uric_acid', 'uric_acid_creatinine_ratio',
       'proteinuria_24h', 'protein_creatinine_ratio'],
      dtype='object')