%pip install wittgenstein pandas scikit-learn
%pip install imblearn

In [9]:
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report, confusion_matrix
    import wittgenstein as lw
    from imblearn.over_sampling import SMOTENC
    
    # Load the monitoring dataset from the working directory.
    df = pd.read_csv("newborn_health_monitoring_with_risk.csv")

    # Columns excluded from the model inputs. 'apgar_score' is part of this list,
    # so the per-baby forward-fill that used to run just before the drop never
    # reached the model and has been removed (it also made the cell fail with a
    # KeyError on a missing column, which errors="ignore" below is meant to tolerate).
    cols_to_drop = ['Unnamed: 0',
                   'baby_id',
                   'name',
                   'date',
                   'jaundice_level_mg_dl',
                   'apgar_score',
                   'gestational_age_weeks',
                   'birth_weight_kg',
                   'birth_length_cm',
                   'birth_head_circumference_cm']
    df = df.drop(columns=cols_to_drop, errors="ignore")
    print(df.shape)

    # NOTE(review): 'baby_id' is discarded here, before any splitting. If the CSV
    # holds several rows per baby, a row-level random split can place the same
    # baby in both train and hold-out sets -- confirm whether a grouped split is needed.
    target = "risk_level"
    X = df.drop(columns=[target])
    y = df[target]
    
    # Intended share of the full dataset for each partition.
    train_size = 0.7
    test_size = 0.15
    val_size = 0.15
    # Fraction of the held-out pool that should end up in the test partition.
    test_temp_size = test_size / (test_size + val_size)

    # Stage 1: stratified split into the training rows and a temporary hold-out pool.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X,
        y,
        test_size=(1-train_size),
        stratify=y,
        random_state=42,
    )

    # Stage 2: divide the hold-out pool. The first pair returned by
    # train_test_split becomes the test partition, the second the validation one.
    X_test, X_val, y_test, y_val = train_test_split(
        X_temp,
        y_temp,
        test_size=(1-test_temp_size),
        stratify=y_temp,
        random_state=42,
    )

    # Report the share of rows that actually landed in each partition.
    for split_name, split_frame in (("Train", X_train), ("Test", X_test), ("Val", X_val)):
        print(f"{split_name} %:", len(split_frame)/len(X)*100)
    
    print("\nBefore SMOTENC:", y_train.value_counts())

    # Oversampling is fitted on the training partition only; the test and
    # validation partitions are left untouched.
    # SMOTENC needs the positional indices of the categorical (object-dtype) columns.
    cat_cols = list(X.select_dtypes(include='object').columns)
    categorical_indices = [X.columns.get_loc(name) for name in cat_cols]

    # Target count for the minority class: Healthy count * 1 -> 1:1 balance.
    healthy_count = int((y_train == 'Healthy').sum())
    at_risk_target = int(healthy_count * 1)

    sm = SMOTENC(
        categorical_features=categorical_indices,
        sampling_strategy={'At Risk': at_risk_target},
        random_state=42,
    )
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

    print("\nAfter SMOTENC:", y_train_resampled.value_counts())

    # Rebuild the feature frame with the original column names and cast the
    # categorical columns to str before handing the data to RIPPER.
    X_train_resampled = pd.DataFrame(X_train_resampled, columns=X.columns).astype(
        {name: str for name in cat_cols}
    )

    # RIPPER is fitted on a single frame that carries the target column too.
    train_df = pd.concat([X_train_resampled, y_train_resampled], axis=1)
    
    # Fit the RIPPER rule learner on the oversampled training frame.
    # The target column is named via class_feat and "At Risk" is the class
    # the learned rules describe.
    ripper = lw.RIPPER(random_state=42)
    fit_options = dict(class_feat=target, min_samples=2, pos_class="At Risk")
    ripper.fit(train_df, **fit_options)
    
    # PREP TEST AND VALIDATION SETS
    # Work on copies so the original split frames stay unmodified, and cast the
    # categorical columns to str to match what the training frame was given.
    X_test_clean = X_test.copy()
    X_val_clean = X_val.copy()
    for col in cat_cols:
        X_test_clean[col] = X_test_clean[col].astype(str)
        X_val_clean[col] = X_val_clean[col].astype(str)


    def _report_split(split_name, y_true, y_pred, pos_label="At Risk", neg_label="Healthy"):
        """Print the evaluation summary for one partition and return its rates.

        Parameters:
            split_name: text inserted into the section header (e.g. "TEST").
            y_true: true class labels of the partition.
            y_pred: predicted class labels, same length as ``y_true``.
            pos_label: label treated as the positive class.
            neg_label: label treated as the negative class.

        Returns:
            (cm, tpr, fpr), where ``cm`` is the confusion matrix ordered as
            [neg_label, pos_label], ``tpr`` is the positive-class recall and
            ``fpr`` the positive-class false-positive rate. A rate is NaN when
            its denominator is zero.
        """
        print(f"\n========== RIPPER {split_name} SET RESULTS ==========")
        print(classification_report(y_true, y_pred))
        print(confusion_matrix(y_true, y_pred))

        # The label order must be pinned explicitly: without `labels`,
        # scikit-learn sorts the classes ("At Risk" before "Healthy"), and
        # ravel() would then yield the counts with "Healthy" as the positive class.
        cm = confusion_matrix(y_true, y_pred, labels=[neg_label, pos_label])
        tn, fp, fn, tp = cm.ravel()
        tpr = tp / (tp + fn) if (tp + fn) else float("nan")
        fpr = fp / (fp + tn) if (fp + tn) else float("nan")

        print(f"TPR (Recall for '{pos_label}'):", tpr)
        print(f"FPR ( for '{pos_label}'):", fpr)
        return cm, tpr, fpr


    # eval
    # The predictions are used as truthy/falsy flags for the positive class and
    # mapped back to the string labels used in y_test / y_val.
    ripper_test_preds_raw = ripper.predict(X_test_clean)
    y_pred_test = ["At Risk" if p else "Healthy" for p in ripper_test_preds_raw]
    cm, tpr, fpr = _report_split("TEST", y_test, y_pred_test)

    # TPR / FPR ratio for the test set, guarded against a zero FPR.
    if fpr > 0:
        print(tpr/fpr)
    else:
        print(float("inf") if tpr > 0 else float("nan"))

    # Validation
    ripper_val_preds_raw = ripper.predict(X_val_clean)
    y_pred_val = ["At Risk" if p else "Healthy" for p in ripper_val_preds_raw]
    cm, tpr, fpr = _report_split("VALIDATION", y_val, y_pred_val)
    # Keep the individual counts available at cell scope, as before.
    tn, fp, fn, tp = cm.ravel()

    print("\n========== RIPPER RULESET ==========")
    print(ripper.ruleset_)


(3000, 16)
Train %: 69.96666666666667
Test %: 15.0
Val %: 15.033333333333335

Before SMOTENC: risk_level
Healthy    1821
At Risk     278
Name: count, dtype: int64

After SMOTENC: risk_level
Healthy    1821
At Risk    1821
Name: count, dtype: int64

              precision    recall  f1-score   support

     At Risk       0.64      1.00      0.78        60
     Healthy       1.00      0.91      0.95       390

    accuracy                           0.92       450
   macro avg       0.82      0.96      0.87       450
weighted avg       0.95      0.92      0.93       450

[[ 60   0]
 [ 34 356]]
TPR (Recall for 'At Risk'): 1.0
FPR ( for 'At Risk'): 0.08717948717948718
11.470588235294118

              precision    recall  f1-score   support

     At Risk       0.73      1.00      0.85        60
     Healthy       1.00      0.94      0.97       391

    accuracy                           0.95       451
   macro avg       0.87      0.97      0.91       451
weighted avg       0.96      0.95  