<a href="https://colab.research.google.com/github/rekhasahoo/ANEMIA/blob/main/ANEMIA_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Anemia Detection Pipeline with Type Detection from Turkey + Biochemical Rules

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# ---------------------------
# 1. Load datasets
# ---------------------------
turkey = pd.read_csv("/content/TURKEY DATASET 1 (1).csv")
bangladesh = pd.read_csv("/content/BANGLADESH DATASET (1).csv")
lucknow2 = pd.read_csv("/content/Hepcidin Data Reformated 042320 (1).csv")

# ---------------------------
# 2. Standardize column names
# ---------------------------
def clean_cols(df):
    df.columns = df.columns.str.strip().str.lower()
    return df

turkey = clean_cols(turkey)
bangladesh = clean_cols(bangladesh)
lucknow2 = clean_cols(lucknow2)

# ---------------------------
# 3. Decision column
# ---------------------------
turkey['decision'] = turkey['all_class'].apply(lambda x: 0 if x == 0 else 1)
bangladesh['decision'] = bangladesh['decision_class']
lucknow2['decision'] = lucknow2['anemic']

# ---------------------------
# 4. Select common columns + biochemical markers + decision
# ---------------------------
common_cols = ['rbc', 'hgb', 'mcv', 'mch', 'mchc', 'ferritin', 'b12', 'folate', 'decision', 'all_class']
def select_cols(df, col_list):
    cols_present = [c for c in col_list if c in df.columns]
    return df[cols_present]

turkey_sel = select_cols(turkey, common_cols)
bangladesh_sel = select_cols(bangladesh, common_cols)
lucknow2_sel = select_cols(lucknow2, common_cols)

# ---------------------------
# 5. Merge datasets
# ---------------------------
data = pd.concat([turkey_sel, bangladesh_sel, lucknow2_sel], ignore_index=True)

# ---------------------------
# 6. Convert numeric columns
# ---------------------------
numeric_cols = ['rbc', 'hgb', 'mcv', 'mch', 'mchc', 'ferritin', 'b12', 'folate']
for col in numeric_cols:
    if col in data.columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')

# ---------------------------
# 7. KNN Imputation for missing numeric values
# ---------------------------
imputer = KNNImputer(n_neighbors=5)
data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

# ---------------------------
# 8. Anemia type assignment
# ---------------------------
def assign_anemia_type(row):
    # Turkey dataset: based on All_Class
    if 'all_class' in row and not pd.isna(row['all_class']):
        if row['all_class'] == 4: return 'B12_deficiency'
        if row['all_class'] == 3: return 'Folate_deficiency'
        if row['all_class'] == 2: return 'Iron_deficiency'
        if row['all_class'] == 1: return 'HGB_deficiency'
        return 'Normal'
    # Other datasets: based on biochemical ranges
    types = []
    if 'ferritin' in row and row['ferritin'] < 20:
        types.append('Iron_deficiency')
    if 'b12' in row and row['b12'] < 200:
        types.append('B12_deficiency')
    if 'folate' in row and row['folate'] < 3:
        types.append('Folate_deficiency')
    return ','.join(types) if types else 'Normal'

data['anemia_type'] = data.apply(assign_anemia_type, axis=1)

# ---------------------------
# 9. Encode target and type
# ---------------------------
le_target = LabelEncoder()
data['decision'] = le_target.fit_transform(data['decision'])

le_type = LabelEncoder()
data['anemia_type_encoded'] = le_type.fit_transform(data['anemia_type'])

# ---------------------------
# 10. Split features & targets
# ---------------------------
X = data[numeric_cols]
y_decision = data['decision']
y_type = data['anemia_type_encoded']

X_train, X_test, y_train_dec, y_test_dec = train_test_split(
    X, y_decision, test_size=0.2, random_state=42, stratify=y_decision
)
_, _, y_train_type, y_test_type = train_test_split(
    X, y_type, test_size=0.2, random_state=42, stratify=y_type
)

# ---------------------------
# 11. Scale features
# ---------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------
# 12. ML Models for decision
# ---------------------------
ml_models = {
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

print("=== Anemia Detection (Decision) ===")
for name, model in ml_models.items():
    model.fit(X_train_scaled, y_train_dec)
    y_pred = model.predict(X_test_scaled)
    print(f"\n--- {name} ---")
    print(classification_report(y_test_dec, y_pred))

print("=== Anemia Type Prediction ===")
for name, model in ml_models.items():
    model.fit(X_train_scaled, y_train_type)
    y_pred = model.predict(X_test_scaled)
    print(f"\n--- {name} ---")
    print(classification_report(y_test_type, y_pred))

# ---------------------------
# 13. Save final dataset
# ---------------------------
data.to_csv("/content/anemia_final_dataset_with_type.csv", index=False)
print("✅ Dataset with decision & anemia_type saved at /content/anemia_final_dataset_with_type.csv")


=== Anemia Detection (Decision) ===

--- KNN ---
              precision    recall  f1-score   support

           0       0.92      0.95      0.94      2034
           1       0.92      0.86      0.89      1249

    accuracy                           0.92      3283
   macro avg       0.92      0.91      0.91      3283
weighted avg       0.92      0.92      0.92      3283


--- SVM ---
              precision    recall  f1-score   support

           0       0.93      0.96      0.95      2034
           1       0.93      0.89      0.91      1249

    accuracy                           0.93      3283
   macro avg       0.93      0.92      0.93      3283
weighted avg       0.93      0.93      0.93      3283


--- DecisionTree ---
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      2034
           1       0.89      0.89      0.89      1249

    accuracy                           0.92      3283
   macro avg       0.91      0.91      0.91 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- XGBoost ---
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      2034
           1       0.97      0.88      0.92      1249

    accuracy                           0.94      3283
   macro avg       0.95      0.93      0.94      3283
weighted avg       0.94      0.94      0.94      3283

=== Anemia Type Prediction ===

--- KNN ---
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.00      0.00      0.00        31
           2       0.14      0.04      0.06       204
           3       0.26      0.20      0.22       839
           4       0.67      0.80      0.73      2169

    accuracy                           0.58      3283
   macro avg       0.21      0.21      0.20      3283
weighted avg       0.52      0.58      0.54      3283


--- SVM ---
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



--- DecisionTree ---
              precision    recall  f1-score   support

           0       0.02      0.03      0.02        40
           1       0.00      0.00      0.00        31
           2       0.06      0.06      0.06       204
           3       0.25      0.27      0.26       839
           4       0.66      0.63      0.64      2169

    accuracy                           0.49      3283
   macro avg       0.20      0.20      0.20      3283
weighted avg       0.50      0.49      0.50      3283


--- RandomForest ---
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.00      0.00      0.00        31
           2       0.00      0.00      0.00       204
           3       0.22      0.03      0.06       839
           4       0.66      0.96      0.78      2169

    accuracy                           0.64      3283
   macro avg       0.18      0.20      0.17      3283
weighted avg       0.49      0.64

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- XGBoost ---
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.00      0.00      0.00        31
           2       0.00      0.00      0.00       204
           3       0.27      0.06      0.09       839
           4       0.66      0.94      0.78      2169

    accuracy                           0.64      3283
   macro avg       0.19      0.20      0.17      3283
weighted avg       0.51      0.64      0.54      3283

✅ Dataset with decision & anemia_type saved at /content/anemia_final_dataset_with_type.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# ---------------------------
# 1. Load cleaned dataset
# ---------------------------
data = pd.read_csv("/content/anemia_final_dataset_with_type.csv")

# ---------------------------
# 2. Define target columns
# ---------------------------
X = data.drop(columns=['decision', 'anemia_type'], errors='ignore')
y_dec = data['decision']

# Encode anemia_type (string → numeric)
le_type = LabelEncoder()
y_type = le_type.fit_transform(data['anemia_type'].astype(str))

# ---------------------------
# 3. Impute missing values
# ---------------------------
numeric_features = X.select_dtypes(include=[np.number]).columns
cat_features = X.select_dtypes(include=['object']).columns

imputer_num = SimpleImputer(strategy='median')
X[numeric_features] = imputer_num.fit_transform(X[numeric_features])

if len(cat_features) > 0:
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X[cat_features] = imputer_cat.fit_transform(X[cat_features])

# Encode categorical columns if any
for col in cat_features:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# ---------------------------
# 4. Train/test split
# ---------------------------
X_train, X_test, y_train_dec, y_test_dec, y_train_type, y_test_type = train_test_split(
    X, y_dec, y_type, test_size=0.2, random_state=42, stratify=y_dec
)

# ---------------------------
# 5. Scale features
# ---------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------
# 6. Train decision models
# ---------------------------
ml_models = {
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42, n_estimators=100),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

print("=== Anemia Decision Prediction ===")
for name, model in ml_models.items():
    model.fit(X_train_scaled, y_train_dec)
    y_pred = model.predict(X_test_scaled)
    print(f"\n--- {name} ---")
    print(classification_report(y_test_dec, y_pred))

# ---------------------------
# 7. Type prediction for anemic patients only
# ---------------------------
mask_train_anemic = y_train_dec == 1
mask_test_anemic = y_test_dec == 1

X_train_anemic = X_train_scaled[mask_train_anemic.values]
X_test_anemic = X_test_scaled[mask_test_anemic.values]

y_train_type_anemic = y_train_type[mask_train_anemic.values]
y_test_type_anemic = y_test_type[mask_test_anemic.values]

print("\n=== Anemia Type Prediction (Only Anemic Patients) ===")
for name, model in ml_models.items():
    model.fit(X_train_anemic, y_train_type_anemic)
    y_pred_type = model.predict(X_test_anemic)
    print(f"\n--- {name} ---")
    print(classification_report(
        y_test_type_anemic,
        y_pred_type,
        target_names=le_type.classes_
    ))

# ---------------------------
# 8. Save final dataset
# ---------------------------
data.to_csv("/content/anemia_final_dataset_ready_for_ml.csv", index=False)
print("✅ Dataset saved at /content/anemia_final_dataset_ready_for_ml.csv")


=== Anemia Decision Prediction ===

--- KNN ---
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2034
           1       0.99      0.97      0.98      1249

    accuracy                           0.99      3283
   macro avg       0.99      0.98      0.99      3283
weighted avg       0.99      0.99      0.99      3283


--- SVM ---
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2034
           1       1.00      0.98      0.99      1249

    accuracy                           0.99      3283
   macro avg       0.99      0.99      0.99      3283
weighted avg       0.99      0.99      0.99      3283


--- DecisionTree ---
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2034
           1       0.99      0.98      0.98      1249

    accuracy                           0.99      3283
   macro avg       0.99      0.99      0.99  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- KNN ---
                   precision    recall  f1-score   support

   B12_deficiency       1.00      1.00      1.00        30
Folate_deficiency       1.00      1.00      1.00        26
   HGB_deficiency       1.00      0.98      0.99       195
  Iron_deficiency       1.00      1.00      1.00       837
           Normal       0.99      1.00      1.00       161

         accuracy                           1.00      1249
        macro avg       1.00      1.00      1.00      1249
     weighted avg       1.00      1.00      1.00      1249


--- SVM ---
                   precision    recall  f1-score   support

   B12_deficiency       1.00      1.00      1.00        30
Folate_deficiency       1.00      1.00      1.00        26
   HGB_deficiency       1.00      1.00      1.00       195
  Iron_deficiency       1.00      1.00      1.00       837
           Normal       1.00      0.99      1.00       161

         accuracy                           1.00      1249
        macro avg       1

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Dataset saved at /content/anemia_final_dataset_ready_for_ml.csv






excluding biomarkers


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# =======================
# Load dataset
# =======================
df = pd.read_csv("/content/anemia_final_dataset_ready_for_ml.csv")

# =======================
# Features and targets
# =======================
# Drop non-feature columns
X = df.drop(columns=['decision', 'anemia_type', 'anemia_type_encoded', 'all_class'], errors='ignore')

# Target 1: Binary decision (0 = not anemic, 1 = anemic)
y_dec = df['decision']

# Target 2: Anemia type (only for anemic patients)
y_type = df['anemia_type_encoded']
df_type = df[df['decision'] == 1]   # filter only anemic
X_type = df_type.drop(columns=['decision', 'anemia_type', 'anemia_type_encoded', 'all_class'], errors='ignore')
y_type = df_type['anemia_type_encoded']

# Scale features (important for SVM/KNN)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_type_scaled = scaler.fit_transform(X_type)

# =======================
# Models
# =======================
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(kernel='rbf', probability=True),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# =======================
# 1. Binary Anemia Decision
# =======================
print("=== Anemia Decision Prediction (CV) ===\n")
for name, model in models.items():
    y_pred = cross_val_predict(model, X_scaled, y_dec, cv=cv)
    print(f"--- {name} ---")
    print(classification_report(y_dec, y_pred))
    print("\n")

# =======================
# 2. Anemia Type Prediction (only anemic patients)
# =======================
print("\n=== Anemia Type Prediction (CV, only anemic patients) ===\n")
for name, model in models.items():
    y_pred_type = cross_val_predict(model, X_type_scaled, y_type, cv=cv)
    print(f"--- {name} ---")
    print(classification_report(y_type, y_pred_type, target_names=df['anemia_type'].unique()))
    print("\n")


=== Anemia Decision Prediction (CV) ===

--- KNN ---
              precision    recall  f1-score   support

           0       0.92      0.95      0.94     10170
           1       0.92      0.87      0.89      6244

    accuracy                           0.92     16414
   macro avg       0.92      0.91      0.91     16414
weighted avg       0.92      0.92      0.92     16414



--- SVM ---
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     10170
           1       0.94      0.89      0.92      6244

    accuracy                           0.94     16414
   macro avg       0.94      0.93      0.93     16414
weighted avg       0.94      0.94      0.94     16414



--- DecisionTree ---
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     10170
           1       0.90      0.90      0.90      6244

    accuracy                           0.92     16414
   macro avg       0.92      0.92     

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- XGBoost ---
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     10170
           1       0.96      0.89      0.92      6244

    accuracy                           0.94     16414
   macro avg       0.95      0.93      0.94     16414
weighted avg       0.95      0.94      0.94     16414




=== Anemia Type Prediction (CV, only anemic patients) ===

--- KNN ---
                   precision    recall  f1-score   support

   B12_deficiency       0.37      0.20      0.26       201
Folate_deficiency       0.47      0.26      0.34       153
  Iron_deficiency       0.42      0.33      0.37      1019
   HGB_deficiency       0.80      0.88      0.84      4189
           Normal       0.87      0.77      0.82       682

         accuracy                           0.74      6244
        macro avg       0.58      0.49      0.52      6244
     weighted avg       0.72      0.74      0.73      6244



--- SVM ---
                   precision    rec

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- XGBoost ---
                   precision    recall  f1-score   support

   B12_deficiency       0.55      0.42      0.48       201
Folate_deficiency       0.59      0.51      0.55       153
  Iron_deficiency       0.53      0.35      0.42      1019
   HGB_deficiency       0.80      0.89      0.84      4189
           Normal       0.88      0.82      0.85       682

         accuracy                           0.77      6244
        macro avg       0.67      0.60      0.63      6244
     weighted avg       0.75      0.77      0.76      6244





applying smote

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# ---------------------------
# 1. Load dataset
# ---------------------------
df = pd.read_csv("/content/anemia_final_dataset_ready_for_ml.csv")

# ---------------------------
# 2. Define features and targets
# ---------------------------
X = df[['rbc','hgb','mcv','mch','mchc','b12','folate','ferritin']]
y_dec = df['decision']              # 0: Not Anemic, 1: Anemic
y_type = df['anemia_type_encoded']  # Encoded type

# ---------------------------
# 3. Impute missing values
# ---------------------------
imputer = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# ---------------------------
# 4. Scale features
# ---------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------
# 5. Define ML models
# ---------------------------
ml_models = {
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42, n_estimators=100),
    "XGBoost": xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
}

# ---------------------------
# 6. Anemia Decision Prediction with CV
# ---------------------------
print("=== Anemia Decision Prediction (CV) ===")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in ml_models.items():
    y_pred_cv = cross_val_predict(model, X_scaled, y_dec, cv=skf)
    print(f"\n--- {name} ---")
    print(classification_report(y_dec, y_pred_cv))

# ---------------------------
# 7. Anemia Type Prediction (only anemic patients)
# ---------------------------
mask_anemic = y_dec == 1
X_anemic = X_scaled[mask_anemic]
y_type_anemic = y_type[mask_anemic]

# Mapping encoded labels to type names for readability
type_mapping = dict(zip(df['anemia_type_encoded'], df['anemia_type']))
type_names = [type_mapping[i] for i in sorted(type_mapping.keys())]

print("\n=== Anemia Type Prediction (CV, only anemic patients) ===")
for name, model in ml_models.items():
    y_pred_type_cv = cross_val_predict(model, X_anemic, y_type_anemic, cv=skf)
    print(f"\n--- {name} ---")
    print(classification_report(y_type_anemic, y_pred_type_cv, target_names=type_names))

# ---------------------------
# 8. Save final dataset
# ---------------------------
df.to_csv("/content/anemia_final_dataset_ready_for_ml.csv", index=False)
print("✅ Dataset saved at /content/anemia_final_dataset_ready_for_ml.csv")


=== Anemia Decision Prediction (CV) ===

--- KNN ---
              precision    recall  f1-score   support

           0       0.92      0.95      0.94     10170
           1       0.92      0.87      0.89      6244

    accuracy                           0.92     16414
   macro avg       0.92      0.91      0.91     16414
weighted avg       0.92      0.92      0.92     16414


--- SVM ---
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     10170
           1       0.94      0.89      0.92      6244

    accuracy                           0.94     16414
   macro avg       0.94      0.93      0.93     16414
weighted avg       0.94      0.94      0.94     16414


--- DecisionTree ---
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     10170
           1       0.90      0.90      0.90      6244

    accuracy                           0.92     16414
   macro avg       0.92      0.92      0

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- XGBoost ---
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     10170
           1       0.96      0.89      0.92      6244

    accuracy                           0.94     16414
   macro avg       0.95      0.93      0.94     16414
weighted avg       0.95      0.94      0.94     16414


=== Anemia Type Prediction (CV, only anemic patients) ===

--- KNN ---
                   precision    recall  f1-score   support

   B12_deficiency       0.31      0.17      0.22       201
Folate_deficiency       0.45      0.27      0.33       153
   HGB_deficiency       0.39      0.31      0.34      1019
  Iron_deficiency       0.79      0.88      0.83      4189
           Normal       0.86      0.77      0.82       682

         accuracy                           0.73      6244
        macro avg       0.56      0.48      0.51      6244
     weighted avg       0.71      0.73      0.72      6244


--- SVM ---
                   precision    recal

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- XGBoost ---
                   precision    recall  f1-score   support

   B12_deficiency       0.55      0.42      0.48       201
Folate_deficiency       0.59      0.51      0.55       153
   HGB_deficiency       0.53      0.35      0.42      1019
  Iron_deficiency       0.80      0.89      0.84      4189
           Normal       0.88      0.82      0.85       682

         accuracy                           0.77      6244
        macro avg       0.67      0.60      0.63      6244
     weighted avg       0.75      0.77      0.76      6244

✅ Dataset saved at /content/anemia_final_dataset_ready_for_ml.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# ---------------------------
# 1. Load dataset
# ---------------------------
df = pd.read_csv("/content/anemia_final_dataset_ready_for_ml.csv")

# ---------------------------
# 2. Define features and targets
# ---------------------------
X = df[['rbc','hgb','mcv','mch','mchc','b12','folate','ferritin']]
y_dec = df['decision']              # 0: Not Anemic, 1: Anemic
y_type = df['anemia_type_encoded']  # Encoded type

# ---------------------------
# 3. Impute missing values
# ---------------------------
imputer = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# ---------------------------
# 4. Scale features
# ---------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------
# 5. Base learners for stacking
# ---------------------------
base_learners = [
    ('knn', KNeighborsClassifier()),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('svm', SVC(probability=True, random_state=42))
]

# ---------------------------
# 6. Meta-learner
# ---------------------------
meta_learner = LogisticRegression(max_iter=1000)

# ---------------------------
# 7. Stacking classifier for Anemia Decision (0/1)
# ---------------------------
stack_model_dec = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba',  # better for meta-learner learning probabilities
    n_jobs=-1
)

# Cross-validation predictions
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_dec = cross_val_predict(stack_model_dec, X_scaled, y_dec, cv=skf)

print("=== Anemia Decision Prediction (Stacking Ensemble CV) ===")
print(classification_report(y_dec, y_pred_dec))

# ---------------------------
# 8. Stacking classifier for Anemia Type (only anemic patients)
# ---------------------------
mask_anemic = y_dec == 1
X_anemic = X_scaled[mask_anemic]
y_type_anemic = y_type[mask_anemic]

stack_model_type = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

y_pred_type = cross_val_predict(stack_model_type, X_anemic, y_type_anemic, cv=skf)

# Map encoded type to readable names
type_mapping = dict(zip(df['anemia_type_encoded'], df['anemia_type']))
type_names = [type_mapping[i] for i in sorted(type_mapping.keys())]

print("\n=== Anemia Type Prediction (Stacking Ensemble CV, Only Anemic Patients) ===")
print(classification_report(y_type_anemic, y_pred_type, target_names=type_names))

# ---------------------------
# 9. Save dataset for reference
# ---------------------------
df.to_csv("/content/anemia_final_dataset_ready_for_ml.csv", index=False)
print("✅ Dataset saved at /content/anemia_final_dataset_ready_for_ml.csv")


=== Anemia Decision Prediction (Stacking Ensemble CV) ===
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     10170
           1       0.97      0.89      0.93      6244

    accuracy                           0.95     16414
   macro avg       0.95      0.94      0.94     16414
weighted avg       0.95      0.95      0.95     16414


=== Anemia Type Prediction (Stacking Ensemble CV, Only Anemic Patients) ===
                   precision    recall  f1-score   support

   B12_deficiency       0.60      0.36      0.45       201
Folate_deficiency       0.69      0.39      0.50       153
   HGB_deficiency       0.60      0.22      0.32      1019
  Iron_deficiency       0.78      0.94      0.85      4189
           Normal       0.90      0.80      0.85       682

         accuracy                           0.78      6244
        macro avg       0.71      0.54      0.59      6244
     weighted avg       0.76      0.78      0.74      6244

✅ Da

SMOTE + Undersampling → balances both small and large classes.

Class weights → forces the model to care more about rare deficiencies.

Stacking ensemble

In [None]:
# -----------------------------
# IMPORTS
# -----------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# -----------------------------
# LOAD DATA
# -----------------------------
df = pd.read_csv('/content/anemia_final_dataset_ready_for_ml.csv')

# FEATURES
features = ['rbc','hgb','mcv','mch','mchc','b12','folate','ferritin']

# TARGETS
target_decision = 'decision'
target_type = 'anemia_type_encoded'

# -----------------------------
# SPLIT DATA (train/test)
# -----------------------------
X_train, X_test, y_train_dec, y_test_dec = train_test_split(
    df[features], df[target_decision], test_size=0.2, random_state=42, stratify=df[target_decision]
)

X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(
    df[features], df[target_type], test_size=0.2, random_state=42, stratify=df[target_type]
)

# -----------------------------
# BALANCING (SMOTE + Undersampling)
# -----------------------------
over = SMOTE(random_state=42)
under = RandomUnderSampler(random_state=42)

# Decision target (binary)
X_train_dec_bal, y_train_dec_bal = Pipeline([
    ('over', over),
    ('under', under)
]).fit_resample(X_train, y_train_dec)

# Type target (multiclass)
X_train_type_bal, y_train_type_bal = Pipeline([
    ('over', over),
    ('under', under)
]).fit_resample(X_train_type, y_train_type)

# -----------------------------
# FEATURE SCALING
# -----------------------------
scaler = StandardScaler()
X_train_dec_scaled = scaler.fit_transform(X_train_dec_bal)
X_test_dec_scaled = scaler.transform(X_test)

X_train_type_scaled = scaler.fit_transform(X_train_type_bal)
X_test_type_scaled = scaler.transform(X_test_type)

# -----------------------------
# STACKING ENSEMBLE with CLASS WEIGHTS
# -----------------------------
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)),
    ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
    ('knn', KNeighborsClassifier()),  # no class_weight here
    ('svc', SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42))
]

# --- Anemia Decision (Binary)
stack_clf_dec = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(class_weight='balanced', max_iter=1000),
    cv=5
)
stack_clf_dec.fit(X_train_dec_scaled, y_train_dec_bal)
y_pred_dec = stack_clf_dec.predict(X_test_dec_scaled)
print("=== Anemia Decision Prediction (Stacking Ensemble, Balanced) ===")
print(classification_report(y_test_dec, y_pred_dec))

# --- Anemia Type (Multiclass)
stack_clf_type = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(class_weight='balanced', max_iter=1000),
    cv=5
)
stack_clf_type.fit(X_train_type_scaled, y_train_type_bal)
y_pred_type = stack_clf_type.predict(X_test_type_scaled)
print("=== Anemia Type Prediction (Stacking Ensemble, Balanced) ===")
print(classification_report(y_test_type, y_pred_type))


=== Anemia Decision Prediction (Stacking Ensemble, Balanced) ===
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2034
           1       0.94      0.89      0.91      1249

    accuracy                           0.94      3283
   macro avg       0.94      0.93      0.93      3283
weighted avg       0.94      0.94      0.94      3283

=== Anemia Type Prediction (Stacking Ensemble, Balanced) ===
              precision    recall  f1-score   support

           0       0.29      0.42      0.35        40
           1       0.44      0.39      0.41        31
           2       0.34      0.46      0.39       204
           3       0.80      0.74      0.77       839
           4       0.95      0.94      0.94      2169

    accuracy                           0.85      3283
   macro avg       0.57      0.59      0.57      3283
weighted avg       0.86      0.85      0.85      3283



In [None]:
# -----------------------------
# IMPORTS
# -----------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Import New Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    VotingClassifier, # <-- New Ensemble
    StackingClassifier
)
# You may need to install these: pip install xgboost lightgbm
try:
    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier
    HAVE_BOOSTING = True
except ImportError:
    HAVE_BOOSTING = False
    print("XGBoost/LightGBM not installed. Skipping those models.")
    print("To install: pip install xgboost lightgbm")

# Import Imbalanced-Learn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# -----------------------------
# LOAD DATA
# -----------------------------
try:
    df = pd.read_csv('/content/anemia_final_dataset_ready_for_ml.csv')
except FileNotFoundError:
    print("Error: Dataset file not found.")
    print("Please ensure '/content/anemia_final_dataset_ready_for_ml.csv' is uploaded.")
    exit()

# FEATURES
features = ['rbc', 'hgb', 'mcv', 'mch', 'mchc', 'b12', 'folate', 'ferritin']
# TARGETS
target_decision = 'decision'
target_type = 'anemia_type_encoded'

# -----------------------------
# 1. SIMPLIFIED DATA SPLIT
# -----------------------------
# We only need one X split, as the features are the same for both tasks.
X = df[features]
y_dec = df[target_decision]
y_type = df[target_type]

# Split for Anemia Decision (Binary)
X_train, X_test, y_train_dec, y_test_dec = train_test_split(
    X, y_dec, test_size=0.2, random_state=42, stratify=y_dec
)

# Split for Anemia Type (Multiclass)
# We use the *same* X_train/X_test from the split above for consistency.
# We just need to get the corresponding y_type labels for those splits.
y_train_type = y_type.loc[X_train.index]
y_test_type = y_type.loc[X_test.index]

print(f"Original X_train shape: {X_train.shape}")
print(f"Original X_test shape: {X_test.shape}")
print("-" * 30)

# -----------------------------
# 2. FEATURE SCALING (THE CORRECT WAY)
# -----------------------------
# Fit the scaler ONLY on the original training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Transform the test data using the scaler fit on the training data
X_test_scaled = scaler.transform(X_test)

# -----------------------------
# 3. BALANCING (Applied AFTER scaling)
# -----------------------------
# Define the resampling strategy
over = SMOTE(random_state=42, k_neighbors=3) # k_neighbors=3 (or less) may be needed if a class is very small
under = RandomUnderSampler(random_state=42)
balancing_pipeline = ImbPipeline([('over', over), ('under', under)])

# Apply balancing only to the (scaled) training sets
print("Balancing binary 'decision' target...")
X_train_dec_bal, y_train_dec_bal = balancing_pipeline.fit_resample(X_train_scaled, y_train_dec)

print("Balancing multiclass 'type' target...")
X_train_type_bal, y_train_type_bal = balancing_pipeline.fit_resample(X_train_scaled, y_train_type)

print(f"Balanced X_train (decision): {X_train_dec_bal.shape}")
print(f"Balanced X_train (type): {X_train_type_bal.shape}")
print("-" * 30)


# -----------------------------
# 4. SECTION 1: INDIVIDUAL MODELS
# -----------------------------
# We are resampling, so we don't need 'class_weight'
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Support Vector (RBF)": SVC(kernel='rbf', random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

if HAVE_BOOSTING:
    models["XGBoost"] = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
    models["LightGBM"] = LGBMClassifier(random_state=42, verbose=-1)

# --- Train & Evaluate on Anemia Decision (Binary) ---
print("=" * 30)
print("  INDIVIDUAL MODELS: ANEMIA DECISION (BINARY)  ")
print("=" * 30)
for name, model in models.items():
    print(f"\n--- Training {name} ---")
    model.fit(X_train_dec_bal, y_train_dec_bal)
    y_pred = model.predict(X_test_scaled)
    print(classification_report(y_test_dec, y_pred))

# --- Train & Evaluate on Anemia Type (Multiclass) ---
print("\n" + "=" * 30)
print("  INDIVIDUAL MODELS: ANEMIA TYPE (MULTICLASS)  ")
print("=" * 30)
for name, model in models.items():
    print(f"\n--- Training {name} ---")
    model.fit(X_train_type_bal, y_train_type_bal)
    y_pred = model.predict(X_test_scaled)
    print(classification_report(y_test_type, y_pred, zero_division=0))


# -----------------------------
# 5. SECTION 2: ENSEMBLE LEARNING (VOTING)
# -----------------------------
# A Voting ensemble is simpler than Stacking. It just averages predictions.
# We'll use a few good, but different, models.
estimators_voting = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('svc', SVC(kernel='rbf', probability=True, random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42))
]

# --- Voting: Anemia Decision (Binary) ---
print("\n" + "=" * 30)
print("  VOTING ENSEMBLE: ANEMIA DECISION (BINARY)  ")
print("=" * 30)
voting_clf_dec = VotingClassifier(estimators=estimators_voting, voting='soft')
voting_clf_dec.fit(X_train_dec_bal, y_train_dec_bal)
y_pred_voting_dec = voting_clf_dec.predict(X_test_scaled)
print(classification_report(y_test_dec, y_pred_voting_dec))

# --- Voting: Anemia Type (Multiclass) ---
print("\n" + "=" * 30)
print("  VOTING ENSEMBLE: ANEMIA TYPE (MULTICLASS)  ")
print("=" * 30)
voting_clf_type = VotingClassifier(estimators=estimators_voting, voting='soft')
voting_clf_type.fit(X_train_type_bal, y_train_type_bal)
y_pred_voting_type = voting_clf_type.predict(X_test_scaled)
print(classification_report(y_test_type, y_pred_voting_type, zero_division=0))


# -----------------------------
# 6. SECTION 3: ENSEMBLE LEARNING (STACKING)
# -----------------------------
# This is your original code, now using the correctly scaled/balanced data.
# Note: I removed 'class_weight' as we are already balancing with SMOTE/RUS.
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier()),
    ('svc', SVC(kernel='rbf', probability=True, random_state=42))
]

# --- Stacking: Anemia Decision (Binary) ---
print("\n" + "=" * 30)
print("  STACKING ENSEMBLE: ANEMIA DECISION (BINARY)  ")
print("=" * 30)
stack_clf_dec = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5
)
stack_clf_dec.fit(X_train_dec_bal, y_train_dec_bal)
y_pred_dec = stack_clf_dec.predict(X_test_scaled)
print(classification_report(y_test_dec, y_pred_dec))

# --- Stacking: Anemia Type (Multiclass) ---
print("\n" + "=" * 30)
print("  STACKING ENSEMBLE: ANEMIA TYPE (MULTICLASS)  ")
print("=" * 30)
stack_clf_type = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5
)
stack_clf_type.fit(X_train_type_bal, y_train_type_bal)
y_pred_type = stack_clf_type.predict(X_test_scaled)
print(classification_report(y_test_type, y_pred_type, zero_division=0))

print("\n=== Model Training Complete ===")


Original X_train shape: (13131, 8)
Original X_test shape: (3283, 8)
------------------------------
Balancing binary 'decision' target...
Balancing multiclass 'type' target...
Balanced X_train (decision): (16272, 8)
Balanced X_train (type): (43250, 8)
------------------------------
  INDIVIDUAL MODELS: ANEMIA DECISION (BINARY)  

--- Training Logistic Regression ---
              precision    recall  f1-score   support

           0       0.94      0.89      0.91      2034
           1       0.84      0.90      0.87      1249

    accuracy                           0.90      3283
   macro avg       0.89      0.90      0.89      3283
weighted avg       0.90      0.90      0.90      3283


--- Training K-Nearest Neighbors ---
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      2034
           1       0.86      0.89      0.87      1249

    accuracy                           0.90      3283
   macro avg       0.89      0.90      0.90      

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.94      0.97      0.95      2034
           1       0.94      0.89      0.91      1249

    accuracy                           0.94      3283
   macro avg       0.94      0.93      0.93      3283
weighted avg       0.94      0.94      0.94      3283


--- Training LightGBM ---




              precision    recall  f1-score   support

           0       0.93      0.98      0.95      2034
           1       0.96      0.88      0.92      1249

    accuracy                           0.94      3283
   macro avg       0.95      0.93      0.94      3283
weighted avg       0.94      0.94      0.94      3283


  INDIVIDUAL MODELS: ANEMIA TYPE (MULTICLASS)  

--- Training Logistic Regression ---
              precision    recall  f1-score   support

           0       0.11      0.83      0.20        30
           1       0.17      1.00      0.30        26
           2       0.27      0.52      0.36       195
           3       0.73      0.58      0.65       837
           4       0.96      0.83      0.89      2195

    accuracy                           0.75      3283
   macro avg       0.45      0.75      0.48      3283
weighted avg       0.85      0.75      0.78      3283


--- Training K-Nearest Neighbors ---
              precision    recall  f1-score   support

    

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.23      0.60      0.34        30
           1       0.35      0.73      0.47        26
           2       0.30      0.45      0.36       195
           3       0.82      0.69      0.75       837
           4       0.95      0.93      0.94      2195

    accuracy                           0.84      3283
   macro avg       0.53      0.68      0.57      3283
weighted avg       0.87      0.84      0.85      3283


--- Training LightGBM ---




              precision    recall  f1-score   support

           0       0.20      0.60      0.30        30
           1       0.35      0.69      0.47        26
           2       0.31      0.49      0.38       195
           3       0.83      0.68      0.75       837
           4       0.95      0.93      0.94      2195

    accuracy                           0.84      3283
   macro avg       0.53      0.68      0.57      3283
weighted avg       0.87      0.84      0.85      3283


  VOTING ENSEMBLE: ANEMIA DECISION (BINARY)  
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2034
           1       0.95      0.89      0.92      1249

    accuracy                           0.94      3283
   macro avg       0.94      0.93      0.93      3283
weighted avg       0.94      0.94      0.94      3283


  VOTING ENSEMBLE: ANEMIA TYPE (MULTICLASS)  
              precision    recall  f1-score   support

           0       0.17      0.77  

In [None]:
!pip install xgboost lightgbm catboost




In [None]:
# -----------------------------
# SECTION 1: IMPORTS
# -----------------------------
import pandas as pd
import numpy as np
import warnings
from collections import Counter

# --- Preprocessing & Metrics ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE # <-- For Feature Selection
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# --- Imbalanced-Learn ---
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek # <-- Advanced Balancing
from imblearn.pipeline import Pipeline as ImbPipeline

# --- Base Models ---
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# --- Ensemble Models ---
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    StackingClassifier
)

# --- Boosting Libraries (Import with aliases) ---
try:
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
    HAVE_BOOSTING = True
except ImportError:
    HAVE_BOOSTING = False
    print("Warning: XGBoost, LightGBM, or CatBoost not installed. Skipping some models.")
    print("To install: pip install xgboost lightgbm catboost")

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# -----------------------------
# SECTION 2: HELPER FUNCTIONS
# -----------------------------

def get_model_list():
    """Returns your custom list of (name, model) tuples."""
    model_list = [
        ('knn', KNeighborsClassifier(n_neighbors=7)),
        ('svm', SVC(probability=True, kernel='rbf', random_state=42)),
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=150, random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('mlp', MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42))
    ]

    if HAVE_BOOSTING:
        model_list.extend([
            ('xgb', xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)),
            ('lgb', lgb.LGBMClassifier(random_state=42, verbose=-1)),
            ('cat', cb.CatBoostClassifier(verbose=0, random_state=42))
        ])
    else:
        print("---! Boosted models (XGB, LGB, Cat) not found !---")

    return model_list

def print_all_metrics(y_true, y_pred, model_name, average_type):
    """Prints a formatted block of classification metrics."""
    acc = accuracy_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred, average=average_type, zero_division=0)
    rec = recall_score(y_true, y_pred, average=average_type, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=average_type, zero_division=0)

    print(f"{model_name} Metrics ({average_type} average):")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {pre:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print("-" * 20)

# -----------------------------
# SECTION 3: LOAD DATA
# -----------------------------
try:
    df = pd.read_csv('/content/anemia_final_dataset_ready_for_ml.csv')
except FileNotFoundError:
    print("Error: Dataset file not found.")
    print("Please ensure '/content/anemia_final_dataset_ready_for_ml.csv' is uploaded.")
    exit()

# -----------------------------
# SECTION 4: ADVANCED FEATURE ENGINEERING
# -----------------------------
print("\n--- Starting Feature Engineering ---")
# Use .replace(0, 1e-6) to avoid division by zero
df['hgb_to_rbc_ratio'] = df['hgb'] / (df['rbc'].replace(0, 1e-6))
df['mcv_mch_ratio'] = df['mcv'] / (df['mch'].replace(0, 1e-6))
df['b12_folate_ratio'] = df['b12'] / (df['folate'].replace(0, 1e-6))
df['ferritin_to_hgb'] = df['ferritin'] / (df['hgb'].replace(0, 1e-6))

# Define the new, complete feature list
original_features = ['rbc', 'hgb', 'mcv', 'mch', 'mchc', 'b12', 'folate', 'ferritin']
ratio_features = ['hgb_to_rbc_ratio', 'mcv_mch_ratio', 'b12_folate_ratio', 'ferritin_to_hgb']
features_all = original_features + ratio_features

print(f"Created {len(ratio_features)} new features. Total features: {len(features_all)}")

# -----------------------------
# SECTION 5: DEFINE X AND Y
# -----------------------------
X = df[features_all]
y_dec = df['decision']
y_type = df['anemia_type_encoded']

# -----------------------------
# SECTION 6: TRAIN/TEST SPLIT
# -----------------------------
# This is the most important step for preventing data leakage.
X_train, X_test, y_train_dec, y_test_dec = train_test_split(
    X, y_dec, test_size=0.2, random_state=42, stratify=y_dec
)

y_train_type = y_type.loc[X_train.index]
y_test_type = y_type.loc[X_test.index]

print(f"\nOriginal X_train shape: {X_train.shape}")
print(f"Original X_test shape: {X_test.shape}")

# -----------------------------
# SECTION 7: FEATURE SELECTION (RFE)
# -----------------------------
print("\n--- Running RFE to select top features ---")
# RFE will find the best 8 features from our total list of 12
rfe = RFE(
    estimator=RandomForestClassifier(n_estimators=50, random_state=42),
    n_features_to_select=8, # We'll select the best 8
    step=1
)

# Fit RFE *only* on the training data
rfe.fit(X_train, y_train_dec)

# Get the names of the selected features
selected_features_mask = rfe.support_
selected_feature_names = X_train.columns[selected_features_mask]
print(f"Top 8 selected features: {list(selected_feature_names)}")

# Transform our train and test sets to use *only* these features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# -----------------------------
# SECTION 8: FEATURE SCALING
# -----------------------------
print("\n--- Scaling selected features ---")
scaler = StandardScaler()
# Fit the scaler *only* on the RFE-transformed training data
X_train_scaled = scaler.fit_transform(X_train_rfe)
# Transform the test data
X_test_scaled = scaler.transform(X_test_rfe)

# -----------------------------
# SECTION 9: ADVANCED BALANCING
# -----------------------------
# We'll use SMOTE-Tomek, which oversamples and then cleans noisy data
balancing_pipeline_smote_tomek = SMOTETomek(
    random_state=42,
    smote=SMOTE(random_state=42, k_neighbors=3), # k=3 for small classes
    tomek=None # Use default TomekLinks
)

# --- Balancing for Binary (Decision) Task ---
print("\nBalancing binary 'decision' target with SMOTE-Tomek...")
X_train_dec_bal, y_train_dec_bal = balancing_pipeline_smote_tomek.fit_resample(
    X_train_scaled, y_train_dec
)
print(f"Decision - Original: {Counter(y_train_dec)} | Balanced: {Counter(y_train_dec_bal)}")

# --- Balancing for Multiclass (Type) Task ---
print("Balancing multiclass 'type' target with SMOTE-Tomek...")
X_train_type_bal, y_train_type_bal = balancing_pipeline_smote_tomek.fit_resample(
    X_train_scaled, y_train_type
)
print(f"Type - Original: {Counter(y_train_type)} | Balanced: {Counter(y_train_type_bal)}")

# -----------------------------
# SECTION 10: INDIVIDUAL MODELS (LOOP)
# -----------------------------
models_to_test = dict(get_model_list())

print("\n" + "=" * 30)
print("  INDIVIDUAL MODELS: ANEMIA DECISION (BINARY)  ")
print("=" * 30)
for name, model in models_to_test.items():
    print(f"\n--- Training {name} ---")
    model.fit(X_train_dec_bal, y_train_dec_bal)
    y_pred = model.predict(X_test_scaled)

    print(classification_report(y_test_dec, y_pred, zero_division=0))
    print_all_metrics(y_test_dec, y_pred, name, average_type='weighted')

print("\n" + "=" * 30)
print("  INDIVIDUAL MODELS: ANEMIA TYPE (MULTICLASS)  ")
print("=" * 30)
for name, model in models_to_test.items():
    print(f"\n--- Training {name} ---")
    model.fit(X_train_type_bal, y_train_type_bal)
    y_pred = model.predict(X_test_scaled)

    print(classification_report(y_test_type, y_pred, zero_division=0))
    print_all_metrics(y_test_type, y_pred, name, average_type='weighted')


# -----------------------------
# SECTION 11: STACKING ENSEMBLE (FINAL MODEL)
# -----------------------------
# Get a fresh list of model objects for the stack
base_learners = get_model_list()

# --- Stacking: Anemia Decision (Binary) ---
print("\n" + "=" * 30)
print("  STACKING ENSEMBLE: ANEMIA DECISION (BINARY)  ")
print("=" * 30)
stack_clf_dec = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    n_jobs=-1
)
stack_clf_dec.fit(X_train_dec_bal, y_train_dec_bal)
y_pred_dec = stack_clf_dec.predict(X_test_scaled)

print(classification_report(y_test_dec, y_pred_dec, zero_division=0))
print_all_metrics(y_test_dec, y_pred_dec, "Stacking Ensemble", average_type='weighted')

# --- Stacking: Anemia Type (Multiclass) ---
print("\n" + "=" * 30)
print("  STACKING ENSEMBLE: ANEMIA TYPE (MULTICLASS)  ")
print("=" * 30)
stack_clf_type = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    n_jobs=-1
)
stack_clf_type.fit(X_train_type_bal, y_train_type_bal)
y_pred_type = stack_clf_type.predict(X_test_scaled)

print(classification_report(y_test_type, y_pred_type, zero_division=0))
print_all_metrics(y_test_type, y_pred_type, "Stacking Ensemble", average_type='weighted')

print("\n=== All Model Training and Evaluation Complete ===")


--- Starting Feature Engineering ---
Created 4 new features. Total features: 12

Original X_train shape: (13131, 12)
Original X_test shape: (3283, 12)

--- Running RFE to select top features ---
Top 8 selected features: ['rbc', 'hgb', 'mcv', 'mch', 'b12', 'hgb_to_rbc_ratio', 'mcv_mch_ratio', 'ferritin_to_hgb']

--- Scaling selected features ---

Balancing binary 'decision' target with SMOTE-Tomek...
Decision - Original: Counter({0: 8136, 1: 4995}) | Balanced: Counter({0: 7955, 1: 7955})
Balancing multiclass 'type' target with SMOTE-Tomek...
Type - Original: Counter({4: 8650, 3: 3358, 2: 824, 0: 172, 1: 127}) | Balanced: Counter({0: 8650, 1: 8648, 2: 8601, 4: 8561, 3: 8560})

  INDIVIDUAL MODELS: ANEMIA DECISION (BINARY)  

--- Training knn ---
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      2034
           1       0.88      0.90      0.89      1249

    accuracy                           0.91      3283
   macro avg       0.91    