# Naive Bayes

In [2]:
# read the saved joblib model
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Single place to repoint the notebook when the project lives somewhere else.
PROJECT_ROOT = '/usr4/ugrad/spuduch/zebraMD'
MODEL_DIR = f'{PROJECT_ROOT}/Model'
DATA_DIR = f'{PROJECT_ROOT}/data'

# NOTE: joblib.load unpickles its input, which can execute arbitrary code;
# only load artifacts that were produced by this project.
clf = joblib.load(f'{MODEL_DIR}/naive_bayes_model.pkl')
le = joblib.load(f'{MODEL_DIR}/label_mapping.joblib')

# Test data: every column except 'prognosis' is treated as a feature.
test_df = pd.read_csv(f'{DATA_DIR}/test_data.csv')
X_test = test_df.drop(columns=['prognosis'])
y_test = test_df['prognosis']

In [3]:
# `le` is loaded as {disease name -> class index}; the cells below need the
# inverse {class index -> disease name}. Only invert while the keys are still
# the string names, so re-running this cell does not flip the mapping back.
if all(isinstance(key, str) for key in le):
    le = {v: k for k, v in le.items()}
le

{0: '(vertigo) Paroymsal  Positional Vertigo',
 1: 'AIDS',
 2: 'Acne',
 3: 'Alcoholic hepatitis',
 4: 'Allergy',
 5: 'Arthritis',
 6: 'Bronchial Asthma',
 7: 'Cervical spondylosis',
 8: 'Chicken pox',
 9: 'Chronic cholestasis',
 10: 'Common Cold',
 11: 'Dengue',
 12: 'Diabetes ',
 13: 'Dimorphic hemmorhoids(piles)',
 14: 'Drug Reaction',
 15: 'Fungal infection',
 16: 'GERD',
 17: 'Gastroenteritis',
 18: 'Heart attack',
 19: 'Hepatitis B',
 20: 'Hepatitis C',
 21: 'Hepatitis D',
 22: 'Hepatitis E',
 23: 'Hypertension ',
 24: 'Hyperthyroidism',
 25: 'Hypoglycemia',
 26: 'Hypothyroidism',
 27: 'Impetigo',
 28: 'Jaundice',
 29: 'Malaria',
 30: 'Migraine',
 31: 'Osteoarthristis',
 32: 'Paralysis (brain hemorrhage)',
 33: 'Peptic ulcer diseae',
 34: 'Pneumonia',
 35: 'Psoriasis',
 36: 'Tuberculosis',
 37: 'Typhoid',
 38: 'Urinary tract infection',
 39: 'Varicose veins',
 40: 'hepatitis A'}

In [4]:
# Class labels known to the fitted Naive Bayes model
classes = clf.classes_

# Feature names taken from the test frame's columns
feature_names = X_test.columns

# Log-probabilities of each feature per class, indexed by class position below
# and used as the "importance" score for the Naive Bayes model
feature_importance = clf.feature_log_prob_

def print_top_features(label_index, label_name, top_n=10, *,
                       log_probs=None, names=None, label_map=None):
    """Print the ``top_n`` features with the highest probability for one class.

    Args:
        label_index: Row of the log-probability matrix to rank.
        label_name: Key looked up in the label mapping for the heading; it is
            shown as-is when the mapping has no entry for it.
        top_n: How many features to list; ``0`` lists none.
        log_probs: Optional 2-D array of per-class feature log-probabilities.
            Defaults to the notebook-level ``feature_importance``.
        names: Optional feature names aligned with the columns of
            ``log_probs``. Defaults to the notebook-level ``feature_names``.
        label_map: Optional ``{label: display name}`` dict. Defaults to the
            notebook-level ``le``.

    Note:
        Features that share the same log-probability are tied, so their
        relative order at the tail of the list carries no meaning.
    """
    # Fall back to the notebook globals only when nothing was passed in, so
    # existing ``print_top_features(i, label)`` calls behave as before.
    if log_probs is None:
        log_probs = feature_importance
    if names is None:
        names = feature_names
    if label_map is None:
        label_map = le

    class_log_probs = np.asarray(log_probs)[label_index]

    # Rank best-first, then cut. Slicing the tail with ``[-top_n:]`` would
    # return *every* feature when top_n == 0.
    ranked_idx = np.argsort(class_log_probs)[::-1][:top_n]

    print(f"\nTop {top_n} Important Features for {label_map.get(label_name, label_name)}:")
    for rank, idx in enumerate(ranked_idx, 1):
        # exp() turns the stored log-probability back into a probability.
        print(f"{rank}. {names[idx]}: {np.exp(class_log_probs[idx]):.4f}")

# Report the top features of every class, in the classifier's class order
for i in range(len(classes)):
    label = classes[i]
    print_top_features(label_index=i, label_name=label)


Top 10 Important Features for (vertigo) Paroymsal  Positional Vertigo:
1. unsteadiness: 0.9592
2. nausea: 0.9490
3. loss_of_balance: 0.9490
4. vomiting: 0.9388
5. headache: 0.9286
6. spinning_movements: 0.8980
7. loss_of_appetite: 0.0102
8. fluid_overload: 0.0102
9. acute_liver_failure: 0.0102
10. yellowing_of_eyes: 0.0102

Top 10 Important Features for AIDS:
1. high_fever: 0.9388
2. muscle_wasting: 0.8878
3. patches_in_throat: 0.8878
4. extra_marital_contacts: 0.8776
5. malaise: 0.0102
6. swelled_lymph_nodes: 0.0102
7. swelling_of_stomach: 0.0102
8. fluid_overload: 0.0102
9. acute_liver_failure: 0.0102
10. blurred_and_distorted_vision: 0.0102

Top 10 Important Features for Acne:
1. skin_rash: 0.9388
2. blackheads: 0.8980
3. pus_filled_pimples: 0.8878
4. scurring: 0.8776
5. blurred_and_distorted_vision: 0.0102
6. swelled_lymph_nodes: 0.0102
7. swelling_of_stomach: 0.0102
8. fluid_overload: 0.0102
9. acute_liver_failure: 0.0102
10. yellowing_of_eyes: 0.0102

Top 10 Important Features f

# Logistic Regression

In [5]:
# Load the saved logistic-regression model from disk
lr_model_path = '/usr4/ugrad/spuduch/zebraMD/Model/l1_lr_model.joblib'
lr_clf = joblib.load(lr_model_path)

In [9]:
# One fitted estimator per position; its position is used as the key into `le`.
for i, estimator in enumerate(lr_clf.estimators_):
    # Guard: skip estimators whose index has no entry in the label mapping
    if i not in le:
        print(f"Error: No label found for estimator index {i}")
        continue

    class_label = le[i]
    print(f"\nTop 10 feature importances for class {class_label}:")

    # Flatten so the coefficients line up one-to-one with feature_names
    coef = estimator.coef_.ravel()

    # Guard: the coefficient vector must match the feature list in length
    if len(coef) != len(feature_names):
        print(f"Error: Mismatch between number of features ({len(feature_names)}) and coefficients ({len(coef)})")
        continue

    # Rank by absolute coefficient, then show only the signed value
    importances = (
        pd.DataFrame({'feature': feature_names, 'importance': np.abs(coef), 'coefficient': coef})
        .sort_values('importance', ascending=False)
        .drop(columns=['importance'])
    )
    print(importances.head(10))



Top 10 feature importances for class (vertigo) Paroymsal  Positional Vertigo:
                       feature  coefficient
86                unsteadiness    15.797178
84          spinning_movements    14.155595
98           altered_sensorium    -8.646448
102        dischromic _patches    -7.231999
109      lack_of_concentration    -7.179617
92            passage_of_gases    -6.407541
87   weakness_of_one_body_side    -6.304217
123                 blackheads    -6.253143
12         burning_micturition    -5.804917
7                 stomach_pain    -5.552244

Top 10 feature importances for class AIDS:
                    feature  coefficient
10           muscle_wasting    11.026896
22        patches_in_throat     9.741696
98        altered_sensorium    -7.065570
102     dischromic _patches    -6.412482
123              blackheads    -6.079222
75   extra_marital_contacts     6.037882
122      pus_filled_pimples    -5.865640
4                 shivering    -5.727607
12      burning_micturit

Again, the feature importances are reasonable and very similar to those from Naive Bayes.

# XGBoost

In [12]:
# Load the saved XGBoost model from disk
xgb_model_path = '/usr4/ugrad/spuduch/zebraMD/Model/xgboost_model.joblib'
xgb_clf = joblib.load(xgb_model_path)


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40])

In [17]:
feature_names = X_test.columns

# One importance score per feature for the whole model (not per class)
feature_importance = xgb_clf.feature_importances_

# Pair every feature name with its score
feature_importance_tuples = [
    (name, score) for name, score in zip(feature_names, feature_importance)
]

# Highest score first; work on a copy so the unsorted pairs stay available
sorted_importance = list(feature_importance_tuples)
sorted_importance.sort(key=lambda pair: pair[1], reverse=True)

def print_top_features(top_n=10, ranked=None):
    """Print the ``top_n`` leading features of the XGBoost model.

    Args:
        top_n: Number of leading entries to print.
        ranked: Optional sequence of ``(feature, importance)`` pairs that is
            already ordered best-first. Defaults to the notebook-level
            ``sorted_importance`` so existing ``print_top_features()`` calls
            keep working.

    NOTE(review): this reuses the name of the Naive Bayes helper defined
    earlier in the notebook and replaces it once this cell has run.
    """
    if ranked is None:
        ranked = sorted_importance

    print(f"\nTop {top_n} Important Features for XGBoost Model:")
    for i, (feature, importance) in enumerate(ranked[:top_n], 1):
        print(f"{i}. {feature}: {importance:.4f}")

# Show the ten highest-ranked features
print_top_features(top_n=10)



Top 10 Important Features for XGBoost Model:
1. polyuria: 0.0790
2. history_of_alcohol_consumption: 0.0596
3. bruising: 0.0582
4. continuous_feel_of_urine: 0.0408
5. silver_like_dusting: 0.0384
6. mucoid_sputum: 0.0383
7. internal_itching: 0.0358
8. hip_joint_pain: 0.0329
9. spinning_movements: 0.0316
10. movement_stiffness: 0.0289


Since I trained a single XGBoost model, I can't get feature importances specific to each class, only one overall ranking, so it's not really helpful to know these feature importances.