In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb  # For the XGBoost model
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
import random
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report)
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder



### 1. Data Preparation

Convert categorical and other non-numeric data to numerical data,
since most ML algorithms accept only numerical input.

In [2]:
# Load the Adult dataset and build the feature matrix X and target vector y.

# Location of the CSV. This is a machine-specific absolute path; it is kept in
# one named constant so it can be changed in a single place.
ADULT_CSV_PATH = "C:\\Users\\pixal\\Desktop\\WPy64-312101\\notebooks\\adult\\adult_with_heading.csv"

adult_tmp = pd.read_csv(ADULT_CSV_PATH)

# Single numeric feature: sum of the capital-gain and capital-loss columns.
adult_tmp['cap-gain-loss'] = adult_tmp['capital-gain'] + adult_tmp['capital-loss']

# Build the predictors (X).
# Strip surrounding whitespace in code rather than by hand-editing the CSV, so
# the category merge below also works on a file whose values still carry a
# leading space. This is a no-op on already-clean data.
mar_cat = adult_tmp['marital-status'].str.strip().to_frame()

# Collapse the three "Married-*" categories into a single "Married" level.
mar_cat = mar_cat.replace({
    'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Married',
})

# One-hot encode marital status and put it next to the numeric feature.
dummies = pd.get_dummies(mar_cat['marital-status'], dtype='uint8')
X = pd.concat((adult_tmp['cap-gain-loss'], dummies), axis=1)

# Build the target (y). LabelEncoder works on a one-dimensional target only.
le_class = LabelEncoder()
y = le_class.fit_transform(adult_tmp['class'])

# Show which integer was assigned to each class label.
print("Class of y :", le_class.classes_)
print(f"Number for <=50K : {le_class.transform(['<=50K'])}")
print(f"Number for >50K : {le_class.transform(['>50K'])}")

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Class of y : ['<=50K' '>50K']
Number for <=50K : [0]
'Number for >50K : [1]


In [3]:
# evaluate model
def evaluate_model(model, model_name):
    """Fit ``model`` on the training split and print its test-split metrics.

    The data is taken from the notebook-level variables ``X_train``,
    ``y_train``, ``X_test`` and ``y_test``, so the data-preparation cell must
    have been run first.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    model_name : str
        Label used in the printed section header.

    Returns
    -------
    None
        Results are printed: a header, the accuracy rounded to two decimals,
        and the classification report.
    """
    # Train on the training split, then predict the held-out rows.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print(f"\n--- {model_name} Evaluation ---")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    report = classification_report(y_test, predictions)
    print(report)

# Train and report each candidate classifier on the same train/test split.

# Decision tree using Gini impurity, limited to 5 leaf nodes.
# random_state is fixed because scikit-learn permutes the features at every
# split; without it, ties between equally good splits can resolve differently
# from run to run.
dtc5_model = DecisionTreeClassifier(criterion="gini", max_leaf_nodes=5, random_state=42)
evaluate_model(dtc5_model, "Decision Tree - gini")

# Same tree size, but splits are chosen by entropy (information gain).
dt_model = DecisionTreeClassifier(criterion="entropy", max_leaf_nodes=5, random_state=42)
evaluate_model(dt_model, "Decision Tree - entropy")

# Gaussian Naive Bayes classifier.
nb_model = GaussianNB()
evaluate_model(nb_model, "Naive Bayes")

# XGBoost classifier.
# The target has two classes, so the matching evaluation metric is binary
# log loss ('logloss'); 'mlogloss' is the multiclass variant.
xgb_model = xgb.XGBClassifier(
    eval_metric='logloss'
)
evaluate_model(xgb_model, "XGBoost")


--- Decision Tree - gini Evaluation ---
Accuracy: 0.81
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.98      0.89      4942
           1       0.80      0.30      0.44      1571

    accuracy                           0.81      6513
   macro avg       0.81      0.64      0.66      6513
weighted avg       0.81      0.81      0.78      6513


--- Decision Tree - entropy Evaluation ---
Accuracy: 0.81
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.98      0.89      4942
           1       0.80      0.30      0.44      1571

    accuracy                           0.81      6513
   macro avg       0.81      0.64      0.66      6513
weighted avg       0.81      0.81      0.78      6513


--- Naive Bayes Evaluation ---
Accuracy: 0.80
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.97      0.88      4942
      