In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Classification metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier
)
from sklearn.svm import SVC

# Boosting libraries (classification versions)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Suppress every warning category for the rest of the session so library
# warnings do not flood the cell outputs.
warnings.simplefilter("ignore")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


In [2]:
# Load the raw dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='data/raw.csv')

In [3]:
# Preview the first two rows of the raw frame.
df.head(n=2)

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,2,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K


In [4]:
# List the distinct values of each categorical column (the target included).
# One loop replaces the copy-pasted print pairs, so every label is built from
# the same template; print(label, values) separates the two with one space,
# which is the same layout the paired print(..., end=" ") calls produced.
for column in (
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
    'income',
):
    print("Categories in '{}' variable:".format(column), df[column].unique())

Categories in 'workclass' variable: ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' nan
 'Self-emp-inc' 'Without-pay' 'Never-worked']
Categories in 'education' variable: ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']
Categories in'marital_status' variable: ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
Categories in 'occupation' variable: ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' nan
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
Categories in 'relationship' variable: ['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']
Categories in 'race' variable: ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo'

In [5]:
# Build the modelling frame as a copy of the raw frame without these columns.
# NOTE(review): 'education' presumably duplicates 'education_num', and 'id' /
# 'fnlwgt' are presumably not predictive — confirm the reasons for dropping them.
df_ml = df.drop(["id", "fnlwgt", "education"], axis="columns")

In [6]:
# Encode the target as 0 / 1.
# The labels are read from the untouched raw frame `df` rather than from
# `df_ml`, so re-running this cell always gives the same result; mapping
# `df_ml["income"]` onto itself would turn the already-encoded 0/1 values
# into NaN on a second run. Any label other than the two listed becomes NaN.
df_ml["income"] = df["income"].map({"<=50K": 0, ">50K": 1})

In [7]:
# Apply log(1 + x) to the two capital columns.
# The inputs come from the raw frame `df`, not from `df_ml`, so the cell is
# idempotent: re-running it does not apply the logarithm a second time.
df_ml["capital_gain"] = np.log1p(df["capital_gain"])
df_ml["capital_loss"] = np.log1p(df["capital_loss"])

In [20]:
# Target vector, then the feature matrix (every column except the target).
y = df_ml["income"]
X = df_ml.drop(columns=["income"])

In [21]:
# Split the feature columns into numeric and non-numeric groups.
# The dtypes are taken from X (the frame that is actually modelled) instead of
# the raw frame, and select_dtypes is used so that any non-numeric dtype
# (object, category, string) lands in the categorical group rather than only
# columns whose dtype is exactly object.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 5 numerical features : ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

We have 7 categorical features : ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']


In [22]:
# Explicit column groups: numeric columns first, categorical columns second.
num_cols = [
    'age',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

cat_cols = [
    'workclass',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

In [10]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed random_state so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [24]:
def evaluate_model(true, predicted, predicted_proba=None):
    """Score a set of predictions against the ground truth.

    Parameters
    ----------
    true
        Ground-truth labels.
    predicted
        Predicted labels, compared against ``true``.
    predicted_proba : optional
        Scores passed to ``roc_auc_score``. When omitted (``None``), ROC AUC
        is not computed.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, roc_auc)``, each label metric
        computed by the corresponding scikit-learn function with its default
        settings. ``roc_auc`` is ``None`` when ``predicted_proba`` is ``None``.
    """
    # Label-based metrics, evaluated in the order they are returned.
    label_metrics = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [metric(true, predicted) for metric in label_metrics]

    # ROC AUC needs scores/probabilities, so it is optional.
    auc = roc_auc_score(true, predicted_proba) if predicted_proba is not None else None

    return (*scores, auc)

In [28]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones. handle_unknown='ignore' keeps transform from
# failing on a category that was not present when the encoder was fitted.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [30]:
# Seed shared by every estimator with a stochastic component, so the scores
# printed below are reproducible from one run to the next.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name shown in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest Classifier": RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=RANDOM_STATE),
    # `use_label_encoder` is intentionally not passed: it is a deprecated
    # XGBoost argument, and the target is already encoded as 0/1 integers.
    "XGBClassifier": XGBClassifier(eval_metric="logloss", random_state=RANDOM_STATE),
    "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=RANDOM_STATE),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=RANDOM_STATE),
}

# Parallel lists consumed by the summary table: model name -> test accuracy.
model_list = []
accuracy_list = []

for name, model in models.items():

    # Chain preprocessing and the estimator so the scaler / encoder are fitted
    # on the training split only and then re-applied unchanged to the test
    # split (no information from the test rows reaches the fitted transformers).
    # NOTE: the same `preprocessor` object is re-fitted on every iteration.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Fit preprocessing + model on the training split.
    pipeline.fit(X_train, y_train)

    # Predict both splits so the train/test gap (overfitting) is visible.
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    # Training metrics
    train_acc = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    # Test metrics
    test_acc = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    print("Model performance for Training set")
    print("- Accuracy: {:.4f}".format(train_acc))
    print("- F1 Score: {:.4f}".format(train_f1))

    print("----------------------------------")

    print("Model performance for Test set")
    print("- Accuracy: {:.4f}".format(test_acc))
    print("- F1 Score: {:.4f}".format(test_f1))

    # Only the test accuracy is kept for the ranking table.
    accuracy_list.append(test_acc)

    print("=" * 35)
    print("\n")

Logistic Regression
Model performance for Training set
- Accuracy: 0.8453
- F1 Score: 0.6476
----------------------------------
Model performance for Test set
- Accuracy: 0.8486
- F1 Score: 0.6586


K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.8819
- F1 Score: 0.7436
----------------------------------
Model performance for Test set
- Accuracy: 0.8357
- F1 Score: 0.6499


Decision Tree Classifier
Model performance for Training set
- Accuracy: 0.9787
- F1 Score: 0.9545
----------------------------------
Model performance for Test set
- Accuracy: 0.8225
- F1 Score: 0.6362


Random Forest Classifier
Model performance for Training set
- Accuracy: 0.9787
- F1 Score: 0.9553
----------------------------------
Model performance for Test set
- Accuracy: 0.8495
- F1 Score: 0.6685


Gradient Boosting Classifier
Model performance for Training set
- Accuracy: 0.8690
- F1 Score: 0.6930
----------------------------------
Model performance for Test set
- Accuracy: 0.8698
- F1

In [31]:
# Rank the models by test accuracy, best first. The two lists were filled in
# lockstep by the training loop, so zipping them pairs each name with its score.
(
    pd.DataFrame(
        list(zip(model_list, accuracy_list)),
        columns=['Model Name', 'accuracy_score'],
    )
    .sort_values(by=["accuracy_score"], ascending=False)
)

Unnamed: 0,Model Name,accuracy_score
6,CatBoostClassifier,0.875633
5,XGBClassifier,0.872563
4,Gradient Boosting Classifier,0.869799
7,AdaBoost Classifier,0.85813
3,Random Forest Classifier,0.849532
0,Logistic Regression,0.84861
1,K-Neighbors Classifier,0.835713
2,Decision Tree Classifier,0.822509
