In [None]:
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier

In [None]:
from knowledge_distillation.io import *
from knowledge_distillation.processing import * 

In [None]:
# Load the dataset through the project's loader; the preview that follows shows
# census-style feature columns plus a `salary` label column.
# NOTE(review): presumably the UCI Adult dataset, going by the loader's name — confirm.
df = load_adult()

In [None]:
# Peek at the first rows to check the column names and raw values.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Preprocess for LightGBM models

In [None]:
# Distinct salary labels in sorted order; passed to the evaluation helper as the
# display names of the classes. The raw labels keep their leading space.
target_names = df['salary'].unique().tolist()
target_names.sort()
target_names

[' <=50K', ' >50K']

In [None]:
# Features: every column except the label.
X = df.drop(columns='salary')

# Binary target: 1 for the high-income class, 0 otherwise. Labels are stripped
# before comparing because the raw values carry a leading space (' >50K'); an
# exact match against a hard-coded padded string would silently produce an
# all-zero target if the loader ever normalizes label whitespace.
y = (df.salary.str.strip() == '>50K').astype(int)
 

In [None]:
# Cast every object-dtype (string-valued) feature column to pandas' categorical
# dtype, so that LightGBM can treat these columns as categorical features
# instead of failing on raw strings.
object_columns = X.select_dtypes('object').columns
categories = X[object_columns].astype('category')
X[object_columns] = categories

In [None]:
# Partition features and target into train/test sets with the project helper.
# NOTE(review): an earlier draft called sklearn's
# train_test_split(X, y, test_size=.2, random_state=42) directly; whether
# split_with_seed uses the same settings is not visible from this notebook.
splits = split_with_seed(X, y)
X_train, X_test, y_train, y_test = splits

# Sanity check: show the shape of each partition.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26048, 14), (6513, 14), (26048,), (6513,))

# Averaging ensembles: random forest and extra trees

In [None]:
# LightGBM in random-forest mode. The bagging settings are part of the recipe,
# not tuning noise: rf mode needs row subsampling enabled, here a 60% sample
# redrawn at every iteration.
rf_params = dict(
    boosting_type='rf',
    bagging_fraction=0.6,
    bagging_freq=1,
    random_state=1,
)
rf = LGBMClassifier(**rf_params)

# Keep fit() as the last expression so the cell displays the fitted estimator.
rf.fit(X_train, y_train)



LGBMClassifier(bagging_fraction=0.6, bagging_freq=1, boosting_type='rf',
               random_state=1)

In [None]:
# Print train/test classification reports for the random forest and show its
# summary metrics table. save_to_disk=True presumably persists those metrics —
# confirm in the knowledge_distillation package.
evaluate_model(
    X_train, X_test, y_train, y_test,
    rf, 'rf',
    save_to_disk=True,
    target_names=target_names,
)

=== Train ===
              precision    recall  f1-score   support

       <=50K       0.89      0.94      0.91     19778
        >50K       0.75      0.63      0.69      6270

    accuracy                           0.86     26048
   macro avg       0.82      0.78      0.80     26048
weighted avg       0.86      0.86      0.86     26048


=== Test ===
              precision    recall  f1-score   support

       <=50K       0.89      0.94      0.91      4942
        >50K       0.76      0.63      0.69      1571

    accuracy                           0.86      6513
   macro avg       0.82      0.78      0.80      6513
weighted avg       0.86      0.86      0.86      6513




Unnamed: 0,model_name,data,accuracy,precision,recall,f1,auc
0,rf,train,0.86164,0.754876,0.629665,0.686609,0.782423
1,rf,test,0.862429,0.759416,0.628899,0.688022,0.782782


In [None]:
# Same random-forest configuration as the plain `rf` model, with LightGBM's
# extra_trees option switched on (extremely randomized split thresholds).
rf_xt_params = dict(
    boosting_type='rf',
    bagging_fraction=0.6,
    bagging_freq=1,
    extra_trees=True,
    random_state=1,
)
rf_xt = LGBMClassifier(**rf_xt_params)

# Keep fit() as the last expression so the cell displays the fitted estimator.
rf_xt.fit(X_train, y_train)



LGBMClassifier(bagging_fraction=0.6, bagging_freq=1, boosting_type='rf',
               extra_trees=True, random_state=1)

In [None]:
# Print train/test classification reports for the extra-trees random forest and
# show its summary metrics table. save_to_disk=True presumably persists those
# metrics — confirm in the knowledge_distillation package.
evaluate_model(
    X_train, X_test, y_train, y_test,
    rf_xt, 'rf_xt',
    save_to_disk=True,
    target_names=target_names,
)

=== Train ===
              precision    recall  f1-score   support

       <=50K       0.89      0.92      0.90     19778
        >50K       0.71      0.64      0.67      6270

    accuracy                           0.85     26048
   macro avg       0.80      0.78      0.79     26048
weighted avg       0.85      0.85      0.85     26048


=== Test ===
              precision    recall  f1-score   support

       <=50K       0.89      0.92      0.90      4942
        >50K       0.71      0.64      0.67      1571

    accuracy                           0.85      6513
   macro avg       0.80      0.78      0.79      6513
weighted avg       0.85      0.85      0.85      6513




Unnamed: 0,model_name,data,accuracy,precision,recall,f1,auc
0,rf_xt,train,0.849931,0.709346,0.637959,0.671761,0.777544
1,rf_xt,test,0.851067,0.714796,0.636537,0.673401,0.7779


# Boosting ensemble: GBDT

In [None]:
# Gradient-boosted decision trees with otherwise default hyper-parameters;
# only the seed is pinned for reproducibility.
gbdt_params = dict(
    boosting_type='gbdt',
    random_state=1,
)
gbdt = LGBMClassifier(**gbdt_params)

# Keep fit() as the last expression so the cell displays the fitted estimator.
gbdt.fit(X_train, y_train)

LGBMClassifier(random_state=1)

In [None]:
# Print train/test classification reports for the boosted model and show its
# summary metrics table. save_to_disk=True presumably persists those metrics —
# confirm in the knowledge_distillation package.
evaluate_model(
    X_train, X_test, y_train, y_test,
    gbdt, 'gbdt',
    save_to_disk=True,
    target_names=target_names,
)

=== Train ===
              precision    recall  f1-score   support

       <=50K       0.91      0.95      0.93     19778
        >50K       0.82      0.70      0.76      6270

    accuracy                           0.89     26048
   macro avg       0.87      0.83      0.84     26048
weighted avg       0.89      0.89      0.89     26048


=== Test ===
              precision    recall  f1-score   support

       <=50K       0.91      0.94      0.92      4942
        >50K       0.79      0.69      0.74      1571

    accuracy                           0.88      6513
   macro avg       0.85      0.82      0.83      6513
weighted avg       0.88      0.88      0.88      6513




Unnamed: 0,model_name,data,accuracy,precision,recall,f1,auc
0,gbdt,train,0.892391,0.822752,0.704785,0.759213,0.828325
1,gbdt,test,0.881775,0.793837,0.688733,0.73756,0.815937


In [None]:
# Same boosted configuration as `gbdt`, with LightGBM's extra_trees option
# switched on (extremely randomized split thresholds).
gbdt_xt_params = dict(
    boosting_type='gbdt',
    extra_trees=True,
    random_state=1,
)
gbdt_xt = LGBMClassifier(**gbdt_xt_params)

# Keep fit() as the last expression so the cell displays the fitted estimator.
gbdt_xt.fit(X_train, y_train)

LGBMClassifier(extra_trees=True, random_state=1)

In [None]:
# Print train/test classification reports for the extra-trees boosted model and
# show its summary metrics table. save_to_disk=True presumably persists those
# metrics — confirm in the knowledge_distillation package.
evaluate_model(
    X_train, X_test, y_train, y_test,
    gbdt_xt, 'gbdt_xt',
    save_to_disk=True,
    target_names=target_names,
)

=== Train ===
              precision    recall  f1-score   support

       <=50K       0.90      0.95      0.92     19778
        >50K       0.79      0.67      0.73      6270

    accuracy                           0.88     26048
   macro avg       0.85      0.81      0.82     26048
weighted avg       0.87      0.88      0.88     26048


=== Test ===
              precision    recall  f1-score   support

       <=50K       0.90      0.94      0.92      4942
        >50K       0.78      0.67      0.72      1571

    accuracy                           0.87      6513
   macro avg       0.84      0.80      0.82      6513
weighted avg       0.87      0.87      0.87      6513




Unnamed: 0,model_name,data,accuracy,precision,recall,f1,auc
0,gbdt_xt,train,0.878916,0.794964,0.669697,0.726974,0.80747
1,gbdt_xt,test,0.874712,0.781927,0.666454,0.719588,0.803685
