In [None]:
# default_exp processing

In [None]:
from pandas.api.types import is_numeric_dtype

from knowledge_distillation.ensemble import *

In [None]:
# export
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn_pandas import DataFrameMapper, gen_features

from knowledge_distillation.io import *

# Load data

In [None]:
# Load the Adult dataset and pin its expected size (32,561 rows x 15 columns)
# so that an unexpected change in the source data fails immediately.
df = load_adult()
assert df.shape == (32561, 15)

In [None]:
# Preview the first rows to check the column names and raw values.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Preprocessing

In [None]:
# export

def scale_onehot(df, target):
    """Preprocess the features of `df` and integer-encode the `target` column.

    Object-dtype feature columns are imputed with the constant 'Na' and then
    one-hot encoded; numeric feature columns go through SimpleImputer (default
    settings) followed by StandardScaler. The transformers are fitted on `df`
    itself.

    Returns
    -------
    X : output of the DataFrameMapper (built with df_out=True), categorical
        outputs first, numerical outputs after.
    y : integer codes of the target, where code i refers to target_names[i].
    target_names : sorted list of the distinct target values.
    """
    predictors = df.drop(columns=target)

    def single_column_selectors(dtype):
        # One single-element list per column of the requested dtype
        return [[name] for name in predictors.select_dtypes(dtype).columns]

    constant_imputer = {'class': SimpleImputer, 'strategy': 'constant', 'fill_value': 'Na'}
    categorical_steps = gen_features(
        columns=single_column_selectors('object'),
        classes=[constant_imputer, OneHotEncoder],
    )
    numerical_steps = gen_features(
        columns=single_column_selectors('number'),
        classes=[SimpleImputer, StandardScaler],
    )

    # The target column is still present in `df`, but no selector refers to it
    X = DataFrameMapper(categorical_steps + numerical_steps, df_out=True).fit_transform(df)

    target_names = list(df[target].unique())
    target_names.sort()

    # Ordered categories make the integer codes follow the sorted label order
    y = pd.Categorical(df[target], categories=target_names, ordered=True).codes

    return X, y, target_names

In [None]:
X, y, target_names = scale_onehot(df, target='salary')

# Every output column must be numeric once the features have been preprocessed.
assert all(is_numeric_dtype(X[c]) for c in X.columns), "Some column is not numeric"

# Preprocessing must not drop rows.
assert X.shape[0] == df.shape[0], "Some rows have been lost in the preprocessing"

# The salary target is binary ...
assert len(np.unique(y)) == 2

# ... and every encoded class must have a display name.
assert len(target_names) == len(np.unique(y))

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [None]:
# export

def split_with_seed(X, y, test_size=.2, random_state=42):
    """Split features and labels into train and test sets with a fixed seed.

    Parameters
    ----------
    X, y : features and labels, passed unchanged to `train_test_split`.
    test_size : forwarded as `test_size` (default 0.2).
    random_state : seed forwarded as `random_state`. Defaults to 42, the value
        that used to be hard-coded, so existing calls produce the same split.

    Returns
    -------
    tuple : (X_train, X_test, y_train, y_test)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state)

    return X_train, X_test, y_train, y_test
    

In [None]:
X_train, X_test, y_train, y_test = split_with_seed(X, y)

# The split must keep every feature column numeric.
assert all(is_numeric_dtype(X_train[c]) for c in X_train.columns), "Some column is not numeric"

# Train and test together must account for every original row.
assert X_train.shape[0] + X_test.shape[0] == df.shape[0], "Some rows have been lost in the split"

# Both classes must be present in each split.
assert len(np.unique(y_train)) == 2
assert len(np.unique(y_test)) == 2

# Dummy training (baseline)

In [None]:
# Baseline: a classifier that ignores the features and always predicts the most frequent training label.
clf = DummyClassifier(strategy='most_frequent')

In [None]:
# Fit the baseline on the training split only.
clf.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

# Evaluate

In [None]:
# np.nonzero returns one index array per axis. For a 2-D array the second array holds the
# column index of every non-zero entry, i.e. the class index of each row of a one-hot matrix.
np.nonzero(np.array([[1, 0, 0], [0, 0, 3]]))

(array([0, 1]), array([0, 2]))

In [None]:
# export

def _as_class_indices(labels):
    """Return `labels` as a 1-D array of class indices.

    1-D input is returned as an array without further change. 2-D input with
    several columns (one-hot rows or per-class scores) is reduced with argmax
    along axis 1; 2-D input with a single column is flattened.
    """
    labels = np.asarray(labels)
    if labels.ndim > 1:
        if labels.shape[1] == 1:
            return labels.ravel()
        return labels.argmax(axis=1)
    return labels


def evaluate_model(X_train, X_test, y_train, y_test, model, model_name, save_to_disk, print_report=True, target_names=None):
    """Score a fitted binary classifier on the train and test splits.

    Parameters
    ----------
    X_train, X_test : feature sets passed to `model.predict`.
    y_train, y_test : true labels, either 1-D class indices or 2-D one-hot rows.
    model : fitted estimator exposing `predict`; 1-D and 2-D predictions are accepted.
    model_name : label stored in the `model_name` column and used in the CSV file name.
    save_to_disk : when truthy, write the metrics table to
        ASSETS_PATH / f"{model_name}_evaluation.csv". ASSETS_PATH is not defined
        in this notebook; presumably it is provided by the star import from
        knowledge_distillation.io - verify.
    print_report : when truthy, print sklearn's classification report for each split.
    target_names : optional display names for classes 0 and 1 in the printed report.

    Returns
    -------
    pandas.DataFrame with one row per split ('train', 'test') and the columns
    model_name, data, accuracy, precision, recall, f1, auc.
    """
    # Targets and predictions are normalised independently, so one-hot targets and
    # 2-D predictions are handled whether they occur alone or together.
    preds = [
        (_as_class_indices(y_train), _as_class_indices(model.predict(X_train))),
        (_as_class_indices(y_test), _as_class_indices(model.predict(X_test))),
    ]

    df = pd.DataFrame(
        dict(
            model_name = [model_name, model_name],
            data = ['train', 'test'],
            accuracy = [accuracy_score(y_true, y_pred) for y_true, y_pred in preds],
            precision = [precision_score(y_true, y_pred) for y_true, y_pred in preds],
            recall = [recall_score(y_true, y_pred) for y_true, y_pred in preds],
            f1 = [f1_score(y_true, y_pred) for y_true, y_pred in preds],
            # AUC is computed from hard class predictions, not from scores
            auc = [roc_auc_score(y_true, y_pred) for y_true, y_pred in preds],
        )
    )

    # Honour the flag: the table used to be written unconditionally
    if save_to_disk:
        df.to_csv(ASSETS_PATH / f"{model_name}_evaluation.csv", index=False)

    if print_report:
        print("=== Train ===")
        print(classification_report(*preds[0], labels=[0, 1], target_names=target_names))
        print("\n=== Test ===")
        print(classification_report(*preds[1], labels=[0, 1], target_names=target_names))
        print()

    return df

In [None]:
# Score the most-frequent baseline on both splits; the returned metrics table is displayed below the printed reports.
evaluate_model(X_train, X_test, y_train, y_test, clf, 'dummy', save_to_disk=True, target_names=target_names)

=== Train ===
              precision    recall  f1-score   support

       <=50K       0.76      1.00      0.86     19778
        >50K       0.00      0.00      0.00      6270

    accuracy                           0.76     26048
   macro avg       0.38      0.50      0.43     26048
weighted avg       0.58      0.76      0.66     26048


=== Test ===
              precision    recall  f1-score   support

       <=50K       0.76      1.00      0.86      4942
        >50K       0.00      0.00      0.00      1571

    accuracy                           0.76      6513
   macro avg       0.38      0.50      0.43      6513
weighted avg       0.58      0.76      0.65      6513




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model_name,data,accuracy,precision,recall,f1,auc
0,dummy,train,0.759291,0.0,0.0,0.0,0.5
1,dummy,test,0.75879,0.0,0.0,0.0,0.5


# Ensemble identical models with different random seeds and average their predictions

In [None]:
# Ten DummyClassifier base estimators that differ only in random_state (1..10).
# NOTE(review): DummyClassifier is built with its default strategy here; the metrics reported
# below match the single most-frequent baseline exactly, so the seeds appear to have no effect - confirm.
estimators = [(f'dummy_{x}', DummyClassifier(random_state=x)) for x in range(1, 11)]

# UnbiasedAverage is not defined in this notebook - presumably it comes from the
# knowledge_distillation.ensemble star import and averages the base predictions; verify.
stack = StackingClassifier(
    estimators, 
    final_estimator=UnbiasedAverage(), 
    n_jobs=-1)

stack.fit(X_train, y_train)

StackingClassifier(estimators=[('dummy_1', DummyClassifier(random_state=1)),
                               ('dummy_2', DummyClassifier(random_state=2)),
                               ('dummy_3', DummyClassifier(random_state=3)),
                               ('dummy_4', DummyClassifier(random_state=4)),
                               ('dummy_5', DummyClassifier(random_state=5)),
                               ('dummy_6', DummyClassifier(random_state=6)),
                               ('dummy_7', DummyClassifier(random_state=7)),
                               ('dummy_8', DummyClassifier(random_state=8)),
                               ('dummy_9', DummyClassifier(random_state=9)),
                               ('dummy_10', DummyClassifier(random_state=10))],
                   final_estimator=UnbiasedAverage(), n_jobs=-1)

In [None]:
# Score the stacked dummy ensemble on both splits, under the name 'dummy_ensemble'.
evaluate_model(X_train, X_test, y_train, y_test, stack, 'dummy_ensemble', save_to_disk=True, target_names=target_names)

=== Train ===
              precision    recall  f1-score   support

       <=50K       0.76      1.00      0.86     19778
        >50K       0.00      0.00      0.00      6270

    accuracy                           0.76     26048
   macro avg       0.38      0.50      0.43     26048
weighted avg       0.58      0.76      0.66     26048


=== Test ===
              precision    recall  f1-score   support

       <=50K       0.76      1.00      0.86      4942
        >50K       0.00      0.00      0.00      1571

    accuracy                           0.76      6513
   macro avg       0.38      0.50      0.43      6513
weighted avg       0.58      0.76      0.65      6513




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model_name,data,accuracy,precision,recall,f1,auc
0,dummy_ensemble,train,0.759291,0.0,0.0,0.0,0.5
1,dummy_ensemble,test,0.75879,0.0,0.0,0.0,0.5


# Export module

In [None]:
# nbdev: convert the project's notebooks into their Python modules (one "Converted ..." line per notebook).
# The import is kept in this final cell so it stays out of the exported module code.
from nbdev.export import notebook2script
notebook2script()

Converted 00_io.ipynb.
Converted 01_processing.ipynb.
Converted 02_ensemble.ipynb.
Converted 03a_logistic.ipynb.
Converted 03b_NN.ipynb.
Converted 03c_bag_of_trees.ipynb.
Converted 03d_automl.ipynb.
Converted 04_train_NN.ipynb.
Converted 05_train_multiple_NNs.ipynb.
Converted 05a_test_seed_is_working.ipynb.
Converted 06_NN_ensemble.ipynb.
Converted 07_distill_multiple_NNs.ipynb.
Converted 07a_train_and_distill_multiple_NNs.ipynb.
Converted 08_compare.ipynb.
Converted XX_02_keras_native.ipynb.
Converted index.ipynb.
