# Baseline utility analysis for Adult dataset

This notebook contains ML classification analysis for Adult dataset. This includes the hyperparameter optimisation for various classifiers and cross validation using the optimal settings. These serve as baseline performace scores when comparing to fingerprinted data.
- [1. XGBoost](#XGBoost)
- [2. Multi-layer Perceptron](#MLP)
- [3. Random Forest](#Random-Forest)
- [4. Logistic Regression](#Logistic-Regression)
- [5. Gradient Boosting](#Gradient-Boosting)

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from itertools import product
import json
import os
import sys

from sklearn import metrics, preprocessing
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [2]:
# Include properly the project root into the path
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [3]:
from NCorrFP.ncorr import *
#from NCorrFP.analysis.NCorr_FP_plot import *
from datasets import Dataset, Adult
#from utils import fp_cross_val_score

## Hyperparameter optimisation and performance on clean data

In [4]:
# fetch dataset
original_data = Adult()

In [6]:
original_data.dataframe.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [7]:
# cleaning the data 
#original_data.dropna()
original_data.dataframe = original_data.dataframe.dropna()
# encode categorical features and drop redundant 
original_data.number_encode_categorical()
original_data.dataframe = original_data.dataframe.drop(['fnlwgt','education'], axis=1)

# define target attribute
X = original_data.get_features()
y = original_data.get_target()

# scale features
scaler = preprocessing.StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns) #, index=X.index)
X.shape

(48842, 12)

### XGBoost

In [8]:
# define the model and possible hyperparameters
n_estimators = range(20, 200, 20)
max_depth = range(3,12,1)
gamma = [i/10.0 for i in range(0,5)]
colsample_bytree = [i/10.0 for i in range(6,10)]
learning_rate = [i/1000.0 for i in range(1, 10, 2)]
subsample = [i/10.0 for i in range(5,9)]
reg_alpha = [1e-5, 1e-2, 0.1, 1, 100]

hyperparams = dict(n_estimators=n_estimators,
                  max_depth=max_depth,
                  gamma=gamma,
                  colsample_bytree=colsample_bytree, 
                  learning_rate=learning_rate,
                  subsample=subsample,
                  reg_alpha=reg_alpha)

# classifier definition
xgboost = xgb.XGBClassifier()
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

# random search
clf = RandomizedSearchCV(xgboost, hyperparams, random_state=4)
search_xgb = clf.fit(X, y)
search_xgb.best_score_

0.8628025195278752

In [44]:
search_xgb.best_params_

{'subsample': 0.5,
 'reg_alpha': 0.01,
 'n_estimators': 160,
 'max_depth': 10,
 'learning_rate': 0.009,
 'gamma': 0.0,
 'colsample_bytree': 0.9}

In [57]:
json.dumps(search_xgb.best_params_)

'{"subsample": 0.5, "reg_alpha": 0.01, "n_estimators": 160, "max_depth": 10, "learning_rate": 0.009, "gamma": 0.0, "colsample_bytree": 0.9}'

### MLP

In [50]:
# Define the model and possible hyperparameters
# WARNING: long runtime -- The results are already saved in mpl_best_params.json and scores.csv
hidden_layer_sizes = [(50,), (100,), (50, 50), (100, 50), (100, 100)]
activation = ['identity', 'logistic', 'tanh', 'relu']
solver = ['lbfgs', 'sgd', 'adam']
alpha = [0.0001, 0.001, 0.01, 0.1]
learning_rate = ['constant', 'invscaling', 'adaptive']

hyperparams = dict(hidden_layer_sizes=hidden_layer_sizes,
                   activation=activation,
                   solver=solver,
                   alpha=alpha,
                   learning_rate=learning_rate)

# Hyperparameter random search
mlp = MLPClassifier(max_iter=500)
clf = RandomizedSearchCV(mlp, hyperparams, random_state=0, n_iter=20, cv=5)
search_mlp = clf.fit(X, y)
search_mlp.best_score_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

0.8525844974198268

In [53]:
search_mlp.best_params_

{'solver': 'adam',
 'learning_rate': 'invscaling',
 'hidden_layer_sizes': (50,),
 'alpha': 0.001,
 'activation': 'relu'}

In [56]:
import json
j = json.dumps(search_mlp.best_params_)
j

'{"solver": "adam", "learning_rate": "invscaling", "hidden_layer_sizes": [50], "alpha": 0.001, "activation": "relu"}'

### Random Forest

In [20]:
# define the model and possible hyperparameters
n_estimators = range(20, 200, 20)
criterion = ['gini', 'entropy', 'log_loss']

hyperparams = dict(n_estimators=n_estimators,
                  criterion=criterion)

# hyperparameter random search
rf = RandomForestClassifier()
clf = RandomizedSearchCV(rf, hyperparams, random_state=0)
search_rf = clf.fit(X, y)
search_rf.best_score_

0.8529529016055962

In [21]:
search_rf.best_params_

{'n_estimators': 120, 'criterion': 'entropy'}

In [24]:
json.dumps(search_rf.best_params_)

'{"n_estimators": 120, "criterion": "entropy"}'

### Logistic Regression

In [25]:
solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
C = range(10, 100, 10)

hyperparams = dict(solver=solver, C=C)

# hyperparameter random search
lr = LogisticRegression(random_state=0)
clf = RandomizedSearchCV(lr, hyperparams, random_state=0)
search_lr = clf.fit(X, y)
search_lr.best_score_

0.8253739544158705

In [26]:
search_lr.best_params_

{'solver': 'newton-cg', 'C': 10}

### Gradient Boosting

In [31]:
param_grid = {
    'n_estimators': [100, 300, 500],          
    'learning_rate': [0.01, 0.05, 0.1, 0.2],      
    'max_depth': [3, 5, 7, 10],                   
    'min_samples_split': [2, 5, 10],        
    'min_samples_leaf': [1, 2, 4],             
    'subsample': [0.6, 0.8, 1.0],             
    'max_features': ['sqrt', 'log2', None],     
    'criterion': ['friedman_mse', 'squared_error']
}

# random search
gb = GradientBoostingClassifier(random_state=0)
clf = RandomizedSearchCV(gb, param_grid, random_state=0)
search_gb = clf.fit(X, y)
search_gb.best_score_

0.872055652444874

In [32]:
search_gb.best_params_

{'subsample': 1.0,
 'n_estimators': 500,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': None,
 'max_depth': 10,
 'learning_rate': 0.01,
 'criterion': 'squared_error'}

In [33]:
json.dumps(search_gb.best_params_)

'{"subsample": 1.0, "n_estimators": 500, "min_samples_split": 10, "min_samples_leaf": 4, "max_features": null, "max_depth": 10, "learning_rate": 0.01, "criterion": "squared_error"}'