In [79]:
# Widen the notebook container to 75% of the browser window.
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [84]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from tpot import TPOTRegressor
np.random.seed(8675309)  # seed courtesy of Tommy Tutone
from GPyOpt.methods import BayesianOptimization
from sklearn.model_selection import cross_val_score, KFold
from sklearn.datasets import load_diabetes
from scipy.stats import uniform, randint
import re

# Project 2 Homework

For this project you're going to apply hyperparameter optimization to both a regression and a classification problem. It looks like a lot to do below, but it's mostly a matter of modifying code from the presentation. 

Guidelines:

* Apply all 4 methods from the presentation: GridSearchCV, RandomizedSearchCV, BayesianOptimization, and TPOT.
    * For GridSearchCV you should include at least 2 or 3 values for each hyperparameter and one of those values should be the default.
    * For BayesianOptimization you'll have to use `int()` or `bool()` to cast the float values of the hyperparameters inside your `cross_cv()` function.
    * For TPOT you should use a finer grid than for GridSearchCV, but not more than 10 to 20 possible values for each hyperparameter.  You could lower the number of possible values to keep the search space smaller.
    * If your code is too slow you can reduce the number of cross-validation folds to 3 and if your dataset is really large you can randomly choose a smaller subset of the rows.
* For TPOT do both hyperparameter optimization on the specified model and also run TPOT and let it choose the model.
* You can use either the specified dataset or you can choose your own.  
    * If you use your own data it should have at least 500 rows and 10 features.  
    * If your data has categorical features you'll need to "one hot" encode it (convert categorical features into multiple binary features).  <a href="https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/">Here is a nice tutorial</a>.  For categories with only two values you can remove one of the two hot encoded columns.
* For each problem you need to include your narrated code and a summary.  
* The summaries should each include 
    * the results of all 5 approaches (the 4 methods and the TPOT model search)
    * a table that reports the optimized score from the test data as well as the number of model fits used in the optimization
    * a brief discussion about which hyperparameter optimization approach worked best
* If you do want to use your own data, we suggest first getting things working with the suggested datasets.  Finding, cleaning, and preparing data can take a lot of time.

# Problem 1 - Optimize Random Forest Regression

Find optimized hyperparameters for a random forest regression model. You may use either the diabetes data used in the presentation or a dataset that you choose.  **You do not need to include the TPOT general search for this problem**.  Include code and summary.  Here are ranges for a subset of the hyperparameters:

Hyperparameter |Type | Default Value | Typical Range
---- | ---- | ---- | ----
n_estimators | discrete / integer | 100 | 10 to 150
max_features | continuous / float | 1.0 | 0.05 to 1.0
min_samples_split | discrete / integer | 2 | 2 to 20
min_samples_leaf | discrete / integer | 1 | 1 to 20
bootstrap | discrete / boolean | True | True, False


You can add other hyperparameters to the optimization if you wish.
<a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html">Documentation for sklearn RandomForestRegressor</a>

<font color = "blue"> *** 20 points - add cells below as needed </font>

In [71]:
# Load the diabetes data as NumPy arrays: x holds the features, y the target.
diabetes = load_diabetes()
x = np.array(diabetes.data)
y = np.array(diabetes.target)

# Hold out 20% of the rows for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

In [178]:
def calculate_scores(model, best_params, x_test = x_test, y_test = y_test):
    """Score a fitted model on test data and attach the metrics to its hyperparameters.

    Parameters
    ----------
    model : fitted object exposing ``score(X, y)`` and ``predict(X)``
    best_params : dict
        Hyperparameter values to report. It is copied, not modified.
    x_test, y_test : array-like
        Test features and targets. The defaults are the Problem 1 test split
        as it existed when this function was defined.

    Returns
    -------
    dict
        A copy of ``best_params`` with two extra keys: ``'rmse'`` (root mean
        squared error of the predictions) and ``'r_squared'`` (the value
        returned by ``model.score``).
    """
    r_squared = model.score(x_test, y_test)
    # Predict on the x_test argument (lower case); the capitalised X_test is
    # the Problem 2 loans data and must not be used here.
    y_pred = model.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Work on a copy so the caller's dict (e.g. a search object's
    # best_params_) is not polluted with metric keys.
    results = dict(best_params)
    results['rmse'] = rmse
    results['r_squared'] = r_squared

    return results

def cv_score(hyp_parameters):
    """Objective function: mean cross-validated score of a random forest.

    Only the first row of ``hyp_parameters`` is read. Its entries are, in
    order: n_estimators, max_features, min_samples_split, min_samples_leaf,
    bootstrap. The integer and boolean entries are cast from whatever numeric
    type is supplied.

    The model is evaluated with 5-fold cross-validation on the module-level
    ``x_train`` / ``y_train`` using the estimator's default scorer, and the
    mean fold score is returned wrapped in a NumPy array.
    """
    candidate = hyp_parameters[0]

    forest = RandomForestRegressor(
        n_estimators=int(candidate[0]),
        max_features=candidate[1],
        min_samples_split=int(candidate[2]),
        min_samples_leaf=int(candidate[3]),
        bootstrap=bool(candidate[4]),
    )

    fold_scores = cross_val_score(forest, X=x_train, y=y_train, cv=KFold(n_splits=5))
    mean_score = fold_scores.mean()
    return np.array(mean_score)

def lines_that_start_with(string, fp):
    """Return, in order, the items of ``fp`` that begin with ``string``.

    ``fp`` is any iterable of strings, such as an open text file.
    """
    matching = []
    for candidate in fp:
        if candidate.startswith(string):
            matching.append(candidate)
    return matching

def optimize_model(model,
                   opt_type,
                   x_train = x_train,
                   y_train = y_train,
                   x_test = x_test,
                   y_test = y_test,
                   use_parallel = True,
                   cv = 5):
    """Tune random forest hyperparameters with one of four search strategies.

    Parameters
    ----------
    model : estimator
        Estimator handed to GridSearchCV / RandomizedSearchCV. The 'bayes'
        and 'tpot' branches build their own RandomForestRegressor /
        TPOTRegressor and do not use it.
    opt_type : {'gridcv', 'randomcv', 'bayes', 'tpot'}
        Search strategy to run.
    x_train, y_train : array-like
        Training data. The 'bayes' objective (``cv_score``) reads the
        module-level training split instead of these arguments; they are used
        only for the final refit in that branch.
    x_test, y_test : array-like
        Test data forwarded to ``calculate_scores``.
    use_parallel : bool
        If truthy, the grid and random searches run with ``n_jobs=-1``.
    cv : int
        Number of folds for the grid and random searches. ``cv_score`` uses 5
        folds and the TPOT branch uses 3, both hard-coded.

    Returns
    -------
    dict
        Best hyperparameters found plus the ``'rmse'`` and ``'r_squared'``
        entries added by ``calculate_scores``.

    Raises
    ------
    ValueError
        If ``opt_type`` is not one of the supported strategies.
    """
    supported = ('gridcv', 'randomcv', 'bayes', 'tpot')
    if opt_type not in supported:
        raise ValueError(
            "opt_type must be one of {}, got {!r}".format(supported, opt_type))

    if opt_type == 'randomcv':
        # Distributions to sample from. scipy's uniform takes (loc, scale), so
        # (0.05, 0.95) covers [0.05, 1.0]; a scale of 1 would reach 1.05,
        # which is not a valid max_features fraction.
        n_estimators = randint(10, 151)
        max_features = uniform(0.05, 0.95)
        min_samples_split = randint(2, 21)
        min_samples_leaf = randint(1, 21)
    else:
        # Coarse grids. The builtin int replaces the removed np.int alias.
        n = 3
        n_estimators = np.arange(10, 170, 30).tolist()
        max_features = np.linspace(0.05, 1, n).tolist()
        min_samples_split = np.linspace(2, 20, n, dtype=int).tolist()
        min_samples_leaf = np.linspace(1, 20, n, dtype=int).tolist()
    bootstrap = [True, False]

    params = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "bootstrap": bootstrap
    }

    n_jobs = -1 if use_parallel else 1

    if opt_type == 'gridcv':
        print("Performing GridSearchCV:")
        grid_search = GridSearchCV(model,
                                   param_grid=params,
                                   cv=cv,
                                   verbose=0,
                                   n_jobs=n_jobs,
                                   return_train_score=True)
        grid_search.fit(x_train, y_train)
        # Pass a copy so the search object's best_params_ is left untouched.
        best_params = dict(grid_search.best_params_)
        return calculate_scores(grid_search, best_params, x_test, y_test)

    if opt_type == 'randomcv':
        print("Performing RandomSearchCV:")
        random_search = RandomizedSearchCV(
            model,
            param_distributions=params,
            random_state=8675309,
            n_iter=25,
            cv=cv,
            verbose=0,
            n_jobs=n_jobs,
            return_train_score=True)
        random_search.fit(x_train, y_train)
        best_params = dict(random_search.best_params_)
        return calculate_scores(random_search, best_params, x_test, y_test)

    if opt_type == 'bayes':
        # A GPyOpt 'discrete' domain lists every allowed value, so integer
        # hyperparameters get the full range between the grid end points
        # rather than just (min, max).
        hp_bounds = [
            {'name': 'n_estimators', 'type': 'discrete',
             'domain': tuple(range(min(n_estimators), max(n_estimators) + 1))},
            {'name': 'max_features', 'type': 'continuous',
             'domain': (min(max_features), max(max_features))},
            {'name': 'min_samples_split', 'type': 'discrete',
             'domain': tuple(range(min(min_samples_split), max(min_samples_split) + 1))},
            {'name': 'min_samples_leaf', 'type': 'discrete',
             'domain': tuple(range(min(min_samples_leaf), max(min_samples_leaf) + 1))},
            # bootstrap is encoded as 0/1 and cast back to bool afterwards.
            {'name': 'bootstrap', 'type': 'discrete', 'domain': (0, 1)},
        ]

        optimizer = BayesianOptimization(f=cv_score,
                                         domain=hp_bounds,
                                         model_type='GP',
                                         acquisition_type='EI',
                                         acquisition_jitter=0.05,
                                         exact_feval=True,
                                         maximize=True,
                                         verbosity=False)

        optimizer.run_optimization(max_iter=20, verbosity=True)

        # x_opt holds floats in the same order as hp_bounds; restore the
        # type each hyperparameter needs.
        best_params = {}
        for bound, value in zip(hp_bounds, optimizer.x_opt):
            name = bound['name']
            if name == 'bootstrap':
                best_params[name] = bool(value)
            elif bound['type'] == 'continuous':
                best_params[name] = float(value)
            else:
                best_params[name] = int(value)

        bayopt_search = RandomForestRegressor(**best_params)
        bayopt_search.fit(x_train, y_train)
        return calculate_scores(bayopt_search, best_params, x_test, y_test)

    # opt_type == 'tpot' (validated above)
    tpot_config = {'sklearn.ensemble.RandomForestRegressor': dict(params)}

    tpot = TPOTRegressor(generations=5,
                         scoring="r2",
                         population_size=20,
                         verbosity=0,
                         config_dict=tpot_config,
                         cv=3,
                         random_state=8675309)
    tpot.fit(x_train, y_train)
    tpot.export('tpot_rf.py')

    # Read the chosen hyperparameters from the final step of the fitted
    # pipeline instead of regex-parsing and eval-ing the exported source,
    # which breaks when the pipeline spans several lines.
    final_estimator = tpot.fitted_pipeline_.steps[-1][1]
    fitted_params = final_estimator.get_params()
    best_params = {name: fitted_params[name] for name in params}

    return calculate_scores(tpot, best_params, x_test, y_test)
        

# Run the TPOT hyperparameter search for the random forest.
# NOTE: the 'tpot' branch of optimize_model builds its own TPOTRegressor and does not use rf_model.
rf_model = RandomForestRegressor(random_state=0)
optimize_model(rf_model, opt_type = "tpot")

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.525, min_samples_leaf=1, min_samples_split=20, n_estimators=40)


{'bootstrap': True,
 'max_features': 0.525,
 'min_samples_leaf': 1,
 'min_samples_split': 20,
 'n_estimators': 40,
 'rmse': 53.54878523798661,
 'r_squared': 0.5448577447242925}

# Problem 2 - Optimize XGBoost Classifier

Find optimized hyperparameters for an xgboost classifier model.  The first cell below loads a subset of the loans default data from DS705 and your job is to predict whether a loan defaults or not.  The `status_Bad` column is the target column and a 1 indicates a loan that defaulted.  We have selected a subset of the original data that includes 2000 each of good and bad loans.  The data has already been cleaned and encoded.  You're welcome to look into a different dataset, but start by getting this working and then add your own data.

The score for each model will be accuracy.  Your summary table should include accuracy, sensitivity, and precision for each optimized model applied to the test data.  <a href="https://classeval.wordpress.com/introduction/basic-evaluation-measures/">Here is a nice overview of metrics for binary classification data</a> that includes definitions of accuracy and such.

For the models you'll mostly just need to change 'regressor' to 'classifier', e.g. `XGBClassifier` instead of `XGBRegressor`.

Here is a basic outline of what to do:

* Write a function called `my_classifier_results` modeled after `my_regression_results` that applies a model to the test data and prints out the accuracy, sensitivity, precision, and the confusion matrix.  There is some code below to help you get started.  There is no need to make a plot.

* Start by training some baseline models using default values of the hyperparameters.  We've included logistic regression in a cell below to get you started.  Use `LogisticRegression`, `RandomForestClassifier`, and `GaussianNB` (Gaussian Naive Bayes) from `sklearn`.  Also use `XGBClassifier` from `xgboost` where you may need to include `objective="binary:logistic"` as an option. The default scoring method for all of the `sklearn` classifiers is accuracy. Apply `my_classifier_results` to the test data for each model.

* Now use the four hyperparameter optimization techniques on `XGBClassifier` and TPOT general model optimization.  Apply `my_classifier_results` to the test data in each case.
    * Feel free to use 3 folds instead of 5 for cross validation to speed things up. 
    * Choose a very small number of iterations, population size, etc. until you're sure things are working correctly, then turn up the numbers.  General TPOT optimization will take a while (fair warning: it took about 30 minutes on my Macbook Pro with generations = 10, population_size=40, and cv=5)  
    * The hyperparameters to consider are the same as they were in the presentation, but here they are again for convenience:

Hyperparameter | Type | Default Value | Typical Range
---- | ---- | ---- | ----
n_estimators | discrete / integer | 100 | 50 to 150
max_depth | discrete / integer | 3| 1 to 10
min_child_weight | discrete / integer | 1 | 1 to 20
learning_rate | continuous / float | 0.1 | 0.001 to 1
subsample | continuous / float | 1 | 0.05 to 1
reg_lambda | continuous / float | 1 | 0 to 5
reg_alpha  | continuous / float | 0 | 0 to 5

* Write a summary as described above, but also discuss the following:
    * If the bank is primarily interested in correctly identifying loans that are truly bad, then which model should they use?  Why?
* Use section headers to label your work.  Your summary / discussion should be more than simply "XYZ is the best model", but it also shouldn't be more than a few paragraphs and a table.

<font color = "blue"> *** 30 points -  add cells below as needed *** (don't delete this cell) </font>

## Loading the data

In [22]:
# Do not change this cell for loading and preparing the data
import pandas as pd
import numpy as np

# Read the loans subset as a DataFrame; X is rebound to a NumPy array below.
X = pd.read_csv('./data/loans_subset.csv')

# split into predictors and target
# convert to numpy arrays for xgboost, OK for other models too
# NOTE: y (and y_train / y_test below) rebind the names used for the Problem 1 diabetes targets.
y = np.array(X['status_Bad']) # 1 for bad loan, 0 for good loan
X = np.array(X.drop(columns = ['status_Bad']))

# split into test and training data
# (10% of the rows are held out for testing; random_state fixes the split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

## An example classifier

In [30]:
from sklearn.linear_model import LogisticRegression

# We need more than the default number of iterations for the solver to converge,
# and explicitly declaring the solver avoids a warning message; otherwise
# the parameters are defaults.
logreg_model = LogisticRegression(solver='lbfgs',max_iter=1000)

# Fit the baseline on the loans training split.
logreg_model.fit(X_train, y_train)

# Use score method to get accuracy of model
score = logreg_model.score(X_test, y_test) # this is accuracy
print(score)

0.5475


In [32]:
# obtaining the confusion matrix and making it look nice

from sklearn.metrics import confusion_matrix
import pandas as pd

# Predict class labels for the held-out loans.
y_pred = logreg_model.predict(X_test)

# must put true before predictions in confusion matrix function
# labels=[1,0] orders both rows and columns as bad (1) then good (0),
# matching the index/column names given below.
cmtx = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[1,0]), 
    index=['true:bad', 'true:good'], 
    columns=['pred:bad','pred:good']
)
display(cmtx)

Unnamed: 0,pred:bad,pred:good
true: bad,126,71
true:good,110,93
