# Task

Write find_best() function for hyperparameter randomized search with cross-validation across different models each having different sets of hyperparameters. Use 2 metrics. The code should output a dataframe containing scores for all configurations and should be sorted by sum of these two metrics. find_best should have a parameter number of iterations (n_iter for RandomizedSearchCV) for each model; if it has value -1 it will do GridSearchCV (test all configurations).

I will utilize the Bank Customer Churn dataset I used in my previous exercise (homework #3). 

## Data Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load and view the dataset 

bank_data = pd.read_csv('bank_customer_churn.csv')
bank_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Basic data preprocessing

# Convert categorical values to numerical 
ohe = OneHotEncoder()
ohe.fit(bank_data.loc[:, ['Geography', 'Gender']])
ohe.transform(bank_data.loc[:, ['Geography', 'Gender']]).toarray()
bank_data_enc = pd.DataFrame(ohe.transform(bank_data.loc[:, ['Geography', 'Gender']]).toarray(), columns=ohe.get_feature_names())

# Add dummy columns to dataset
bank_data_1 = pd.concat([bank_data, bank_data_enc], axis=1)

# Drop columns that are not required
bank_data_1.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Geography', 'Gender'], axis=1, inplace=True)

## Hyperparameter Optimization Function

In [11]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import precision_score, roc_auc_score

In [12]:
# Define find_best funtion to perform a randomized search on model hyperparameters

def find_best(model, parameters, n_iter):
    
    # Define cross validator
    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
    
    # Use two performance metrics
    scoring = {'Precision Score': 'precision', 'ROC AUC Score': 'roc_auc'}
    
    # Define dataframe for function output
    df = pd.DataFrame()
    
    for score in scoring:
        # Define search procedure
        search = RandomizedSearchCV(model, parameters, n_iter=n_iter, scoring=scoring[score], n_jobs=-1, cv=cv, random_state=1)

        # Execute search
        result = search.fit(X_train, y_train)
        
        # Generate scores and configurations and store in dataframe
        scores = [round(score,3) for score in result.cv_results_['mean_test_score']]
        configs = result.cv_results_['params']

        df['Configuration'] = configs
        df[score] = scores 
    
    # Create new df column to store sum of scores
    df['Sum'] = df['Precision Score'] + df['ROC AUC Score']
    
    # Sort dataframe by sum of scores
    sorted_df = df.sort_values(by='Sum', ascending=False)
    
    return sorted_df

## Implementation

I will call the find_best function on four different classification models for the given dataset. 

The search space for each model consists of a random selection of values for a few selected parameters. 

The output of the implementation for each model is a report showing mean Precision and ROC AUC Scores for each configuration / parameter setting, sorted by the sum of these scores. 

### Train Test Data Split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = bank_data_1.drop(columns=['Exited'], axis=1)
y = bank_data_1['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

### Model: Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Define model
model = RandomForestClassifier()

# Define parameters for model
parameters = dict()
parameters['n_estimators'] = [100, 200, 400, 600, 800]
parameters['criterion'] = ['gini', 'entropy']
parameters['max_depth'] = [10, 20, 30, 40, 50, 60]
parameters['min_samples_split'] = [2, 5, 10]
parameters['min_samples_leaf'] = [1, 2, 4]
parameters['max_features'] = ['auto', 'sqrt']
parameters['bootstrap'] = [True, False]

# Call find_best function over model
find_best(model, parameters, n_iter=10)

Unnamed: 0,Configuration,Precision Score,ROC AUC Score,Sum
2,"{'n_estimators': 200, 'min_samples_split': 2, ...",0.787,0.854,1.641
6,"{'n_estimators': 800, 'min_samples_split': 5, ...",0.783,0.854,1.637
0,"{'n_estimators': 200, 'min_samples_split': 10,...",0.781,0.853,1.634
1,"{'n_estimators': 100, 'min_samples_split': 10,...",0.771,0.848,1.619
4,"{'n_estimators': 100, 'min_samples_split': 2, ...",0.767,0.85,1.617
5,"{'n_estimators': 800, 'min_samples_split': 5, ...",0.766,0.849,1.615
3,"{'n_estimators': 100, 'min_samples_split': 5, ...",0.761,0.848,1.609
8,"{'n_estimators': 100, 'min_samples_split': 10,...",0.761,0.848,1.609
7,"{'n_estimators': 400, 'min_samples_split': 2, ...",0.757,0.849,1.606
9,"{'n_estimators': 600, 'min_samples_split': 2, ...",0.743,0.844,1.587


### Model:  K-Neighbors Classifier 

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# Define model
model = KNeighborsClassifier()

# Define parameters for model
parameters = dict()
parameters['n_neighbors'] = [3, 5, 8]
parameters['weights'] = ['uniform', 'distance']
parameters['algorithm'] = ['auto', 'ball_tree', 'kd_tree']

# Call find_best function over model
find_best(model, parameters, n_iter=10)

Unnamed: 0,Configuration,Precision Score,ROC AUC Score,Sum
3,"{'weights': 'uniform', 'n_neighbors': 5, 'algo...",0.272,0.54,0.812
4,"{'weights': 'uniform', 'n_neighbors': 5, 'algo...",0.272,0.54,0.812
0,"{'weights': 'uniform', 'n_neighbors': 3, 'algo...",0.26,0.537,0.797
1,"{'weights': 'distance', 'n_neighbors': 5, 'alg...",0.253,0.54,0.793
6,"{'weights': 'distance', 'n_neighbors': 5, 'alg...",0.253,0.54,0.793
7,"{'weights': 'uniform', 'n_neighbors': 8, 'algo...",0.248,0.544,0.792
9,"{'weights': 'uniform', 'n_neighbors': 8, 'algo...",0.248,0.544,0.792
2,"{'weights': 'distance', 'n_neighbors': 3, 'alg...",0.248,0.535,0.783
5,"{'weights': 'distance', 'n_neighbors': 3, 'alg...",0.248,0.535,0.783
8,"{'weights': 'distance', 'n_neighbors': 3, 'alg...",0.248,0.535,0.783


### Model: Gradient Boosting Classifier

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

# Define model
model = GradientBoostingClassifier()

# Define parameters for model
parameters = dict()
parameters['learning_rate'] = [0.1, 0.2, 0.4]
parameters['n_estimators'] = [100, 200, 300]
parameters['subsample'] = [1.0, 1.5]
parameters['criterion'] = ['friedman_mse', 'mse']
parameters['min_samples_leaf'] = [1, 2, 3]

# Call find_best function over model
find_best(model, parameters, n_iter=10)

Unnamed: 0,Configuration,Precision Score,ROC AUC Score,Sum
7,"{'subsample': 1.0, 'n_estimators': 100, 'min_s...",0.765,0.859,1.624
9,"{'subsample': 1.0, 'n_estimators': 100, 'min_s...",0.763,0.859,1.622
6,"{'subsample': 1.0, 'n_estimators': 300, 'min_s...",0.713,0.845,1.558
8,"{'subsample': 1.0, 'n_estimators': 100, 'min_s...",0.708,0.843,1.551
1,"{'subsample': 1.0, 'n_estimators': 100, 'min_s...",0.7,0.847,1.547
2,"{'subsample': 1.0, 'n_estimators': 200, 'min_s...",0.671,0.836,1.507
0,"{'subsample': 1.0, 'n_estimators': 300, 'min_s...",0.666,0.831,1.497
3,"{'subsample': 1.5, 'n_estimators': 300, 'min_s...",,,
4,"{'subsample': 1.5, 'n_estimators': 300, 'min_s...",,,
5,"{'subsample': 1.5, 'n_estimators': 100, 'min_s...",,,


### Model: Gaussian Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB

# Define model
model = GaussianNB()

# Define parameters for model
parameters = dict()
parameters['var_smoothing'] = np.logspace(0,-9, num=100)

# Call find_best function over model
find_best(model, parameters, n_iter=10)

Unnamed: 0,Configuration,Precision Score,ROC AUC Score,Sum
4,{'var_smoothing': 3.5111917342151273e-09},0.312,0.731,1.043
0,{'var_smoothing': 5.336699231206302e-08},0.308,0.692,1.0
3,{'var_smoothing': 4.3287612810830526e-08},0.207,0.697,0.904
1,{'var_smoothing': 2.310129700083158e-08},0.14,0.712,0.852
7,{'var_smoothing': 3.5111917342151277e-08},0.136,0.702,0.838
8,{'var_smoothing': 5.336699231206313e-07},0.0,0.622,0.622
9,{'var_smoothing': 1.232846739442066e-06},0.0,0.605,0.605
6,{'var_smoothing': 0.0005336699231206307},0.0,0.585,0.585
2,{'var_smoothing': 0.001},0.0,0.584,0.584
5,{'var_smoothing': 0.02848035868435802},0.0,0.584,0.584
