In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw credit-card data set.
df = pd.read_csv('credit-card-full.csv')

# One-hot encode the three categorical columns; the prefix keeps each new
# column traceable to its source (SEX_*, EDUCATION_*, MARRIAGE_*).
sex = pd.get_dummies(df['SEX'], prefix='SEX')
education = pd.get_dummies(df['EDUCATION'], prefix='EDUCATION')
marriage = pd.get_dummies(df['MARRIAGE'], prefix='MARRIAGE')
df2 = pd.concat([df, sex, education, marriage], axis=1)

# Work on a 3,200-row subsample. random_state pins the draw so that a
# Restart & Run All reproduces the same rows (the original draw was unseeded).
df2 = df2.sample(3200, random_state=42)

# Feature matrix: numeric columns plus a subset of the dummy columns.
X = df2[['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5',
       'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'SEX_2', 'EDUCATION_1',
       'EDUCATION_2', 'EDUCATION_3', 'EDUCATION_4', 'EDUCATION_5',
       'EDUCATION_6', 'MARRIAGE_1', 'MARRIAGE_2', 'MARRIAGE_3']]

# Target column.
y = df2['default payment next month']
df2.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
17751,17752,50000,2,3,1,61,-1,-1,-2,-2,...,0,0,1,0,0,0,0,1,0,0
3680,3681,30000,1,1,2,36,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
25268,25269,120000,2,2,1,25,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
19592,19593,170000,2,2,1,45,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
16133,16134,60000,2,2,1,41,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the sampled rows for evaluation. random_state fixes the
# shuffle so every later score is computed on the same split across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

## Hyperparameters and Parameters

### Extracting a Logistic Regression parameter

In [None]:
from sklearn.linear_model import LogisticRegression

# Configure the estimator, then train it on the training split.
log_reg_clf = LogisticRegression(solver='liblinear')
log_reg_clf = log_reg_clf.fit(X_train, y_train)

In [None]:
# Column names of the training frame, in the order the model was fitted on
original_variables = X_train.columns

# First row of coef_ holds the fitted coefficients used below
model_coefficients = log_reg_clf.coef_[0]

# Pair every variable with its coefficient and show the full table
coefficient_df = pd.DataFrame({
    "Variable": original_variables,
    "Coefficient": model_coefficients,
})
display(coefficient_df)

# The three rows with the largest (most positive) coefficients
ranked_df = coefficient_df.sort_values("Coefficient", ascending=False)
top_three_df = ranked_df.iloc[:3]
display(top_three_df)

### Extracting a Random Forest parameter

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small, shallow forest (10 trees, depth 4) used to inspect a single tree below.
# Changes from the pasted repr:
#   * min_impurity_split is dropped - it was only ever passed as None and the
#     argument no longer exists in current scikit-learn releases.
#   * max_features='auto' is written as 'sqrt', which is what 'auto' meant for
#     classifiers; the 'auto' alias has been removed from scikit-learn.
rf_clf = RandomForestClassifier(
    bootstrap=True, class_weight=None, criterion='gini',
    max_depth=4, max_features='sqrt', max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
    oob_score=False, random_state=None, verbose=0,
    warm_start=False,
).fit(X_train, y_train)

In [None]:
# Pull the 7th fitted tree (index 6) out of the forest
chosen_tree = rf_clf.estimators_[6]

# (disabled) show a pre-rendered image of the tree
# imgplot = plt.imshow(tree_viz)
# plt.show()

# Read the top node (position 0 of the tree arrays): the feature index it
# splits on, the matching column name, and the threshold of the split
fitted_tree = chosen_tree.tree_
split_column = fitted_tree.feature[0]
split_value = fitted_tree.threshold[0]
split_column_name = X_train.columns[split_column]

# Report the feature and level
print(f"This node split on feature {split_column_name}, at a value of {split_value}")

In [None]:
from sklearn.tree import export_graphviz

# Write the chosen tree to a Graphviz .dot file.
# class_names must be a list with one label per class (ascending class order).
# The original passed the target column name as a single string, so the labels
# were read character by character ('d' and 'e'). The two labels below assume
# the target is coded 0 = no default, 1 = default - TODO confirm the coding.
# feature_names is passed as a plain list of column names.
export_graphviz(chosen_tree,
                out_file='tree.dot',
                feature_names=list(X_train.columns),
                class_names=['no default', 'default'],
                rounded=True, proportion=False,
                precision=2, filled=True)

In [None]:
# Convert to png

# !dot -Tpng tree.dot -o tree.png -Gdpi=600

In [None]:
# from IPython.display import Image
# Image(filename = 'tree.png')

### Exploring Random Forest Hyperparameters

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

# Baseline forest with a deliberately poor setting (only 5 trees).
# Changes from the pasted repr:
#   * min_impurity_split is dropped - it was only ever passed as None and the
#     argument no longer exists in current scikit-learn releases.
#   * max_features='auto' is written as 'sqrt', which is what 'auto' meant for
#     classifiers; the 'auto' alias has been removed from scikit-learn.
rf_clf_old = RandomForestClassifier(
    bootstrap=True, class_weight=None, criterion='gini',
    max_depth=None, max_features='sqrt', max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
    oob_score=False, random_state=42, verbose=0, warm_start=False,
).fit(X_train, y_train)
rf_old_predictions = rf_clf_old.predict(X_test)

# Print out the old estimator, notice which hyperparameter is badly set
print(rf_clf_old)

# Get confusion matrix & accuracy for the old rf_model
print("Confustion Matrix: \n {} \n\n Accuracy Score: \n {}".format(confusion_matrix(y_test, rf_old_predictions),  accuracy_score(y_test, rf_old_predictions)))

In [None]:
# A second forest, this time with 500 trees
rf_clf_new = RandomForestClassifier(n_estimators=500)

# Train on the training split, then predict the held-out rows
rf_clf_new.fit(X_train, y_train)
rf_new_predictions = rf_clf_new.predict(X_test)

# Score the new model the same way as the old one
new_confusion = confusion_matrix(y_test, rf_new_predictions)
new_accuracy = accuracy_score(y_test, rf_new_predictions)
print("Confustion Matrix: \n", new_confusion)
print("Accuracy Score: \n", new_accuracy)

### Hyperparameters of KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# One estimator per candidate neighbourhood size
knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_20 = KNeighborsClassifier(n_neighbors=20)

# Train every estimator on the training split ...
knn_5.fit(X_train, y_train)
knn_10.fit(X_train, y_train)
knn_20.fit(X_train, y_train)

# ... and predict the held-out rows
knn_5_predictions = knn_5.predict(X_test)
knn_10_predictions = knn_10.predict(X_test)
knn_20_predictions = knn_20.predict(X_test)

# Accuracy of each model on the held-out rows
knn_5_accuracy = accuracy_score(y_test, knn_5_predictions)
knn_10_accuracy = accuracy_score(y_test, knn_10_predictions)
knn_20_accuracy = accuracy_score(y_test, knn_20_predictions)
print(f"The accuracy of 5, 10, 20 neighbours was {knn_5_accuracy}, {knn_10_accuracy}, {knn_20_accuracy}")

### Automating Hyperparameter Choice

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Candidate learning rates and a list collecting one [rate, accuracy] row each
learning_rates = [0.001, 0.01, 0.05, 0.1, 0.2, 0.5]
results_list = []

# Fit one gradient-boosting model per learning rate and score it on the test split
for learning_rate in learning_rates:
    model = GradientBoostingClassifier(learning_rate=learning_rate)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    results_list.append([learning_rate, test_accuracy])

# Tabulate the rows
results_df = pd.DataFrame(data=results_list, columns=['learning_rate', 'accuracy'])
print(results_df)

### Building Learning Curves

In [None]:
# 30 evenly spaced learning rates between 0.01 and 2, and their accuracies
learn_rates = np.linspace(0.01, 2, num=30)
accuracies = []

# One model per learning rate, scored on the held-out rows
for learn_rate in learn_rates:
    model = GradientBoostingClassifier(learning_rate=learn_rate)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracies.append(accuracy_score(y_test, predictions))

# Accuracy as a function of learning rate
fig, ax = plt.subplots()
ax.plot(learn_rates, accuracies)
ax.set(xlabel='learning_rate', ylabel='Accuracy', title='Accuracy for different learning_rates')
plt.show()

## Grid search

### Build Grid Search functions

In [None]:
# Create the function
def gbm_grid_search(learn_rate, max_depth):
    """Fit one gradient-boosting model and score it on the test split.

    Reads the notebook globals X_train, y_train, X_test and y_test.

    Returns:
        list: [learn_rate, max_depth, test-set accuracy].
    """
    gbm = GradientBoostingClassifier(max_depth=max_depth, learning_rate=learn_rate)
    gbm.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, gbm.predict(X_test))
    return [learn_rate, max_depth, test_accuracy]

### Iteratively tune multiple hyperparameters

In [None]:
# Values to try for each hyperparameter
learn_rate_list = [0.01, 0.1, 0.5]
max_depth_list =[2, 4, 6]

# Evaluate every (learn_rate, max_depth) pair, learn_rate varying slowest
results_list = [
    gbm_grid_search(learn_rate, max_depth)
    for learn_rate in learn_rate_list
    for max_depth in max_depth_list
]

# Show the collected rows
print(results_list)

In [None]:
# Reset the result store and the two hyperparameter lists
results_list = []
learn_rate_list = [0.01, 0.1, 0.5]
max_depth_list = [2,4,6]


def gbm_grid_search_extended(learn_rate, max_depth, subsample):
    """Fit one gradient-boosting model with a subsample setting and score it.

    Reads the notebook globals X_train, y_train, X_test and y_test.

    Returns:
        list: [learn_rate, max_depth, subsample, test-set accuracy].
    """
    gbm = GradientBoostingClassifier(
        learning_rate=learn_rate,
        max_depth=max_depth,
        subsample=subsample,
    )
    gbm.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, gbm.predict(X_test))
    return [learn_rate, max_depth, subsample, test_accuracy]

In [None]:
# Third hyperparameter to sweep
subsample_list = [0.4 , 0.6]

# Evaluate every (learn_rate, max_depth, subsample) triple; subsample varies fastest
results_list = [
    gbm_grid_search_extended(learn_rate, max_depth, subsample)
    for learn_rate in learn_rate_list
    for max_depth in max_depth_list
    for subsample in subsample_list
]

# Show the collected rows
print(results_list)

### GridSearchCV with Scikit Learn

In [None]:
from sklearn.model_selection import GridSearchCV

# Random forest whose split criterion is fixed; the grid tunes the rest
rf_class = RandomForestClassifier(criterion='entropy')

# Hyperparameter grid. 'auto' was only an alias of 'sqrt' for classifiers (and
# has been removed from scikit-learn), so the original ['auto', 'sqrt'] searched
# the same setting twice; 'log2' makes max_features a real dimension.
param_grid = {
    'max_depth': [2, 4, 8, 15],
    'max_features': ['sqrt', 'log2'],
    'n_estimators': [1, 5, 10, 15],
}

# 5-fold grid search scored by ROC-AUC; refit=True retrains the best setting
grid_rf_class = GridSearchCV(
    estimator=rf_class,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    refit=True, return_train_score=True)

# Fit on the training split only: the refitted best estimator is scored on
# X_test further down, and X_test is a subset of X, so fitting on X would leak
# the evaluation rows into training.
grid_rf_class.fit(X_train, y_train)

### Exploring the grid search results

In [None]:
# Turn the cv_results_ dictionary into a table and show it
cv_results_df = pd.DataFrame(data=grid_rf_class.cv_results_)
display(cv_results_df)

# The single column holding each run's hyperparameter dictionary
column = cv_results_df[["params"]]
display(column)

# Row(s) ranked first on mean test score
is_top_ranked = cv_results_df["rank_test_score"] == 1
best_row = cv_results_df.loc[is_top_ranked]
display(best_row)

### Analyzing the best results

In [None]:
# Best mean cross-validated ROC-AUC found by the grid search
best_score = grid_rf_class.best_score_
display(best_score)

# Same best row as before, this time located through best_index_
best_position = grid_rf_class.best_index_
best_row = cv_results_df.loc[[best_position]]
display(best_row)

# n_estimators of the winning hyperparameter combination
best_params = grid_rf_class.best_params_
best_n_estimators = best_params["n_estimators"]
best_n_estimators

### Using the best results

In [None]:
# The refitted winner of the grid search
best_rf = grid_rf_class.best_estimator_

# See what type of object the best_estimator_ property is
print(type(best_rf))

# Create an array of class predictions directly from the best estimator
predictions = best_rf.predict(X_test)

# Take a look to confirm it worked, this should be an array of 1's and 0's
print(predictions[0:5])

# Now create a confusion matrix (needs hard class labels)
print("Confustion Matrix \n", confusion_matrix(y_test, predictions))

# ROC-AUC measures how well the model ranks rows, so it needs a continuous
# score. Feeding it the hard 0/1 labels (as the original did) collapses the
# curve to a single operating point; use the positive-class probability.
predictions_proba = best_rf.predict_proba(X_test)[:, 1]
print("ROC-AUC Score \n", roc_auc_score(y_test, predictions_proba))

## Random Search

### Randomly Search with Random Forest

In [None]:
import random
from itertools import product

# Candidate values for each of the three hyperparameters
criterion_list = ["gini", "entropy"]
max_feature_list = ["auto", "sqrt", "log2", None]
max_depth_list = list(range(3,56))

# Every (criterion, max_features, max_depth) triple, each stored as a list
combinations_list = list(map(list, product(criterion_list, max_feature_list, max_depth_list)))

# Draw 10 distinct combinations for a random search
combinations_random_chosen = random.sample(combinations_list, k=10)

# Show what was drawn
print(combinations_random_chosen)

### Randomly Sample Hyperparameters

In [None]:
from itertools import product

# Create a list of values for the learning_rate hyperparameter
learn_rate_list = np.linspace(0.01,1.5,200)

# Create a list of values for the min_samples_leaf hyperparameter
min_samples_list = list(range(10,41))

# Every (learning_rate, min_samples_leaf) pair, each stored as a list
combinations_list = [list(x) for x in product(learn_rate_list, min_samples_list)]

# Sample 10 distinct positions for a random search.
# Valid positions are 0 .. len-1. The original drew from 1 .. len, which could
# never pick the first combination and raised IndexError whenever it drew len.
random_combinations_index = np.random.choice(len(combinations_list), 10, replace=False)
combinations_random_chosen = [combinations_list[x] for x in random_combinations_index]

# Print the result
print(combinations_random_chosen)

### Visualizing a Random Search

In [None]:
def sample_hyperparameters(n_samples):
    """Store n_samples randomly chosen combinations in a notebook global.

    Reads the global combinations_list and rebinds the global
    combinations_random_chosen; nothing is returned.

    Args:
        n_samples: how many distinct combinations to draw. When it equals
            len(combinations_list) the full list is used as-is.
    """
    global combinations_random_chosen

    if n_samples == len(combinations_list):
        combinations_random_chosen = combinations_list
        return

    # Draw from every valid position 0 .. len-1; the original started at 1 and
    # therefore could never select the first combination.
    random_combinations_index = np.random.choice(len(combinations_list), n_samples, replace=False)
    combinations_random_chosen = [combinations_list[x] for x in random_combinations_index]
    return

In [None]:
def visualize_search():
    """Scatter-plot the combinations in the global combinations_random_chosen.

    Each combination is read as [learn_rate, min_samples_leaf]; learn_rate goes
    on the x axis and min_samples_leaf on the y axis.
    """
    learn_rates = [combo[0] for combo in combinations_random_chosen]
    min_leaf_sizes = [combo[1] for combo in combinations_random_chosen]

    # Axis limits follow the ranges the combinations in this section are built
    # from (learning rate 0.01..1.5, min_samples_leaf 10..40). The original
    # upper y limit of 29 cut off every point with min_samples_leaf above 29.
    x_lims, y_lims = [0.01, 1.5], [10, 40]

    # Plot all together
    plt.clf()
    plt.scatter(learn_rates, min_leaf_sizes, c=['blue']*len(combinations_random_chosen))
    plt.gca().set(xlabel='learn_rate', ylabel='min_samples_leaf', title='Random Search Hyperparameters')
    plt.gca().set_xlim(x_lims)
    plt.gca().set_ylim(y_lims)
    plt.show()

In [None]:
# How many hyperparameter combinations exist in total
number_combs = len(combinations_list)
print(number_combs)

# Draw progressively larger samples and plot each one
sample_sizes = [50, 500, 1500]
for x in sample_sizes:
    sample_hyperparameters(x)
    visualize_search()

# Finally use every combination and plot that too
sample_hyperparameters(number_combs)
visualize_search()

### The RandomizedSearchCV Object

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Values the random search may draw from for each hyperparameter
param_grid = {
    'learning_rate': np.linspace(0.1, 2, 150),
    'min_samples_leaf': list(range(20, 65)),
}

# 10 random draws, 5-fold cross-validation, scored by accuracy
random_GBM_class = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(),
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
    refit=True,
    return_train_score=True,
)

# Run the search on the training split
random_GBM_class.fit(X_train, y_train)

# Show which values were actually tried for both hyperparameters
gbm_cv_results = random_GBM_class.cv_results_
print(gbm_cv_results['param_learning_rate'])
print(gbm_cv_results['param_min_samples_leaf'])

### RandomizedSearchCV in Scikit Learn

In [None]:
# Create the parameter grid.
# 'auto' was only an alias of 'sqrt' for classifiers (and has been removed from
# scikit-learn), so the original ['auto', 'sqrt'] offered one setting twice;
# 'log2' gives the search a genuinely different second option.
param_grid = {'max_depth': list(range(5,26)), 'max_features': ['sqrt', 'log2']}

# Create a random search object: 5 random draws, 3-fold CV, scored by ROC-AUC
random_rf_class = RandomizedSearchCV(
    estimator = RandomForestClassifier(n_estimators=80),
    param_distributions = param_grid, n_iter = 5,
    scoring='roc_auc', n_jobs=4, cv = 3, refit=True, return_train_score = True)

# Fit to the training data
random_rf_class.fit(X_train, y_train)

# Print the values used for both hyperparameters
print(random_rf_class.cv_results_['param_max_depth'])
print(random_rf_class.cv_results_['param_max_features'])

### Grid and Random Search Side by Side

In [None]:
# Value ranges for the two hyperparameters compared in this section
min_samples_leaf = list(range(5,24))
learn_rate = np.linspace(0.01, 3, 200)

# Every (learn_rate, min_samples_leaf) pair, each stored as a list
combinations_list = list(map(list, product(learn_rate, min_samples_leaf)))
len(combinations_list)

In [None]:
def visualize_search(grid_combinations_chosen, random_combinations_chosen):
    """Plot grid-search and random-search combinations on one scatter chart.

    Args:
        grid_combinations_chosen: sequence of [learn_rate, min_samples_leaf]
            pairs picked by grid search (drawn in red).
        random_combinations_chosen: sequence of [learn_rate, min_samples_leaf]
            pairs picked by random search (drawn in blue).
    """
    x_lims, y_lims = [0.01, 3.0], [5, 24]
    grid_learn_rates = [combo[0] for combo in grid_combinations_chosen]
    grid_leaf_sizes = [combo[1] for combo in grid_combinations_chosen]
    rand_learn_rates = [combo[0] for combo in random_combinations_chosen]
    rand_leaf_sizes = [combo[1] for combo in random_combinations_chosen]

    # One colour per point, sized from the inputs. The original hard-coded
    # 300 + 300 colours, which only worked when both inputs had exactly 300 rows.
    colours = ['red'] * len(grid_learn_rates) + ['blue'] * len(rand_learn_rates)

    # Plot all together
    plt.scatter(grid_learn_rates + rand_learn_rates, grid_leaf_sizes + rand_leaf_sizes, c=colours)
    plt.gca().set(xlabel='learn_rate', ylabel='min_samples_leaf', title='Grid and Random Search Hyperparameters')
    plt.gca().set_xlim(x_lims)
    plt.gca().set_ylim(y_lims)
    plt.show()

In [None]:
# Sample grid coordinates: the first 300 combinations in list order
grid_combinations_chosen = combinations_list[0:300]

# Randomly pick 300 distinct positions. Valid positions are 0 .. len-1; the
# original started the range at 1, so the first combination could never be drawn.
random_combinations_index = np.random.choice(len(combinations_list), 300, replace=False)

# Use indexes to select a random sample
random_combinations_chosen = [combinations_list[x] for x in random_combinations_index]

# Call the function to produce the visualization
visualize_search(grid_combinations_chosen, random_combinations_chosen)

## Informed Search


### Visualizing Coarse to Fine

In [None]:
def gb_grid_search(learn_rate, max_depth, min_samples_leaf):
    """Fit one gradient-boosting model and score it on the test split.

    Reads the notebook globals X_train, y_train, X_test and y_test.

    Returns:
        list: [learn_rate, max_depth, min_samples_leaf, test-set accuracy].
    """
    gbm = GradientBoostingClassifier(
        learning_rate=learn_rate,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
    )
    gbm.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, gbm.predict(X_test))
    return [learn_rate, max_depth, min_samples_leaf, test_accuracy]

In [None]:
# Coarse value ranges for the first pass of the search
max_depth_list = list(range(1,60,10))
min_samples_leaf_list = list(range(2, 19, 5))
learn_rate_list = np.linspace(0.01, 2, 4)

# Evaluate every triple; learn_rate varies slowest, min_samples_leaf fastest
results_list = [
    gb_grid_search(learn_rate, max_depth, min_samples_leaf)
    for learn_rate in learn_rate_list
    for max_depth in max_depth_list
    for min_samples_leaf in min_samples_leaf_list
]

In [None]:
# Tabulate the search results; column order matches the rows that
# gb_grid_search() returns. (Fixes the misspelt 'min_samples_lea' column name.)
results_df = pd.DataFrame(results_list, columns=['learning_rate', 'max_depth', 'min_samples_leaf', 'accuracy'])

In [None]:
def visualize_hyperparameter(name):
    """Scatter-plot accuracy against one hyperparameter column of results_df.

    Args:
        name: name of a column in the global results_df to put on the x axis.
    """
    plt.clf()
    # A single colour name applies to every point. The original passed a list of
    # exactly 12 colours, which scatter() rejects unless results_df has 12 rows.
    plt.scatter(results_df[name], results_df['accuracy'], c='blue')
    plt.gca().set(xlabel='{}'.format(name), ylabel='accuracy', title='Accuracy for different {}s'.format(name))
    plt.gca().set_ylim([0,1])
    plt.show()

In [None]:
# Size of the combinations_list currently in memory
n_combinations = len(combinations_list)
print(n_combinations)

# Ten best rows of results_df, highest accuracy first
ranked_results = results_df.sort_values('accuracy', ascending=False)
display(ranked_results.head(10))

# Column names, i.e. which hyperparameters this search recorded
print(results_df.columns)

In [None]:
# Call visualize_hyperparameter() with each hyperparameter in turn
# Only the max_depth plot is active; the two calls below are disabled.
visualize_hyperparameter('max_depth')
# NOTE(review): the argument must be an existing results_df column. The
# learning-rate column is named 'learning_rate', not 'learn_rate' - check both
# names against results_df.columns before enabling these calls.
# visualize_hyperparameter('min_samples_leaf')
# visualize_hyperparameter('learn_rate')

### Coarse to Fine Iterations

In [None]:
def visualize_first():
    """Plot accuracy against each of the first two columns of results_df.

    A red vertical line is drawn on every plot: at x = 1 on the learning-rate
    plot and at x = 20 on the other one.
    """
    for name in results_df.columns[0:2]:
        plt.clf()
        # A single colour name applies to every point; the original 12-element
        # colour list is rejected unless results_df has exactly 12 rows.
        plt.scatter(results_df[name], results_df['accuracy'], c='blue')
        plt.gca().set(xlabel='{}'.format(name), ylabel='accuracy', title='Accuracy for different {}s'.format(name))
        plt.gca().set_ylim([0,1])
        # results_df names the column 'learning_rate', so the original test
        # against "learn_rate" never matched and the line stayed at 20 on both
        # plots. Both spellings are accepted.
        x_line = 1 if name in ("learning_rate", "learn_rate") else 20
        plt.axvline(x=x_line, color="red", linewidth=4)
        plt.show()

In [None]:
# Use the provided function to visualize the first results
# (one accuracy scatter per column in results_df.columns[0:2])
visualize_first()

In [None]:
def visualize_second():
    """Plot accuracy against each of the first two columns of results_df2.

    Reads the global results_df2, which must exist and contain an 'accuracy'
    column.
    """
    for name in results_df2.columns[0:2]:
        plt.clf()
        # Plot the second-iteration frame itself. The original looped over
        # results_df2's columns but drew results_df's data, and its fixed
        # 12-element colour list only fits a 12-row frame.
        plt.scatter(results_df2[name], results_df2['accuracy'], c='blue')
        plt.gca().set(xlabel='{}'.format(name), ylabel='accuracy', title='Accuracy for different {}s'.format(name))
        plt.gca().set_ylim([0,1])
        plt.show()

In [None]:
# Create some combinations lists & combine:
# NOTE(review): these two lists are only assigned here; nothing in this cell
# combines them or runs a search with them.
max_depth_list = list(range(1,21))
learn_rate_list = np.linspace(0.001,1,50)

# Call the function to visualize the second results
# NOTE(review): visualize_second() reads a global results_df2, which is not
# created in any cell visible here - confirm it is defined before running.
visualize_second()

### Bayes Rule in Python

In [None]:
# Known quantities:
#   p_unhappy        P(unhappy)
#   p_unhappy_close  P(unhappy | close)
#   p_close          P(close)
p_unhappy = 0.15
p_unhappy_close = 0.35
p_close = 0.07

# Bayes' rule: P(close | unhappy) = P(unhappy | close) * P(close) / P(unhappy)
joint_unhappy_and_close = p_unhappy_close * p_close
p_close_unhappy = joint_unhappy_and_close / p_unhappy
print(p_close_unhappy)

### Bayesian Hyperparameter tuning with Hyperopt

In [None]:
# These names are used below but were never imported anywhere in the notebook,
# so the cell raised NameError on a fresh kernel.
from hyperopt import fmin, hp, tpe
from sklearn.model_selection import cross_val_score

# Set up space dictionary with specified hyperparameters
space = {'max_depth': hp.quniform('max_depth', 2, 10, 2),'learning_rate': hp.uniform('learning_rate', 0.001, 0.9)}

# Set up objective function
def objective(params):
    """Return the loss (1 - mean 2-fold CV accuracy) for one parameter draw.

    Args:
        params: dict with 'max_depth' and 'learning_rate' entries; max_depth is
            cast to int before it is handed to the classifier.
    """
    params = {'max_depth': int(params['max_depth']),'learning_rate': params['learning_rate']}
    gbm_clf = GradientBoostingClassifier(n_estimators=100, **params)
    best_score = cross_val_score(gbm_clf, X_train, y_train, scoring='accuracy', cv=2, n_jobs=4).mean()
    # fmin minimises, so turn the accuracy into a loss
    loss = 1 - best_score
    return loss

# Run the algorithm
# NOTE(review): some hyperopt releases expect rstate to be a
# numpy.random.Generator (np.random.default_rng(42)) instead of a RandomState -
# confirm against the installed version.
best = fmin(fn=objective,space=space, max_evals=20, rstate=np.random.RandomState(42), algo=tpe.suggest)
print(best)

### Genetic Hyperparameter Tuning with TPOT

In [None]:
# TPOTClassifier is used below but was never imported anywhere in the
# notebook, so the cell raised NameError on a fresh kernel.
from tpot import TPOTClassifier

# Assign the values outlined to the inputs
number_generations = 3
population_size = 4
offspring_size = 3
scoring_function = 'accuracy'

# Create the tpot classifier
tpot_clf = TPOTClassifier(generations=number_generations, population_size=population_size,
                          offspring_size=offspring_size, scoring=scoring_function,
                          verbosity=2, random_state=2, cv=2)

# Fit the classifier to the training data
tpot_clf.fit(X_train, y_train)

# Score on the test set
print(tpot_clf.score(X_test, y_test))

### Analysing TPOT's stability

In [None]:
# TPOT run seeded with random_state=42
tpot_clf = TPOTClassifier(
    generations=2,
    population_size=4,
    offspring_size=3,
    scoring='accuracy',
    cv=2,
    verbosity=2,
    random_state=42,
)

# Train on the training split
tpot_clf.fit(X_train, y_train)

# Report the score on the held-out rows
test_score = tpot_clf.score(X_test, y_test)
print(test_score)

In [None]:
# TPOT run seeded with random_state=122
tpot_clf = TPOTClassifier(
    generations=2,
    population_size=4,
    offspring_size=3,
    scoring='accuracy',
    cv=2,
    verbosity=2,
    random_state=122,
)

# Train on the training split
tpot_clf.fit(X_train, y_train)

# Report the score on the held-out rows
test_score = tpot_clf.score(X_test, y_test)
print(test_score)

In [None]:
# TPOT run seeded with random_state=99
tpot_clf = TPOTClassifier(
    generations=2,
    population_size=4,
    offspring_size=3,
    scoring='accuracy',
    cv=2,
    verbosity=2,
    random_state=99,
)

# Train on the training split
tpot_clf.fit(X_train, y_train)

# Report the score on the held-out rows
test_score = tpot_clf.score(X_test, y_test)
print(test_score)