# Parameter-optimize the ML methods from pre-selection
Using Python 3.7 and fast.ai v2

Author: Patrick Ruoff

Date: January 2020

In [1]:
import pandas as pd
import numpy as np

import data_preparation_methods as prep
import analysis_methods as ana
import scikitlearn_methods as sci
# Fix the seed once and build the notebook-wide random state directly from it,
# so that every stochastic step that receives `rs` is reproducible.
seed = 42
rs = np.random.RandomState(seed)

There are 3 occupants in the data set that I collected: 'U1', 'U2', and 'U3'. This notebook
is implemented to run the model selection for one single occupant at a time.

In [2]:
# Notebook configuration: which occupant to optimize for, and how many
# cross-validation folds to build.
occupant, n_folds = 'U3', 5

In [3]:
# Load the data set of the selected occupant through the project helper and
# print its (rows, columns) shape as a quick sanity check.
data = prep.import_data(occupant)
print(data.shape)

(20606, 153)


In [6]:
# Label column of the selected occupant.
target = f'{occupant}_is_heating'

# 'o_condition' is treated as the only categorical column; all remaining
# columns are collected as continuous ones.
# NOTE(review): cont_names still includes the target column, and it does not
# appear to be used by the later cells visible here — confirm it is needed.
cat_names = ['o_condition']
cont_names = data.columns.drop(cat_names).to_numpy()

# Relative frequency of each target value over all rows of `data`.
print(
    'dependency variable distribution:\n',
    data[target].value_counts() / len(data),
)

dependency variable distribution:
 0.0    0.643897
1.0    0.356103
Name: U3_is_heating, dtype: float64


In [7]:
# Split the raw frame into features (X_df) and labels (Y_df); the helper also
# returns naive baseline scores and the label value treated as positive.
naive_acc, naive_f1, pos_label, Y_df, X_df = prep.prepare_data(data, target)

The target variable distribution is:
 0.0    0.643897
1.0    0.356103
Name: U3_is_heating, dtype: float64
X shape: (20598, 166)


For this project I designed a novel kind of interval-stratified k-fold cross-validation. It separates 
the data into k groups by data-collection intervals. Additionally, the resulting groups are
of approximately equal size and have a nearly equal target variable distribution. Check the implementation 
for details. 

In [8]:
# Build the train/validation index pairs for the interval-stratified k-fold CV.
# NOTE(review): the import cell reports 20606 rows while the fold sizes printed
# here add up to 20598 (the X shape after prepare_data) — presumably
# prepare_data drops rows from `data` in place; confirm.
train_test_indices = prep.interval_stratified_k_fold_cross_validation(
    data,
    n_folds,
    target,
    rs,
    seed,
)

Training and validation set sizes are:
 Fold 0:   train/valid  16491/4107   with 1465 positive labels
 Fold 1:   train/valid  16467/4131   with 1468 positive labels
 Fold 2:   train/valid  16496/4102   with 1469 positive labels
 Fold 3:   train/valid  16499/4099   with 1465 positive labels
 Fold 4:   train/valid  16439/4159   with 1468 positive labels


In [9]:
# Bundle the inputs that are handed to the random-search helper further down:
# features and labels (as data frames and as plain arrays), the positive label,
# and the CV splits both in their original form and converted to integer
# positions.
training_attributes = dict(
    occupant=occupant,
    pos_label=pos_label,
    X_df=X_df,
    X_array=np.array(X_df),
    Y_df=Y_df,
    Y_array=np.array(Y_df),
    train_test_indices=train_test_indices,
    train_test_indices_int=prep.transform_date_time_index_to_int(
        data, train_test_indices
    ),
)

In [None]:
# Accumulator for per-method results; it is filled after the random search.
plot_tuples = dict()

# Randomized Grid Search

In [15]:
# Methods that are parameter-optimized for every occupant.
opt_methods = ['RF-Clf_opt', 'GradientBDT_opt', 'LR_opt', 'kNN_opt', 'SVM_opt']

# GaussianNB is left out for occupant 3 after the pre-selection; every other
# occupant optimizes it as well.
if occupant != 'U3':
    opt_methods += ['GaussianNB_opt']

In [16]:
from scipy.stats import expon
from scipy.stats import geom

# Search space of the randomized search, keyed by method name. Lists are
# sampled as discrete choices; the frozen scipy distributions (expon, geom)
# are sampled for continuous and integer-valued parameters respectively.
# The 'clf__' prefix addresses the parameters of a step named 'clf'.
# NOTE(review): for LR_opt, penalty 'none' combined with solver 'liblinear' is
# not a valid scikit-learn combination — confirm how failed fits are handled
# by the search helper.
param_distributions = {
    'SVM_opt': {
        'clf__C': expon(scale=100),
        'clf__gamma': expon(scale=0.1),
        'clf__kernel': ['rbf', 'linear', 'sigmoid'],
    },
    'RF-Clf_opt': {
        'clf__n_estimators': geom(0.006),
        'clf__criterion': ['entropy', 'gini'],
        'clf__bootstrap': [True, False],
        'clf__max_features': ['auto', 'sqrt', 'log2'],
        'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
    },
    'LR_opt': {
        'clf__penalty': ['l2', 'none'],
        'clf__C': expon(scale=100),
        'clf__fit_intercept': [True, False],
        'clf__class_weight': [None, 'balanced'],
        'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
    'GaussianNB_opt': {
        'clf__var_smoothing': expon(scale=3e-9),
    },
    'GradientBDT_opt': {
        'clf__loss': ['deviance', 'exponential'],
        'clf__learning_rate': expon(scale=0.3),
        'clf__n_estimators': geom(0.006),
        'clf__subsample': [0.7, 0.9, 1.0],
        'clf__max_features': ['auto', 'sqrt', 'log2'],
    },
    'kNN_opt': {
        'clf__n_neighbors': geom(0.006),
        'clf__weights': ['uniform', 'distance'],
        'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'clf__leaf_size': geom(0.00025),
        'clf__p': [1, 2],
    },
}

In [17]:
# Number of parameter settings passed to the random-search helper.
n_iter = 200

# Run the randomized search for all selected methods; the helper returns the
# per-method plot tuples and the search objects.
plot_tuples_new, random_searches = sci.run_random_search(
    opt_methods,
    param_distributions,
    n_iter,
    training_attributes,
    rs,
)

# Merge the fresh results into the accumulator, overwriting entries of methods
# that were searched before.
plot_tuples.update(plot_tuples_new)

In `check_random_search_results.ipynb` I compare the results of this randomized search. 
Afterwards, I run `refit_with_optimized_parameters.ipynb` to refit all the models with 
the optimal parameters using three different seeds, as is good practice.
