## Test Notebook 2: Model Evaluation, Analysis, and Selection
<br>

##### Purpose: 
* Model Development and Selection: build and validate models using our training data and select the model with the best mean accuracy of cross validation

##### Model Types
* We will use the following ML Classifier Algorithms to model: K-Nearest Neighbors, Logistic Regression, Support Vector Machine, and Standard Decision Tree

##### Data
* We will use data from before the previous calendar year as our training data. The rest of the data will be used for final testing

##### Results
* For both models, the best performing model was the <u>Decision Tree</u> Algorithm
* Best parameters for the <b>LONG</b> model:
    * Minimum Sample Split: 8
    * Splitter: Best
    * Criterion: Gini
* Best parameters for the <b>SHORT</b> model:
    * Minimum Sample Split: 8
    * Splitter: Best
    * Criterion: log_loss

In [1]:
import pandas as pd
import numpy as np
import sklearn as skl

### Upload and Prepare Data

In [2]:
'''
Upload data and exclude the previous year
'''

candles = pd.read_pickle('CAD_USD_H2')
candles = candles[:-3069]

In [3]:
candles.head()

Unnamed: 0,range_14,mid_c_prev,stochastic_range_k,stochastic_range_d,rsi_14_lag_1,MACD_12_26_9_lag_1,mid_o_lag_1,mid_l_lag_1,mid_h_lag_1,stochastic_range_k_lag_1,...,stochastic_range_d_lag_9,rsi_14_lag_10,MACD_12_26_9_lag_10,mid_o_lag_10,mid_l_lag_10,mid_h_lag_10,stochastic_range_k_lag_10,stochastic_range_d_lag_10,long,short
37,0.00959,0.97308,0.222,0.196333,33.408488,-0.000375,0.00057,0.00057,-0.00083,0.213,...,0.066845,37.432008,0.0,-0.0034,-0.00311,-0.00532,0.151,0.106078,False,False
38,0.00959,0.97317,0.065361,0.166787,33.796158,-0.000232,6e-05,0.00019,-0.00052,0.222,...,0.095262,33.116789,-0.000124,-0.00455,-0.003,-0.00555,0.043,0.066845,True,False
39,0.00851,0.97153,0.148587,0.134529,28.903525,-0.000218,-0.00162,9e-05,-0.00176,0.065361,...,0.079146,27.659356,-0.000305,-0.00505,-0.00277,-0.00592,0.05,0.095262,False,True
40,0.00819,0.97238,0.426892,0.20857,34.162447,-0.000126,0.00083,0.00121,-0.00022,0.148587,...,0.12418,22.628277,-0.00054,-0.00239,0.00092,-0.00322,0.108,0.079146,False,True
41,0.00777,0.975,0.371,0.303889,47.310287,0.000123,0.0026,0.00283,-0.00028,0.426892,...,0.079521,24.485597,-0.000635,0.00249,0.00372,0.00141,0.198906,0.12418,True,False


In [4]:
'''
Separate the decision (y) variable (long, short) from the rest of the features (X)
'''

candle_features = candles[[col for col in candles.columns if col not in ['long', 'short']]]
long = candles['long']
short = candles['short']

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
'''
Normalize the data for faster model training
'''

scaler = StandardScaler()
scaler.fit(candle_features)

scaled_candles = scaler.transform(candle_features)

### Model Evaluation 
<br>
The same steps are repeated for every model. Notes are only left for first model for this reason

### K-Nearest Nearbors modeling

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [8]:
'''
Using GridSearchCV, compile model with parameter grid
'''

knn_params = {
    'n_neighbors': list(range(1,30)),
    'weights': ('uniform', 'distance')
}
knn_grid_model_long = GridSearchCV(KNeighborsClassifier(), 
                                   param_grid=knn_params, 
                                   cv=5)
knn_grid_model_short = GridSearchCV(KNeighborsClassifier(), 
                                    param_grid=knn_params, 
                                    cv=5)

In [9]:
'''
Fit and validate models
'''

knn_grid_model_long.fit(scaled_candles, long)
knn_grid_model_short.fit(scaled_candles, short)

In [10]:
'''
Store results in a Pandas DataFrame and display the results
'''

knn_short_results = pd.DataFrame(knn_grid_model_short.cv_results_)
knn_short_results[['param_n_neighbors', 'param_weights', 'mean_test_score'
                  ]].sort_values('mean_test_score', ascending=False)

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score
53,27,distance,0.596504
56,29,uniform,0.596504
57,29,distance,0.596441
52,27,uniform,0.596409
51,26,distance,0.595871
55,28,distance,0.595807
54,28,uniform,0.594446
48,25,uniform,0.593844
49,25,distance,0.593622
47,24,distance,0.592926


In [11]:
'''
Store results in a Pandas DataFrame and display the results
'''

knn_long_results = pd.DataFrame(knn_grid_model_long.cv_results_)
knn_long_results[['param_n_neighbors', 'param_weights', 'mean_test_score'
                 ]].sort_values('mean_test_score', ascending=False)

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score
49,25,distance,0.593053
48,25,uniform,0.592989
56,29,uniform,0.592483
57,29,distance,0.592229
52,27,uniform,0.592008
45,23,distance,0.591913
44,23,uniform,0.591849
53,27,distance,0.591723
43,22,distance,0.591406
47,24,distance,0.591343


### Logistic Regression Modeling

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
lr_params = {
    'penalty':['elasticnet', 'l1', 'l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C': [.01, .1, 1, 10, 100],
}

lr_grid_model_short = GridSearchCV(LogisticRegression(),
                                 param_grid=lr_params,
                                 cv=5)
lr_grid_model_long = GridSearchCV(LogisticRegression(),
                                 param_grid=lr_params,
                                 cv=5)

In [14]:
lr_grid_model_long.fit(scaled_candles, long)
lr_grid_model_short.fit(scaled_candles, short)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [15]:
lr_short_results = pd.DataFrame(lr_grid_model_short.cv_results_)
lr_short_results[['param_penalty', 'param_solver', 'param_C', 'mean_test_score'
                 ]].sort_values('mean_test_score', ascending=False)

Unnamed: 0,param_penalty,param_solver,param_C,mean_test_score
22,l1,liblinear,0.1,0.764527
39,l1,saga,1,0.763831
24,l1,saga,0.1,0.763736
43,l2,sag,1,0.763419
54,l1,saga,10,0.763419
...,...,...,...,...
63,elasticnet,sag,100,
64,elasticnet,saga,100,
65,l1,newton-cg,100,
66,l1,lbfgs,100,


In [16]:
lr_long_results = pd.DataFrame(lr_grid_model_long.cv_results_)
lr_long_results[['param_penalty', 'param_solver', 'param_C', 'mean_test_score'
                ]].sort_values('mean_test_score', ascending=False)

Unnamed: 0,param_penalty,param_solver,param_C,mean_test_score
37,l1,liblinear,1,0.763546
56,l2,lbfgs,10,0.763514
67,l1,liblinear,100,0.763419
71,l2,lbfgs,100,0.763388
41,l2,lbfgs,1,0.763324
...,...,...,...,...
63,elasticnet,sag,100,
64,elasticnet,saga,100,
65,l1,newton-cg,100,
66,l1,lbfgs,100,


### Support Vector Machine Classifier

In [17]:
from sklearn.svm import LinearSVC

In [18]:
svc_params = {
    'penalty':['l1', 'l2'],
    'C': [.01, .1, 1, 10, 100],
}

svc_grid_model_short = GridSearchCV(LinearSVC(),
                                 param_grid=svc_params,
                                 cv=5)
svc_grid_model_long = GridSearchCV(LinearSVC(),
                                 param_grid=svc_params,
                                 cv=5)

In [None]:
svc_grid_model_long.fit(scaled_candles, long)
svc_grid_model_short.fit(scaled_candles, short)



In [None]:
svc_long_results = pd.DataFrame(svc_grid_model_long.cv_results_)
svc_long_results[['param_penalty', 'param_C', 'mean_test_score'
                 ]].sort_values('mean_test_score', ascending=False)

In [None]:
svc_short_results = pd.DataFrame(svc_grid_model_short.cv_results_)
svc_short_results[['param_penalty', 'param_C', 'mean_test_score'
                  ]].sort_values('mean_test_score', ascending=False)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC

In [None]:
dtc_params = {
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2,4,6,8],
}

dtc_grid_model_short = GridSearchCV(DTC(),
                                 param_grid=dtc_params,
                                 cv=5)
dtc_grid_model_long = GridSearchCV(DTC(),
                                 param_grid=dtc_params,
                                 cv=5)

In [None]:
dtc_grid_model_long.fit(scaled_candles, long)
dtc_grid_model_short.fit(scaled_candles, short)

In [None]:
dtc_long_results = pd.DataFrame(dtc_grid_model_long.cv_results_)
dtc_long_results[['param_criterion', 'param_splitter', 'param_min_samples_split', 
                   'mean_test_score']].sort_values('mean_test_score', ascending=False)

In [None]:
dtc_short_results = pd.DataFrame(dtc_grid_model_short.cv_results_)
dtc_short_results[['param_criterion', 'param_splitter', 'param_min_samples_split', 
                   'mean_test_score']].sort_values('mean_test_score', ascending=False)