## Validating Prediction

In [1]:
import numpy as np

#### MSE (Mean Squared Error)
* penalize outlier errors

In [46]:
n = 3
y = np.array([4,3,5])
yhat = np.array([4,5,6])
sub = y-yhat
sqr = sub**2
summation = np.sum(sqr)
mse = summation/n
print(sqr)
print(mse)

[0 4 1]
1.6666666666666667


#### RMSE (Root Mean Squared Error)

In [8]:
n = 3
y = np.array([4,3,5])
yhat = np.array([4,100,6])
sub = y-yhat
sqr = sub**2
div = sqr/n
summation = np.sum(div)
rmse = np.sqrt(summation)
rmse

56.005952064639224

#### MAE (Mean Absolute Percentage Error)

In [11]:
n = 3
y = np.array([4,3,5])
yhat = np.array([4,100,6])
sub = y-yhat
absolute = np.abs(sub)
summation = np.sum(absolute)
mae = summation/n
mae

32.666666666666664

#### MAPE (Mean Absolute Percentage Error)

In [14]:
n = 3
y = np.array([4,6,8])
yhat = np.array([2,3,4])
sub = y-yhat
perc = sub/y
absol = np.abs(perc)
summation = np.sum(absol)
mape = summation/n
mape

0.5

#### Correlation Y and Yhat

In [17]:
from scipy.stats import pearsonr

In [21]:
y = np.array([4,6,8])
yhat = np.array([2,3,4])
np.corrcoef(y,yhat)

array([[1., 1.],
       [1., 1.]])

In [22]:
corr, pvalue = pearsonr(y,yhat)
print(corr)
print(pvalue)

0.9999999999999998
1.3415758552508151e-08


#### reasonable to use the median vs mean

In [53]:
y = np.array([4,6,8,5,2,3,4])
yhat = np.array([2,3,4,1,2,5,100])
absolute_errors = abs(y - yhat)
print("Mean: {}".format(np.mean(absolute_errors)))
print("Median: {}".format(np.median(absolute_errors)))

Mean: 15.857142857142858
Median: 3.0


## Validating Classification

<img src="https://miro.medium.com/max/1106/1*vMEqRXTl8PRfRtgWgwUVEg.jpeg"
     width="500" height="300" />

In [35]:
y = np.array([1,1,0,0,0,0])
yhat = np.array([1, 0, 0, 1, 0, 0])

print("Y  - Yhat")
for i in zip(y, yhat):
    print(i[0], " - ", i[1], " - ", i[0] == i[1])

Y  - Yhat
1  -  1  -  True
1  -  0  -  False
0  -  0  -  True
0  -  1  -  False
0  -  0  -  True
0  -  0  -  True


* true positive: is positive and we predicted positive
    * y = 1 and yhat = 1
* true negatives: is negative and we predicted negative
    * y = 0 and yhat = 0
* false positives: is negative but we marked as positive
    * y = 0 and yhat = 1
* false negatives: is positive but we marked as negative
    * y = 1 and yhat = 0

In [38]:
true_positives = 1
true_negatives = 3
false_positives = 1
false_negatives = 1

#### recall
* how many of our positive cases did we recall, or find

In [41]:
recall = true_positives/(true_positives + false_negatives)
recall

0.5

#### precision
* of all our true predictions, how precise were we, how many were actually true

In [42]:
precision = true_positives/(true_positives+false_positives)
precision

0.5

#### f1
* weighed average of precision and recall

In [43]:
f1 = 2 * (precision * recall) / (precision + recall)
f1

0.5

#### accuracy

In [44]:
y = np.array([1,1,0,0,0,0])
yhat = np.array([1, 0, 0, 1, 0, 0])

correct = 4
n = 8
accuracy = correct/n
accuracy

0.5

#### logic extends to multilabel classification

## Cross Validation
* we split our data in training and test
* then we divide up our data into k-folds (some arbitrary number)
* say we have 5 folds, we will make a model on 4 folds, then use the last as a test
* then we will pick 4 more folds, then have a new 5th fold be the test
* we repeat this process and validate
* test for model stability
* cross_validate
* The cross_validate function differs from cross_val_score in two ways:
    * It allows specifying multiple metrics for evaluation.
    * It returns a dict containing fit-times, score-times (and optionally training scores as well as fitted estimators) in addition to the test score.


<img src="https://scikit-learn.org/stable/_images/grid_search_cross_validation.png"
     width="600" height="400" />

In [20]:
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [5]:
df = pd.read_csv("data/iris.csv")

In [6]:
x = df.drop("variety", 1)
y = df["variety"]

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [24]:
clf = DecisionTreeClassifier()

In [27]:
cv_results = cross_validate(clf, x_train, y_train, cv=5)

In [28]:
cv_results

{'fit_time': array([0.00237703, 0.0019598 , 0.001724  , 0.00260282, 0.00201583]),
 'score_time': array([0.00131798, 0.00112319, 0.00097799, 0.00186205, 0.00092697]),
 'test_score': array([0.96      , 1.        , 0.83333333, 0.95833333, 0.95652174])}

In [29]:
cross_val_score(clf, x, y, cv=5)

array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])

In [30]:
cv_results = cross_validate(clf, x_train, y_train, cv=5, scoring=('accuracy', 'recall_weighted'))
cv_results

{'fit_time': array([0.00190592, 0.00145197, 0.00179386, 0.001333  , 0.00133395]),
 'score_time': array([0.00274205, 0.00211406, 0.00248909, 0.00208187, 0.00174284]),
 'test_accuracy': array([0.96      , 1.        , 0.83333333, 0.95833333, 0.95652174]),
 'test_recall_weighted': array([0.96      , 1.        , 0.83333333, 0.95833333, 0.95652174])}

In [32]:
cv_results = cross_validate(clf, x_train, y_train, cv=5, scoring=('accuracy', 'f1_weighted'))
cv_results

{'fit_time': array([0.00228906, 0.00201583, 0.00184202, 0.00183177, 0.00175023]),
 'score_time': array([0.003052  , 0.00277829, 0.00258088, 0.00247407, 0.00235891]),
 'test_accuracy': array([0.96      , 0.95833333, 0.83333333, 0.95833333, 0.95652174]),
 'test_f1_weighted': array([0.9597193 , 0.95816993, 0.83068783, 0.95816993, 0.95612827])}

In [19]:
final_model = DecisionTreeClassifier()
final_model.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [34]:
yhat = final_model.predict(x_test)

## Grid Search
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
* The simple rule is that data used for evaluating the performance of a model should not have been used to optimize the model in any way
* training and test split
* we can use a grid search with cross validation on the training
* understand the stability and get the best params
* make that model
* then test on the holdout

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV

In [42]:
param_map = {
    'criterion':('gini', 'entropy'), 
    "max_depth":[5,7,9,11,13,15,None],
    "min_samples_split":[2,3,4,5,6,7,8,9,10]
}

In [43]:
clf = DecisionTreeClassifier()

In [44]:
gs = GridSearchCV(clf, param_map, cv=5, verbose = 1)
gs.fit(x,y)

Fitting 5 folds for each of 126 candidates, totalling 630 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 630 out of 630 | elapsed:    1.5s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_depth': [5, 7, 9, 11, 13, 15, None],
          

In [45]:
gs

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_depth': [5, 7, 9, 11, 13, 15, None],
          

In [46]:
gs.best_params_

{'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 3}

In [50]:
gs.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_min_samples_split', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [56]:
gs.cv_results_["param_min_samples_split"]

masked_array(data=[2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                   2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                   2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                   2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                   2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                   2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                   2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False

In [53]:
gs.cv_results_["std_test_score"]

array([0.03265986, 0.03651484, 0.03651484, 0.02108185, 0.02108185,
       0.02108185, 0.02108185, 0.02108185, 0.02108185, 0.03399346,
       0.03399346, 0.03651484, 0.02108185, 0.02108185, 0.02108185,
       0.02108185, 0.02108185, 0.02108185, 0.03651484, 0.03399346,
       0.03651484, 0.02108185, 0.02108185, 0.02108185, 0.02108185,
       0.02108185, 0.02108185, 0.03265986, 0.03399346, 0.03651484,
       0.02108185, 0.02108185, 0.02108185, 0.02108185, 0.02108185,
       0.02108185, 0.03265986, 0.03399346, 0.03651484, 0.02108185,
       0.02108185, 0.02108185, 0.02108185, 0.02108185, 0.02108185,
       0.03265986, 0.03399346, 0.03651484, 0.02108185, 0.02108185,
       0.02108185, 0.02108185, 0.02108185, 0.02108185, 0.03651484,
       0.03265986, 0.03399346, 0.02108185, 0.02108185, 0.02108185,
       0.02108185, 0.02108185, 0.02108185, 0.03399346, 0.03399346,
       0.03399346, 0.02494438, 0.02494438, 0.02494438, 0.02494438,
       0.02494438, 0.02494438, 0.03399346, 0.03399346, 0.03399