# Cross-validation Checks

Does cross-validation make a difference as compared to OOB error from the RF?

In [58]:
import sys

import numpy as np
import astropy.units as u
from sunpy.map import Map
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import (train_test_split, 
                                     permutation_test_score,
                                     cross_val_score,
                                     ShuffleSplit,
                                     KFold,
                                     StratifiedKFold,)
from sklearn.ensemble import RandomForestClassifier

sys.path.append('../paper/python')
from classify import prep_data, classify_ar

In [39]:
rf_options = {
    'n_estimators': 500,
    'max_features': 'sqrt',
    'criterion': 'gini',
    'max_depth': 30,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'bootstrap': True,
    'oob_score': True,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0,
    'random_state': 42,  # this is to fix the seed of the "randomness" of the RF
    'n_jobs': -1,
}

In [26]:
channels = [94, 131, 171, 193, 211, 335]
heating = [
    'high_frequency',
    'intermediate_frequency',
    'low_frequency'
]
channel_pairs = [
    (94,335),
    (94,211),
    (94,193),
    (94,171),
    (94,131),
    (335,211),
    (335,193),
    (335,171),
    (335,131),
    (211,193),
    (211,171),
    (211,131),
    (193,171),
    (193,131),
    (171,131),
]
correlation_threshold = 0.1
rsquared_threshold = 0.75

Grab the formatted data

In [27]:
X, Y, X_observation, bad_pixels = prep_data(
    '../paper/data/',
    channel_pairs,
    heating,
    correlation_threshold=correlation_threshold,
    rsquared_threshold=rsquared_threshold,
)

  em_valid = np.log10(emcube.as_array()[i_valid])


In [10]:
le = LabelEncoder()
le.fit(Y)
Y_encoded = le.transform(Y)

In [12]:
Y_encoded

array([0, 0, 0, ..., 2, 2, 2])

Do the usual 2/3-1/3 train-test split and run the model

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y_encoded, test_size=0.33)

In [62]:
clf = RandomForestClassifier(**rf_options)

In [63]:
clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [64]:
test_error = 1. - clf.score(X_test, Y_test)

In [65]:
print(test_error)

0.021860019175455436


Now, do $K$-fold cross-validation

In [66]:
clf2 = RandomForestClassifier(**rf_options)

In [67]:
cv_score = cross_val_score(clf2,
                           X,
                           y=Y_encoded,
                           scoring=None,
                           n_jobs=-1,
                           cv=ShuffleSplit(5))

In [68]:
cv_score

array([0.9802025 , 0.98128729, 0.98119689, 0.98137769, 0.98372808])

In [70]:
1 - cv_score

array([0.0197975 , 0.01871271, 0.01880311, 0.01862231, 0.01627192])

In [None]:
def classify_ar_mod(classifier_params, X_model, Y_model, **kwargs):
    """
    Train random forest classifier on simulation data, apply to real data

    Parameters
    ----------
    classifier_params {[type]} -- [description]
    X_model {[type]} -- [description]
    Y_model {[type]} -- [description]
    X_observation {[type]} -- [description]
    bad_pixels {[type]} -- [description]
    """
    # Encode labels
    le = LabelEncoder()
    le.fit(Y_model)
    Y_model = le.transform(Y_model)
    # Split training and test data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_model, Y_model, test_size=kwargs.get('test_size', 0.33))
    # Fit classifier
    clf = RandomForestClassifier(**classifier_params)
    clf.fit(X_train, Y_train)
    test_error = 1. - clf.score(X_test, Y_test)
    print(test_error)
    return clf
    # Classify observations
    #Y_observation = clf.predict(X_observation)
    #Y_observation_prob = clf.predict_proba(X_observation)
    ## Frequency map
    #data = np.empty(bad_pixels.shape)
    #data[bad_pixels] = np.nan
    #data[~bad_pixels] = Y_observation
    #class_map = data.copy()
    ## Probability maps
    #probability_maps = {}
    #for i, c in enumerate(le.inverse_transform(clf.classes_)):
    #    data = np.empty(bad_pixels.shape)
    #    data[bad_pixels] = np.nan
    #    data[~bad_pixels] = Y_observation_prob[:, i]
    #    probability_maps[c] = data.copy()
    #
    #return class_map, probability_maps, clf, test_error