## Investigate PCA parameters
Investigate how the number of components kept by the PCA step affects the GNB classifier that follows it.

In [1]:
from ErrorML.ErrorML import *
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

In [2]:
def run_and_predict(X_train, y_train, X_test, y_test, kind='rf_classifier', pca_n_elements=None):
    """Fit a freshly created pipeline on the training data and predict the test data.

    The pipeline is built by ``create_pipeline`` from ``kind`` and
    ``pca_n_elements``. ``y_test`` is accepted but not used here.

    Returns the predictions for ``X_test``.
    """
    model = create_pipeline(pca_n_elements=pca_n_elements, kind=kind)
    model.fit(X_train, y_train)

    return model.predict(X_test)

In [3]:
def robin_metric(y_true, y_pred):
    """A metric of accuracy that ignores the class with the highest accuracy.

    Given y_true and y_pred, it calculates a confusion matrix, normalises each
    row by the number of true samples of that class, and then takes the average
    of the diagonal elements, ignoring the highest value. This gives us an
    accuracy of all but the class which is predicted best - which is useful for
    imbalanced learning, where one class is always predicted very well.

    Classes with no true samples (labels that only occur in y_pred) are left
    out: their accuracy is 0/0, and the resulting NaN would sort last and be
    dropped in place of the best class.

    Returns NaN when fewer than two classes have true samples, since nothing
    is left to average once the best class is removed.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Number of true samples per class (row sums of the confusion matrix)
    support = cm.sum(axis=1)
    has_samples = support > 0

    # Per-class accuracy = normalised diagonal, only where it is defined
    per_class_accuracy = np.diag(cm)[has_samples] / support[has_samples].astype('float')

    if per_class_accuracy.size < 2:
        return np.nan

    # Ascending sort puts the best class last; drop it and average the rest
    metric = np.sort(per_class_accuracy)[:-1].mean()

    return metric

In [4]:
def run_multiple_times(filename, classes, categorised=True, focal=False,
                       scale=False, exclude=None, absolute=False, subset=None, use_cols=None,
                       kind='rf_classifier', pca_n_elements=None):
    """Evaluate a classifier with 5 repeats of 5-fold stratified cross-validation.

    The data is loaded with ``load_data(filename)`` and turned into features and
    labels by ``get_processed_data``; ``classes``, ``categorised``, ``focal``,
    ``scale``, ``exclude``, ``absolute``, ``subset`` and ``use_cols`` are passed
    straight through to it. For every fold the training part is randomly
    oversampled, a pipeline selected by ``kind`` / ``pca_n_elements`` is fitted
    via ``run_and_predict``, and the test part is scored.

    Returns a tuple ``(mean_metric, sum_cms)``: the mean of ``robin_metric``
    over all 25 fits, and the element-wise sum of the 25 confusion matrices.
    """
    df = load_data(filename)
    X, y = get_processed_data(df, classes=classes, categorised=categorised, focal=focal,
                              scale=scale, exclude=exclude, absolute=absolute, subset=subset,
                              use_cols=use_cols)
    X = X.values
    y = y.values

    # Use one fixed label set for every fold so that all confusion matrices
    # have the same shape and can be stacked and summed, even when a fold
    # happens to miss a class in both its test labels and its predictions.
    labels = np.unique(y)

    metrics = []
    conf_matrices = []

    # NOTE: the splitter does not shuffle, so every repeat iterates over the
    # same folds; differences between repeats come from the random
    # oversampling (and any randomness inside the classifier itself).
    skf = StratifiedKFold(n_splits=5)

    for _ in range(5):
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Oversample the training fold only; the test fold stays untouched
            ros = RandomOverSampler()
            X_train, y_train = ros.fit_resample(X_train, y_train)

            y_pred = run_and_predict(X_train, y_train, X_test, y_test, kind=kind, pca_n_elements=pca_n_elements)

            metric = robin_metric(y_test, y_pred)
            metrics.append(metric)

            cm = confusion_matrix(y_test, y_pred, labels=labels)
            conf_matrices.append(cm)

    mean_metric = np.array(metrics).mean()
    sum_cms = np.dstack(conf_matrices).sum(axis=2)

    return mean_metric, sum_cms

## Trying PCA n_components but using default (all) columns

In [7]:
# With all columns: sweep the number of PCA components from 16 down to 1
results = []

n_elements_list = range(16, 0, -1)

for n_elements in n_elements_list:
    metric, cm = run_multiple_times(
        '2017_ValidationPts_ALL_Update15March19b_ROBIN.csv',
        classes=[-2, -0.2, 0.2, 6.5],
        focal=None,
        subset='Exposed',
        kind='gnb_pca_default',
        pca_n_elements=n_elements,
    )
    print(f"{n_elements}: {metric}")
    # One row per setting; collected into a DataFrame once the sweep is done
    results.append(dict(n_components=n_elements, metric=metric))

res = pd.DataFrame(results)

16: 0.7064425856608119
15: 0.7077381706497745
14: 0.7057995393031962
13: 0.7157767828006527
12: 0.7166917938381802
11: 0.718825290334965
10: 0.7201742681639313
9: 0.7274962088492178
8: 0.727705192436894
7: 0.7307131394567616
6: 0.5800457433534888
5: 0.587385971142464
4: 0.5924229868509454
3: 0.5841482579902101
2: 0.5781643503855137
1: 0.5389620788943277


In [9]:
# Rank the settings from best to worst metric
res.sort_values(by='metric', ascending=False)

Unnamed: 0,metric,n_components
9,0.730713,7
8,0.727705,8
7,0.727496,9
6,0.720174,10
5,0.718825,11
4,0.716692,12
3,0.715777,13
1,0.707738,15
0,0.706443,16
2,0.7058,14


## Trying PCA n_components using just best columns

In [11]:
# Only the selected columns: sweep the number of PCA components from 7 down to 1
results = []

n_elements_list = range(7, 0, -1)

for n_elements in n_elements_list:
    metric, cm = run_multiple_times(
        '2017_ValidationPts_ALL_Update15March19b_ROBIN.csv',
        classes=[-2, -0.2, 0.2, 6.5],
        focal=None,
        subset='Exposed',
        kind='gnb_pca_default',
        pca_n_elements=n_elements,
        use_cols=[
            'Veg',
            'MaxSlope_Focal',
            'MinSlope_Focal',
            'CQ_Mean',
            'DepthRC_JD',
            'Shadow',
            'Type',
        ],
    )
    print(f"{n_elements}: {metric}")
    # One row per setting; collected into a DataFrame once the sweep is done
    results.append(dict(n_components=n_elements, metric=metric))

res = pd.DataFrame(results)

7: 0.7602544773970631
6: 0.7616895767348114
5: 0.7593048373164412
4: 0.743436241481908
3: 0.5902819272482964
2: 0.5957886073519532
1: 0.6108974373740282
