In [None]:
import sys
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel, RFECV, SelectKBest, f_classif
from sklearn.metrics import f1_score, balanced_accuracy_score, make_scorer, roc_curve, roc_auc_score, auc
import joblib
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from itertools import compress

import warnings
# Silence all library warnings for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Seed shared by the random forest and both KFold splitters.
random_state = 1234

# Directory holding the training samples, and the full path of the CSV to read.
working_dir = '/home/jovyan/arvum/data/dea_landcover/c3/training/'
filename = os.path.join(working_dir, '2015_training_data.csv')

# Read the training table into a DataFrame (header row supplies column names).
model_input = pd.read_csv(filename)

In [None]:
# Recode the label column to a binary target: 1 where the class code is 111,
# 0 for every other value.
# NOTE: the column is overwritten in place, so running this cell a second time
# without reloading the CSV maps every row to 0 (1 != 111).
is_class_111 = model_input["binary_class"].eq(111)
model_input["binary_class"] = is_class_111.astype("int64")

# Display the distinct label values as a quick sanity check.
model_input["binary_class"].unique()

In [None]:
# Full candidate set of predictors (kept for reference, currently unused):
#   'blue', 'red', 'green', 'nir', 'swir1', 'swir2', 'edev', 'sdev', 'bcdev',
#   'NDVI', 'MNDWI', 'BAI', 'BUI', 'BSI', 'TCG', 'TCW', 'TCB', 'NDMI', 'LAI',
#   'EVI', 'AWEI_sh', 'BAEI', 'NDSI', 'SAVI'

# Predictor columns actually used: the variables of the original pickled model.
model_variables = [
    'nir',
    'edev',
    'sdev',
    'NDVI',
    'BUI',
    'BSI',
    'TCG',
    'NDMI',
    'LAI',
    'EVI',
    'SAVI',
]

In [None]:
# Feature matrix: one column per entry of model_variables, in that order.
X = model_input.loc[:, model_variables].to_numpy()

# Target vector: the binary label column.
y = model_input.loc[:, 'binary_class'].to_numpy()

In [None]:
# Modelling

# Feature-selection step. SelectKBest with k='all' keeps every input feature,
# so this step is currently a pass-through.
# Alternative previously tried (L1-penalised linear SVM):
#   SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=10000))
feature_selection = SelectKBest(f_classif, k='all')

# Base random-forest settings; max_depth and class_weight are overridden by the
# grid search below.
rf_settings = dict(
    n_estimators=150,
    criterion='gini',
    max_depth=50,
    max_leaf_nodes=None,
    min_samples_split=3,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    class_weight=None,
    bootstrap=True,
    oob_score=True,
    warm_start=False,
    n_jobs=-1,
    verbose=0,
    random_state=random_state,
)
model = RandomForestClassifier(**rf_settings)

# Hyperparameter grid to explore (3 depths x 3 class-weight modes).
param_grid = {
    'max_depth': [20, 30, 50],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
}

# Splitter used inside the grid search (5 shuffled folds).
inner_cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

# Splitter used for the outer cross-validation (also 5 shuffled folds).
outer_cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

# Grid search over param_grid; refit=True retrains the best combination on the
# full data passed to fit().
cv_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, refit=True)

# Chain feature selection into the hyperparameter search.
pipeline_steps = [
    ('feature_selection', feature_selection),
    ('classification', cv_model),
]
pipe = Pipeline(pipeline_steps)

In [None]:
# Metrics to collect on each outer fold; the keys become the 'test_<key>'
# entries of cv_results.
scoring = dict(
    precision='precision',
    recall='recall',
    f1_score='f1',
    Accuracy='accuracy',
)

# Nested cross-validation: the whole pipeline (including its inner grid search)
# is refitted on every outer fold.
cv_results = cross_validate(pipe, X, y, scoring=scoring, cv=outer_cv, n_jobs=-1)

# Print the mean of every array returned by cross_validate.
for result_name in cv_results:
    print(result_name, cv_results[result_name].mean())

In [None]:
# Fit the complete pipeline (feature selection followed by the grid-searched
# random forest) on all training samples. Because the grid search was created
# with refit=True, the best parameter combination is refitted and is what the
# later cells read through `best_estimator_`.
pipe.fit(X, y)

In [None]:
# Summarise the fitted model, bundle it with its metadata and write it to disk.

# Boolean mask over the pipeline's input columns marking the retained features.
support = pipe['feature_selection'].get_support()
best_classifier = pipe['classification'].best_estimator_

print("Number of features:", best_classifier.n_features_in_, "/", len(support))

# Reduce model_variables to the retained features. The length check keeps the
# cell idempotent: on a re-run model_variables is already reduced, and applying
# the mask again would silently truncate and mislabel the names.
if len(model_variables) == len(support):
    model_variables = list(compress(model_variables, support))
elif len(model_variables) != int(support.sum()):
    raise ValueError(
        "model_variables has {} names but the fitted pipeline saw {} input "
        "features ({} selected); re-run the notebook from the top.".format(
            len(model_variables), len(support), int(support.sum())))

# Variable importance, paired with the retained feature names in column order.
for var_name, var_importance in zip(model_variables, best_classifier.feature_importances_):
    print("{}: {:.04}".format(var_name, var_importance))


ml_model_dict = {}

ml_model_dict['variables'] = model_variables
# NOTE(review): the classifier is trained on labels recoded to 1/0 (see the
# binary_class recoding cell), whereas this mapping records 111 for
# 'Cultivated' - confirm that consumers of the file expect this.
ml_model_dict['classes'] = {'Cultivated' : 111,
                            'Not Cultivated' : 0}
ml_model_dict['classifier'] = best_classifier
# Per-fold scores from the outer cross-validation.
ml_model_dict['accuracy'] = cv_results['test_Accuracy']
ml_model_dict['f1'] = cv_results['test_f1_score']

print(ml_model_dict)

# Serialise the model bundle with joblib into the current working_dir.
with open(os.path.join(working_dir, '2010_2015_median_model_indices_feature_selection_kbest_15.joblib'), 'wb') as f:
    joblib.dump(ml_model_dict, f)

In [None]:
# Validate the trained model using the independent validation set.

# NOTE: this rebinds working_dir from the training directory to the validation
# directory for every cell executed afterwards.
working_dir = '/home/jovyan/arvum/data/dea_landcover/c3/validation/'

# Change the year in the filename to 2015 to inspect the 2015 results.
validation_filename = '2010_validation_data.csv'
validation_filepath = os.path.join(working_dir, validation_filename)

validation_data = pd.read_csv(validation_filepath)

# Recode the reference label to binary: 1 where the code is 111, otherwise 0.
# Overwrites the column in place, so re-running without reloading zeroes it.
is_class_111 = validation_data['output'].eq(111)
validation_data['output'] = is_class_111.astype("int64")

In [None]:
# Predict on the validation set

# Reference labels. The 'output' column is left in validation_data: X_test picks
# its columns explicitly, and dropping the column in place made this cell raise
# KeyError when run a second time.
y_test = validation_data['output'].to_numpy()

# model_variables holds the names of the features retained by the selection
# step, i.e. exactly the columns the fitted classifier was trained on.
X_test = validation_data[model_variables].to_numpy()

# Predict with the fitted classifier itself rather than the whole pipeline:
# X_test already contains only the selected columns, so sending it through the
# selection step again would fail as soon as that step drops any feature. With
# the current k='all' selection both routes give identical predictions.
y_pred = pipe['classification'].best_estimator_.predict(X_test)

In [None]:
# Print validation metrics (positive class is label 1).

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# (label, metric function) pairs, printed in this order to four decimals.
validation_metrics = (
    ('Validation accuracy score', accuracy_score),
    ('Validation F1 Score', f1_score),
    ('Validation precision score', precision_score),
    ('Validation Recall score', recall_score),
)

for metric_label, metric_fn in validation_metrics:
    print('{0}: {1:0.4f}'.format(metric_label, metric_fn(y_test, y_pred)))

In [None]:
# Plot a confusion matrix - note the cultivated class is 1

import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

# Pair every reference label with its prediction.
cm_data = {'y_test': y_test, 'y_pred': y_pred}
cm_df = pd.DataFrame(cm_data)

# Cross-tabulate actual (rows) against predicted (columns) labels.
# NOTE(review): this name shadows sklearn.metrics.confusion_matrix imported in
# the metrics cell; it is kept so that any later use of the table still works.
confusion_matrix = pd.crosstab(cm_df['y_test'], cm_df['y_pred'], rownames=['Actual'], colnames=['Predicted'])

# fmt='g' prints the cell counts as plain numbers.
sn.heatmap(confusion_matrix, annot=True, fmt='g')
# The first four characters of the validation filename are the year.
plt.title(f"Validation Data {validation_filename[0:4]}")
# Call show(): the bare attribute `plt.show` only echoed the function object.
plt.show()