# ML Training Notebook for Cultivated

Notebook to train an ML model to discriminate between cultivated and natural vegetation. The model is trained using data extracted to a text file; download locations for the extracted data are given below.

The version with all pixels can be downloaded from: https://rsg.pml.ac.uk/shared_files/dac/train_input_geomedian_tmad.txt.gz

As the geomedian and MADs are calculated separately, they need to be combined into a single file using:
```python
import numpy

# Both input files are read with their single header row skipped.
input_data = numpy.loadtxt("geomedian_stats_2015.txt", skiprows=1)
input_data_mads = numpy.loadtxt("tmad_stats_2015.txt", skiprows=1)

# Append the MADs columns to the geomedian columns. The first column of the
# MADs file is dropped (presumably the class number, which the geomedian
# file already provides -- confirm against the extraction script).
combined_data = numpy.hstack((input_data, input_data_mads[:, 1:]))

column_names = 'classnum blue green red nir swir1 swir2 BUI BSI NBI EVI NDWI MSAVI sdev edev bcdev'

# Write with a .txt extension so the name matches the file the notebook loads.
numpy.savetxt("training_data_2015_geomedian_mads_poly_mean.txt",
              combined_data,
              header=column_names, comments='', fmt='%.4f')
```

A version using the mean value for each feature is in the same repo as this notebook.

In [1]:
import os
import pickle

import numpy
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# Set up working dir.
# Defaults to the original hard-coded location; set the CULTIVATED_WORKING_DIR
# environment variable to run the notebook elsewhere without editing this cell.
working_dir = os.environ.get('CULTIVATED_WORKING_DIR',
                             '/home/jovyan/cultivated_classification')

In [3]:
# Load the model input array.
# Parsing the text file takes a while, so a binary .npy copy is cached in the
# working dir and reused when it is already there from a previous run.
pickled_model_input = os.path.join(working_dir, 'training_data_2015_geomedian_mads_poly_mean_numpy.npy')

if not os.path.isfile(pickled_model_input):
    print('Reading model input from text file...')
    text_model_input = os.path.join(working_dir, 'training_data_2015_geomedian_mads_poly_mean.txt')
    model_input = numpy.loadtxt(text_model_input, skiprows=1)
    # Cache the parsed array so the next run can skip the text parsing
    numpy.save(pickled_model_input, model_input)
else:
    print('Loading pickled model input file')
    model_input = numpy.load(pickled_model_input)

# Headers of the text file, in column order
column_names = 'classnum blue green red nir swir1 swir2 BUI BSI NBI EVI NDWI MSAVI sdev edev bcdev'.split()

# Look-up from column name to its position in the list above
column_names_indices = {name: position for position, name in enumerate(column_names)}

Loading pickled model input file


In [4]:
# Split into training and testing data: 80 % is used for training with
# 20 % held back for testing (train_size=0.8).
# Stratify on the class column (column 0) to provide a similar distribution
# across classes in training and testing data. random_state is fixed so the
# same split is produced on every run.
class_labels = model_input[:, 0]
model_train, model_test = model_selection.train_test_split(model_input, stratify=class_labels,
                                                           train_size=0.8, random_state=0)

In [5]:
# Set up model
# random_state is fixed so that the fitted forest, and therefore the accuracy
# and variable importances printed further down, are reproducible between
# runs (the train/test split is already seeded the same way).
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=0, oob_score=True,
                               random_state=0)
# Alternative model, kept for experimentation:
#model = MLPClassifier(early_stopping=True, max_iter=10, verbose=2)

# Input features used by the model; each name must be a key of column_names_indices
model_variables = ['red', 'nir', 'swir1', 'swir2', 'sdev', 'edev']

# Column positions of those features within the model input array
model_col_indices = [column_names_indices[model_var] for model_var in model_variables]

In [6]:
# Fit the model on the training rows: the selected feature columns are the
# inputs and column 0 (classnum) is the target.
train_features = model_train[:, model_col_indices]
train_labels = model_train[:, 0]
classifier = model.fit(train_features, train_labels)

In [7]:
# Test model using data held back for testing: score the classifier on the
# same feature columns it was trained on, against column 0 (classnum).
test_features = model_test[:, model_col_indices]
test_labels = model_test[:, 0]
score = classifier.score(test_features, test_labels)
print("Accuracy: {:.03}".format(score))

Accuracy: 0.925


In [8]:
# Variable importance: print one "name: importance" line per model variable,
# in the order the variables were passed to the classifier.
importance_pairs = list(zip(model_variables, classifier.feature_importances_))
for feature, importance in importance_pairs:
    print("{}: {:.04}".format(feature, importance))

red: 0.2071
nir: 0.1541
swir1: 0.1153
swir2: 0.1325
sdev: 0.1206
edev: 0.2704


In [9]:
# Bundle what is needed to apply the model later: the input variable names,
# the class description -> numeric code mapping and the fitted classifier.
# NOTE(review): the 'Natural terrestrial vegetation ' key ends with a space;
# it is left as-is in case consumers of the pickle look it up verbatim.
ml_model_dict = {
    'variables': model_variables,
    'classes': {'Not natural terrestrial vegetation' : 111,
                'Natural terrestrial vegetation ' : 112},
    'classifier': classifier,
}

# Pickle model into the working dir
model_pickle_path = os.path.join(working_dir, 'model_pickle.pickle')
with open(model_pickle_path, 'wb') as f:
    pickle.dump(ml_model_dict, f)