### Code for training and testing SVM
Before training
* Prepare train and test data in separate files (scripts: )
* Run hyperparameter tuning to obtain the final C and gamma values (scripts: )

In [1]:
# import required libraries
import os
import time
import pickle
import numpy as np
import pandas as pd
import geopandas as gpd
from osgeo import gdal
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix

In [3]:
# Enable the Intel(R) Extension for Scikit-learn (sklearnex) so that supported
# scikit-learn algorithms use its accelerated implementations
# (original intent: run sklearn algorithms on all CPU cores).
# NOTE(review): the sklearnex documentation asks for patch_sklearn() to be called
# before scikit-learn is imported; in this notebook the sklearn imports run in the
# earlier imports cell -- confirm the patched estimators are really the ones in use.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## 1. Data

In [3]:
# Training data (85% of the full dataset); features and labels are stored in separate CSV files.
x = pd.read_csv("./training_data/hollstein/x_train_hypertune_svm.csv")
y = pd.read_csv("./training_data/hollstein/y_train_hypertune_svm.csv")

# Model inputs: the 13 feature columns, selected in this fixed order.
x_train = x.loc[:, [
    '443', '490', '560', '665', '705', '740', '783',
    '842', '865', '945', '1380', '1610', '2190',
]]
# Labels are kept as a single-column DataFrame.
y_train = y.loc[:, ['class']]

# Optional sanity checks (uncomment when needed):
# print(x_train.head())
# y_train.head()
# print(x_train.shape, y_train.shape)

## 2. Training

In [5]:
# Initialise the model with parameters.
# RBF-kernel support vector classifier with fixed C and gamma; probability=True
# is passed so the fitted model can also produce class-probability estimates.
svc = svm.SVC(kernel='rbf', C=0.1, gamma=0.0001, probability=True)

In [6]:
# Training
# scikit-learn expects a 1-D label array. y_train is a single-column DataFrame,
# which makes fit() emit a DataConversionWarning, so flatten it explicitly.
start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
svc.fit(x_train, np.ravel(y_train))
end = time.perf_counter()
print("Training time is :", end-start)

  y = _column_or_1d(y, warn=True)
  return f(**kwargs)


Training time is : 2674.904105901718


In [7]:
# Save Model (change the model name)
filename = './models/svm_hollstein/svm_c01_rbf_g00001_2cl_cloudcirrus_clear_hollstein_full_170622.sav'
# Create the target directory if it is missing, so a long training run is not
# lost to a FileNotFoundError at save time.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open(filename, 'wb') as model_file:
    pickle.dump(svc, model_file)

## 3. Testing

In [6]:
# load model
# SECURITY: pickle.load can execute arbitrary code -- only load model files that
# come from a trusted source.
# NOTE(review): this path differs from the file written in the training section;
# presumably a previously trained 6-class model is evaluated here -- confirm.
# filename = './models/svm_hollstein/svm_c100_rbf_g1_6cl_hypertunedata2_hollstein.sav'
filename = './models/svm_hollstein/svm_c100_rbf_g1_hollstein_13i_6o.sav'
# Use a context manager so the file handle is closed deterministically.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [7]:
# Load the test set: features and labels are stored in separate CSV files.
# Features are restricted to the same 13 columns (same order) used for training.
x_test = pd.read_csv("./training_data/hollstein/x_test_hypertune_svm.csv").loc[:, [
    '443', '490', '560', '665', '705', '740', '783',
    '842', '865', '945', '1380', '1610', '2190',
]]
# Labels are kept as a single-column DataFrame.
y_test = pd.read_csv("./training_data/hollstein/y_test_hypertune_svm.csv").loc[:, ['class']]
# print(x_test.head())
# print(y_test.head())

In [8]:
# Score the loaded model on the test set: overall accuracy plus the confusion
# matrix (rows are true classes, columns are predicted classes).
y_pred_test = model.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)
c_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_test)

print("Accuracy Score: ", acc)
print("Confusion matrix \n", c_matrix)

Accuracy Score:  0.9868165144905411
Confusion matrix 
 [[36753    53     1   744   148   479]
 [   13 16938     0     0     0    78]
 [   11     1 15463    13     0     0]
 [  596   280    76 74348    33    30]
 [  126     0     2    67 72328     0]
 [  324    75     0     2     0 20105]]


In [9]:
# Accuracy metrics (IoU Score, Precision, Recall and F-Score) for each class
def acc_metrics(y_test, y_pred, num_classes=6):
    """Compute per-class IoU, precision, recall and F-score.

    Each class is scored one-vs-rest from the 2x2 matrices returned by
    ``multilabel_confusion_matrix``.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.
    num_classes : int, default 6
        Minimum number of rows in the result. When fewer classes are found in
        the data, the remaining rows stay zero-filled (kept for backward
        compatibility). When more classes are found, the result grows to hold
        all of them instead of failing with an IndexError.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max(num_classes, n_classes_found), 4)`` whose columns
        are ``[IoU, precision, recall, F-score]``. A metric whose denominator is
        zero (e.g. precision of a class that was never predicted) is reported
        as 0.0 rather than NaN.
    """
    cm_multi = multilabel_confusion_matrix(y_test, y_pred)

    # Per-class counts taken from the one-vs-rest matrices [[tn, fp], [fn, tp]].
    tp = cm_multi[:, 1, 1].astype(float)
    fp = cm_multi[:, 0, 1].astype(float)
    fn = cm_multi[:, 1, 0].astype(float)

    def _safe_ratio(num, den):
        # Element-wise num / den that yields 0.0 where den == 0, avoiding
        # NaN results and RuntimeWarnings for degenerate classes.
        out = np.zeros_like(num, dtype=float)
        np.divide(num, den, out=out, where=den != 0)
        return out

    iou = _safe_ratio(tp, tp + fp + fn)                # IoU Score
    prec = _safe_ratio(tp, tp + fp)                    # Precision
    rec = _safe_ratio(tp, tp + fn)                     # Recall
    f_sco = _safe_ratio(2 * prec * rec, prec + rec)    # F-Score

    n_found = len(cm_multi)
    result_array = np.zeros(shape=(max(num_classes, n_found), 4))
    result_array[:n_found] = np.column_stack([iou, prec, rec, f_sco])
    return result_array

# Labels for the metrics table: one column per metric, one row per class
# (rows follow the order in which acc_metrics returns the classes).
column_values = ['IoU Score', 'Precision', 'Recall', 'F-Score']
index_values = ['clear pixels', 'Water', 'snow', 'cirrus', 'cloud', 'shadow']
# index_values = ['non clear pixels', 'clear pixels']

# Compute the per-class metrics on the test set and show them as a labelled table.
results_array = acc_metrics(y_test, y_pred_test)
df_ann = pd.DataFrame(results_array, index_values, column_values)
print("Accuracy of Test data\n", df_ann)

Accuracy of Test data
               IoU Score  Precision    Recall   F-Score
clear pixels   0.936430   0.971710  0.962675  0.967171
Water          0.971327   0.976422  0.994656  0.985455
snow           0.993319   0.994917  0.998386  0.996648
cirrus         0.975836   0.989012  0.986532  0.987770
cloud          0.994828   0.997504  0.997311  0.997407
shadow         0.953160   0.971632  0.980445  0.976018


In [11]:
# Accuracy of different types of clouds (with Pixbox data)
# Load the Pixbox reference CSV; the full frame is kept in pix_df.
pix_df = pd.read_csv("./reference_data/pixbox/pixbox_28tiles2.csv")
# pix_df2 holds only the 13 band columns, in this fixed order.
pix_df2 = pix_df.loc[:, [
    'B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7',
    'B8', 'B8A', 'B9', 'B10', 'B11', 'B12',
]]
# pix_df2.head()

In [12]:
# Predict
# pix_df2 is labelled with band names (B1..B12) while the training/test frames use
# wavelength column names. scikit-learn validates DataFrame column names against
# those seen during fit (an error from version 1.2 on), so pass the values as a
# plain positional array instead.
# NOTE(review): assumes the B1..B12 column order matches the feature order the
# model was trained with -- confirm.
y_pred_pix = model.predict(pix_df2.to_numpy())

In [13]:
# Add results to the main dataframe: store the model's predicted class for each
# Pixbox row in a new 'model_ot' column. This modifies pix_df in place; pix_df2
# (the band-only selection) is not touched by this assignment.
pix_df['model_ot'] = y_pred_pix

In [15]:
# Prints class id, number of pixels in class, true positives and accuracy (refer confluence page)
# For CLOUD_CHARACTERISTICS_ID 7 a prediction counts as correct when the model
# output is 0, 1 or 5; for every other listed id it counts as correct when the
# output is 2, 3 or 4.
# NOTE(review): presumably id 7 is the non-cloud class and the others are cloud
# types -- confirm against the Confluence page. Also note that the metrics table
# labels class 2 as 'snow', yet it is accepted as a cloud prediction here -- confirm
# the class-id mapping.
for i in [2, 3, 4, 5, 6, 7, 10, 11]:
    class_rows = pix_df[pix_df['CLOUD_CHARACTERISTICS_ID'] == i]
    size = len(class_rows)
    accepted_outputs = [0, 1, 5] if i == 7 else [2, 3, 4]
    true_size = int(class_rows['model_ot'].isin(accepted_outputs).sum())
    # An id with no rows in the CSV would otherwise raise ZeroDivisionError.
    accuracy = true_size/size if size else float('nan')
    print(i, size, true_size, accuracy)

2 2076 1108 0.5337186897880539
3 32 32 1.0
4 889 656 0.7379077615298087
5 1813 1053 0.5808052950910094
6 2625 1454 0.5539047619047619
7 8296 4703 0.5668997107039537
10 117 60 0.5128205128205128
11 128 128 1.0
