# Persistent homology on FLAVIA dataset

In [None]:
import numpy as np
import collections
import pickle
import gudhi as gd
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

PATH_CURRENT = "../" 
import sys
sys.path.append(PATH_CURRENT + "SRC/")

import data_construction
import ph
import model
import ml
import plots

## Construct dataset

The FLAVIA dataset of images of plant leaves comes from:

Stephen Gang Wu, Forrest Sheng Bao, Eric You Xu, Yu-Xuan Wang, Yi-Fan Chang, and Qiao-Liang Xiang. A leaf recognition algorithm for plant classification using probabilistic
neural network. In 2007 IEEE International Symposium on Signal Processing and Information Technology, pages 11–16. IEEE, 2007.

http://flavia.sourceforge.net/

In [None]:
# Image IDs of the complete FLAVIA dataset, written as half-open
# (start, stop) ranges because the numbering is not contiguous.
id_ranges = [
    (1001, 1617),
    (2001, 2613),
    (2616, 2676),
    (3001, 3564),
    (3566, 3622),
]
ids = [image_id for start, stop in id_ranges for image_id in range(start, stop)]

# Build the dataset and its labels for these IDs, passing 30 for both
# pixel-count arguments.
data, labels = data_construction.build_dataset_flavia(
    ids,
    num_x_pixels=30,
    num_y_pixels=30,
)

# Pickle both objects under DATASETS/flavia/ (data first, then labels).
for pickle_path, payload in (
    (PATH_CURRENT + "DATASETS/flavia/data.pkl", data),
    (PATH_CURRENT + "DATASETS/flavia/labels.pkl", labels),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)

## Calculate PH

In [None]:
# Compute the PH representation of the dataset with normalization enabled.
data_ph = ph.calculate_ph_height_point_clouds(data, normalization=True)

# Collapse axis 1 by summation; presumably this yields one scalar signature
# per sample (assumes data_ph is 2-D -- TODO confirm).
data_ph_sum = np.sum(data_ph, axis=1)

# Sanity check: print the shape of the full and the summed representation.
for ph_array in (data_ph, data_ph_sum):
    print(ph_array.shape)

## Comparison of labels and simple scalar PH signature

In [None]:
# One bar position per image along the x axis.
num_images = data.shape[0]
image_positions = np.arange(num_images)

# Two bar charts over the same x axis: first 1 - labels, then the summed
# PH values, each shown as its own figure.
charts = (
    (1 - labels, "Concavity measure across images"),
    (data_ph_sum, "Simple PH simple across images"),
)
for bar_heights, chart_title in charts:
    plt.bar(image_positions, bar_heights)
    plt.title(chart_title, fontsize=20)
    plt.show()

## Linear regression on simple PH 

In [None]:
# Turn the summed PH values into a single-column 2-D array, the feature
# layout scikit-learn estimators expect.
data_ph_sum = data_ph_sum.reshape(-1, 1)

# Hold out 33% of the samples for testing; the fixed seed keeps the split
# reproducible.
(
    data_ph_sum_train,
    data_ph_sum_test,
    labels_train,
    labels_test,
) = train_test_split(
    data_ph_sum,
    labels,
    test_size=0.33,
    random_state=42,
)

# Fit ordinary least squares on the training part only.
lr = LinearRegression().fit(data_ph_sum_train, labels_train)

# Plot the fitted model against the full dataset, then score it on the
# held-out part (reported below as a mean squared error).
model.plot_regression_line(data_ph_sum, labels, lr)
mse = model.get_score(data_ph_sum_test, labels_test, lr)
print("mean squared error = ", np.around(mse, 5))

## Linear regression on PH 

In [None]:
# Same 33% hold-out and seed as for the scalar signature, but using the full
# PH representation as features.
(
    data_ph_train,
    data_ph_test,
    labels_train,
    labels_test,
) = train_test_split(
    data_ph,
    labels,
    test_size=0.33,
    random_state=42,
)

# Fit ordinary least squares on the training part only.
lr = LinearRegression().fit(data_ph_train, labels_train)

# Plot the fitted model against the full dataset, then score it on the
# held-out part (reported below as a mean squared error).
model.plot_regression_line(data_ph, labels, lr)
mse = model.get_score(data_ph_test, labels_test, lr)
print("mean squared error = ", np.around(mse, 5))

In [None]:
# Inspect the samples the fitted regressor `lr` gets wrong.
# NOTE: predictions are made for the whole dataset, i.e. for the samples
# used to fit `lr` as well as the held-out ones.
predictions = lr.predict(data_ph)

# A prediction counts as incorrect when it deviates from the label by more
# than this tolerance.
eps = 0.1
ids_wrong = []
for i, (image, prediction, label) in enumerate(zip(data, predictions, labels)):
    # Two-sided test: the absolute error is compared against eps, so that
    # under-predictions are flagged as well as over-predictions (a signed
    # difference would silently ignore every prediction that is too low).
    if abs(prediction - label) > eps:
        # Show the offending image; ids[i] is the image ID at the same
        # position as the sample in `data`.
        plt.imshow(image)
        plt.title("label=%.2f, prediction=%.2f, image id=%d" % (label, prediction, ids[i]))
        plt.show()
        ids_wrong.append(ids[i])

# Every flagged sample contributed exactly one ID, so the count is the
# length of the list.
num_incorrect_predictions = len(ids_wrong)
num_samples = len(data)
perc_incorrect_predictions = num_incorrect_predictions / num_samples
print("Percenatage of incorrect predictions = ", np.around(100 * perc_incorrect_predictions, 2), "%. \n")
print("ids_wrong = ", ids_wrong)