## FOSS4G Belem - Brazil 2024
## Workshop
### Setting the scene - Machine learning: an intro

**Dr. Rosa Aguilar**<br>
r.aguilar@utwente.nl<br>
https://www.linkedin.com/in/rosamaguilar/<br>

**Part 2 - Clustering** <br>

In this notebook, we will perform a regionalization (clustering) of the study
area based on physical variables: Land Surface Temperature (LST),
vegetation indices (EVI/NDVI) and a water index (NDWI).

The steps are as follows: <br>
<ol>
    <li>Aggregating temporal variables</li>
    <li>Creating a datacube with the aggregates</li>
    <li>Executing a k-means clustering</li>
    <li>Inspecting results and elbow method</li>
</ol>

In [None]:
# import libraries

import matplotlib.pyplot as plt
import numpy as np

from datetime import datetime
import rasterio
from rasterio.plot import show
from rasterio.transform import from_origin

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.impute import SimpleImputer

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor

In [None]:
# define a function to create the datacube
def _read_band(date, folder, suffix):
    """Read band 1 of ``./belem/<folder>/<date>.<suffix>.tif`` with NaNs filled."""
    path = f'./belem/{folder}/{date}.{suffix}.tif'
    # the context manager closes the dataset as soon as the band is in memory
    with rasterio.open(path) as src:
        band = src.read(1)
    # SimpleImputer treats each raster column as a feature, so a NaN pixel is
    # replaced by the mean of its own raster column.
    # NOTE(review): per the scikit-learn docs, a column made up only of NaN is
    # dropped by default, which would change the raster width - confirm the
    # inputs never contain such a column.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    return imputer.fit_transform(band)


def create_datacube(date):
    """Build the per-pixel feature table for one date.

    Reads band 1 of the NDVI, EVI, NDWI and LST rasters of ``date``, fills
    missing values (NaN), rescales LST and flattens everything into a table.

    Parameters
    ----------
    date : str
        Date token used in the file names, e.g. ``'2021_01_23'``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per pixel (row-major ``ravel`` order) and four
        columns in the order NDVI, EVI, NDWI, LST.
    """
    NDVI = _read_band(date, 'NDVI', 'NDVI')
    EVI = _read_band(date, 'EVI', 'EVI')
    LST = _read_band(date, 'LST', 'LST_Day_1km')
    NDWI = _read_band(date, 'NDWI', 'NDWI')

    # Raw LST values below 7500 are overwritten with the (truncated) mean of
    # the band before rescaling.
    # NOTE(review): presumably these are fill/invalid values - confirm. The
    # mean is computed with those values still included.
    mean = int(LST.mean())
    LST[LST < 7500] = mean
    # scale the raw values by 0.02 and shift by 273.15 to get degrees Celsius
    # (assumes the scaled value is in Kelvin - TODO confirm against the product)
    LST = LST * 0.02 - 273.15

    table = np.stack((NDVI.ravel(), EVI.ravel(), NDWI.ravel(), LST.ravel()), axis=1)

    return table

In [None]:
# read an input raster to use it as a template when saving results;
# the open dataset is also used below for its bounds (plot extent) and its
# shape (to reshape flat label vectors back onto the raster grid)
template = './belem/EVI/2021_01_20.EVI.tif'
raster_template = rasterio.open(template)

In [None]:
# the eight consecutive days 20-27 January 2021, formatted like the file names
dates = [f'2021_01_{day}' for day in range(20, 28)]

In [None]:
# Build one min-max scaled feature table per date.
results = []
for d in dates:
    print('processing ' + d)
    cube = create_datacube(d)
    # a fresh scaler per date: every variable is rescaled to [0, 1] using
    # only the values of that date
    scaler = MinMaxScaler()
    results.append(scaler.fit_transform(cube))

In [None]:
# results

In [None]:
# another visualization - one map per date, next to each other
# Each entry of `results` is a (n_pixels, 4) table, so a single column has to
# be reshaped back onto the raster grid before it can be drawn as an image.
band = 0  # column order from create_datacube: 0=NDVI, 1=EVI, 2=NDWI, 3=LST
bounds = raster_template.bounds
# rasterio bounds are (left, bottom, right, top) while matplotlib's `extent`
# expects (left, right, bottom, top)
extent = (bounds.left, bounds.right, bounds.bottom, bounds.top)
fig, axes = plt.subplots(2, 4, figsize=(12, 10))
for ax, r, title in zip(axes.ravel(), results, dates):
    image = r[:, band].reshape(raster_template.shape)
    print(image.shape)
    ax.matshow(image, extent=extent, cmap='RdYlGn')
    ax.set_title(title, y=1.15)
In [None]:
# join the per-date tables column-wise: one row per pixel, four feature
# columns for every date
table_results = np.concatenate(results, axis=1)
table_results

In [None]:
# (rows, columns) of the stacked table: one row per pixel and four feature
# columns (NDVI, EVI, NDWI, LST) for each date
table_results.shape

In [None]:
# First segmentation attempt: split the pixels into two groups
two_group_model = KMeans(n_clusters=2, random_state=0)
kmeans_labels = two_group_model.fit_predict(table_results)
kmeans_labels

In [None]:
# the label vector is flat: one cluster id per row (pixel) of table_results
kmeans_labels.shape

In [None]:
# put the flat label vector back onto the raster grid and draw it
result = np.reshape(kmeans_labels, raster_template.shape)
show(result, cmap='RdYlGn')  # alternative colormap: 'RdBu'

In [None]:
# the other extreme: segment the pixels into nine groups
nine_group_model = KMeans(n_clusters=9, random_state=0)
kmeans_labels = nine_group_model.fit_predict(table_results)
result = np.reshape(kmeans_labels, raster_template.shape)

# draw the label map and attach a colorbar to the image that `show` created
fig, ax = plt.subplots(figsize=(5, 5))
retted = show(result, ax=ax, cmap='RdYlGn', title='Nine Clusters')
im = retted.get_images()[0]
fig.colorbar(im, ax=ax)

In [None]:
# determine the best number of groups via the Elbow method:
# fit k-means for k = 1..9 and plot the inertia (sum of squared distances of
# the samples to their closest cluster centre) against k
Sum_of_squared_distances = []
K = range(1, 10)
for num_clusters in K:
    # fixed seed, as in the other clustering cells, so the curve is reproducible
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    kmeans.fit(table_results)
    Sum_of_squared_distances.append(kmeans.inertia_)
    # report only the value for the current k, not the whole growing list
    print(num_clusters, kmeans.inertia_)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Sum of squared distances/Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# execute k-means with five clusters
kmeans_labels = KMeans(n_clusters=5, random_state=0).fit_predict(table_results)
# flat label vector -> raster grid
result = kmeans_labels.reshape(raster_template.shape)
fig, ax = plt.subplots(figsize=(5, 5))
# the title now matches n_clusters=5
retted = rasterio.plot.show(result, ax=ax, cmap='RdYlGn', title='Five Clusters')
im = retted.get_images()[0]
fig.colorbar(im, ax=ax)

In [None]:
# execute k-means with six clusters
kmeans_labels = KMeans(n_clusters=6, random_state=0).fit_predict(table_results)
# flat label vector -> raster grid
result = kmeans_labels.reshape(raster_template.shape)
fig, ax = plt.subplots(figsize=(5, 5))
# the title now matches n_clusters=6
retted = rasterio.plot.show(result, ax=ax, cmap='RdYlGn', title='Six Clusters')
im = retted.get_images()[0]
fig.colorbar(im, ax=ax)

In [None]:
# save the prediction/result
# `result` holds the label raster of the most recently executed clustering cell

# open the original raster to copy its georeferencing metadata
with rasterio.open(template) as src:
    meta = src.meta.copy()

# The output is a single band of small non-negative cluster ids: store it as
# uint8 (enough for up to 256 clusters) and drop the template's nodata value,
# which describes the template's data rather than cluster ids.
labels = result.astype('uint8')
meta.update(count=1, dtype='uint8', nodata=None)

# Path to save the output GeoTIFF
output_path = './belem/image_clust2024.tif'

# Write the label raster as band 1 of the new GeoTIFF file
with rasterio.open(output_path, 'w', **meta) as dst:
    dst.write(labels, 1)

print(f"Clustered image saved to {output_path}")

### Questions?
Stay in touch <br>
r.aguilar@utwente.nl <br>
rosamaguilar@gmail.com<br>
https://www.linkedin.com/in/rosamaguilar/