## SETUP

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import metrics
import mlflow
from math import radians, cos, sin, asin, sqrt


sample_url = "floods_geocoordinates.csv"

sample_data = pd.read_csv(sample_url)

## TREINAMENTO

In [3]:
#------------------------HAVERSINE DISTANCE------------------------
def haversine(coordinates_1, coordinates_2):
    
    latitude_1, longitude_1 = coordinates_1
    latitude_2, longitude_2 = coordinates_2
    longitude_1, latitude_1, longitude_2, latitude_2 = map(radians, [longitude_1, latitude_1, longitude_2, latitude_2])

    # haversine formula 
    distance_longitude = longitude_2 - longitude_1 # diference between two longitude
    distance_latitude = latitude_2 - latitude_1    # diference between two latitude
    
    aux = sin(distance_latitude/2)**2 + cos(latitude_1) * cos(latitude_2) * sin(distance_longitude/2)**2
    result = 2 * asin(sqrt(aux)) 
    
    radius = 6371 # Radius of earth in kilometers
    
    return result * radius

In [4]:
from scipy.spatial.distance import pdist, squareform

sample_data = sample_data[['latitude', 'longitude']]

#---------------------DISTANCE MATRIX---------------------
distance_matrix = squareform(pdist(sample_data, (lambda u,v: haversine(u,v))))

#---------------------CLUSTERS AGGLOMERATIVE - COMPLETE---------------------
agglomerative_clustering = AgglomerativeClustering(n_clusters = None, affinity='precomputed', linkage='complete', distance_threshold=0.9, compute_full_tree=True)  
rotulo_agglomerative = agglomerative_clustering.fit(distance_matrix)


## VALIDAÇÃO

In [5]:
#---------------------MÉTRICAS---------------------
silhoutte_metric = metrics.silhouette_score(distance_matrix, rotulo_agglomerative.labels_)
print("silhoutte_metric: ", silhoutte_metric)
calinski_harabasz_metric = metrics.calinski_harabasz_score(distance_matrix, rotulo_agglomerative.labels_)
print("calinski_harabasz_metric: ", calinski_harabasz_metric)
davies_bouldin_metric = metrics.davies_bouldin_score(distance_matrix, rotulo_agglomerative.labels_)
print("davies_bouldin_metric: ", davies_bouldin_metric)

silhoutte_metric:  0.6255070649341068
calinski_harabasz_metric:  6237.508216167444
davies_bouldin_metric:  0.3603617663490956


## VERSIONAMENTO

In [6]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%d/%m/%Y - %H:%M:%S")
current_time

'14/12/2024 - 00:10:58'

In [7]:
mlflow.set_tracking_uri('http://localhost:5001/')
mlflow.set_experiment(f'flood_areas_identifier')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1734145744440, experiment_id='1', last_update_time=1734145744440, lifecycle_stage='active', name='flood_areas_identifier', tags={}>

In [8]:
distance_threshold_list = [100,200,300,400,500,600,700,800,900,1000,1100,1200,1300]

for i in distance_threshold_list:

    run_name = f'floods_identifier_distance_threshold_{i}'

    # Start run
    mlflow.start_run(run_name=run_name)

    #---------------------CLUSTERS AGGLOMERATIVE - WARD---------------------
    agglomerative_clustering = AgglomerativeClustering(n_clusters = None, metric='precomputed', linkage='complete', distance_threshold=i/1000, compute_full_tree=True)  
    agglomerative_clustering = agglomerative_clustering.fit(distance_matrix)

    silhoutte_metric = metrics.silhouette_score(distance_matrix, agglomerative_clustering.labels_)
    calinski_harabasz_metric = metrics.calinski_harabasz_score(distance_matrix, agglomerative_clustering.labels_)
    davies_bouldin_metric = metrics.davies_bouldin_score(distance_matrix, agglomerative_clustering.labels_)

    parametros = {
        "n_clusters": None,
        "affinity": 'precomputed',
        "linkage": "complete",
        "distance_threshold": i,
        "compute_full_tree": True,
    }

    print(parametros)

    metricas = {
        "num_generated_clusters": len(set(agglomerative_clustering.labels_)),
        "silhouette": silhoutte_metric,
        "calinski_harabasz": calinski_harabasz_metric,
        "davies_bouldin_metric": davies_bouldin_metric,
    }

    print(metricas)

    mlflow.set_tag("data", current_time)
    mlflow.log_params(parametros)
    mlflow.log_metrics(metricas)

    mlflow.sklearn.log_model(agglomerative_clustering, "agglomerative_clustering")

    mlflow.end_run()



{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 100, 'compute_full_tree': True}
{'num_generated_clusters': 506, 'silhouette': 0.6456451698205722, 'calinski_harabasz': 320821.95671159064, 'davies_bouldin_metric': 0.08448774763158479}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 200, 'compute_full_tree': True}
{'num_generated_clusters': 434, 'silhouette': 0.6432629106826596, 'calinski_harabasz': 65414.09020961858, 'davies_bouldin_metric': 0.13232032099940952}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 300, 'compute_full_tree': True}
{'num_generated_clusters': 374, 'silhouette': 0.637188536958003, 'calinski_harabasz': 29248.006338425355, 'davies_bouldin_metric': 0.18035678420214135}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 400, 'compute_full_tree': True}
{'num_generated_clusters': 325, 'silhouette': 0.6422741295454057, 'calinski_harabasz': 18049.26736773855, 'davies_bouldin_metric': 0.23849659285162952}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 500, 'compute_full_tree': True}
{'num_generated_clusters': 287, 'silhouette': 0.6275432197373685, 'calinski_harabasz': 11797.753517548437, 'davies_bouldin_metric': 0.2860023604898106}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 600, 'compute_full_tree': True}
{'num_generated_clusters': 256, 'silhouette': 0.63366160352704, 'calinski_harabasz': 9366.019105304522, 'davies_bouldin_metric': 0.31667014442800334}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 700, 'compute_full_tree': True}
{'num_generated_clusters': 234, 'silhouette': 0.633091393611812, 'calinski_harabasz': 8224.676595377652, 'davies_bouldin_metric': 0.33473168773430717}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 800, 'compute_full_tree': True}
{'num_generated_clusters': 217, 'silhouette': 0.6303548435533345, 'calinski_harabasz': 7275.816355435488, 'davies_bouldin_metric': 0.3527423029615411}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 900, 'compute_full_tree': True}
{'num_generated_clusters': 199, 'silhouette': 0.6255070649341068, 'calinski_harabasz': 6237.508216167444, 'davies_bouldin_metric': 0.3603617663490956}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 1000, 'compute_full_tree': True}
{'num_generated_clusters': 183, 'silhouette': 0.6220386455000728, 'calinski_harabasz': 5525.159166869797, 'davies_bouldin_metric': 0.3675144582800715}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 1100, 'compute_full_tree': True}
{'num_generated_clusters': 172, 'silhouette': 0.6038880737529144, 'calinski_harabasz': 4772.1663687048995, 'davies_bouldin_metric': 0.41736905845777844}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 1200, 'compute_full_tree': True}
{'num_generated_clusters': 158, 'silhouette': 0.5869519761189761, 'calinski_harabasz': 4114.094689937569, 'davies_bouldin_metric': 0.4573280673982022}




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 1300, 'compute_full_tree': True}
{'num_generated_clusters': 151, 'silhouette': 0.5888104614104508, 'calinski_harabasz': 4043.4187548152804, 'davies_bouldin_metric': 0.46798412136560263}
