# Análisis Clusters

Análisis de los hiperparámetros elegidos en el proceso de optimización realizado en `hyper_velocity_stars_detection/jobs/google_jobs/download_globular_clusters.py`

### Requirements

In [35]:
%load_ext autoreload
%autoreload 2


import sys
import os

import numpy as np
from dotenv import load_dotenv
from google.cloud import storage
from tqdm import trange

import pandas as pd
from sklearn.cluster import DBSCAN, HDBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

    
project_path = os.path.abspath(os.path.join(os.getcwd(), "../../src"))  # Subir un nivel
if project_path not in sys.path:
    sys.path.append(project_path)
            
from hyper_velocity_stars_detection.jobs.utils import read_clusters_harris_catalog
from hyper_velocity_stars_detection.jobs.google_jobs.utils import load_globular_cluster, ProjectDontExist
from hyper_velocity_stars_detection.cluster_detection.clustering_methods import GaussianMixtureClustering

load_dotenv("../../data/.env")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../../data/hvs-storage.json"

PATH = "../../data/report_notebook"

RADIUS_SCALE = 1
CATALOG = "gaiadr3"
FILTERS = {"ast_params_solved": 3, "ruwe": 1.4, "v_periods_used": 10, "min_parallax": 0}
PROJECT = os.environ["PROJECT_ID"]
BUCKET = os.environ["BUCKET"]
BUCKET_PATH = "report/gc_clusters/"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Descarga de los datos generados de la optimización.

In [36]:
clusters_specials = ["ngc 104", "ngc 5139", "ngc 5286", "ngc 6266", "ngc 6388"]
clusters = read_clusters_harris_catalog()
gc_objects = {}
for pos in trange(len(clusters)):
    cluster = clusters[pos].name
    try:
        gc_objects[cluster] = load_globular_cluster(cluster, PROJECT, BUCKET, BUCKET_PATH)
    except ProjectDontExist:
        continue


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 147/147 [03:06<00:00,  1.27s/it]


Calculo de resumen de los hiperpaŕametros del proceso de optimización `hyper_velocity_stars_detection/jobs/google_jobs/download_globular_clusters.py`

In [27]:
def get_algorithm_type(gc_object: GaussianMixtureClustering) -> str:
    model = gc.clustering_results.clustering.model
    if isinstance(model, HDBSCAN):
        return "HDBSCAN"
    if isinstance(model, DBSCAN):
        return "DBSCAN"
    if isinstance(model, GaussianMixtureClustering):
        return "GaussianMixtureModel"
    return None

def get_noise_method(gc_object: GaussianMixtureClustering) -> str:
    noise_method = gc.clustering_results.clustering.noise_method
    if noise_method is not None:
        model = noise_method.model
        if isinstance(model, LocalOutlierFactor):
            return "LocalOutlierFactor"
        if isinstance(model, IsolationForest):
            return "IsolationForest"
    return None


def get_scaler(gc_object: GaussianMixtureClustering) -> str:
    scaler = gc.clustering_results.clustering.scaler
    if scaler is not None:
        if isinstance(scaler, StandardScaler):
            return "StandardScaler"
        if isinstance(scaler, MinMaxScaler):
            return "MinMaxScaler"
    return None

In [40]:
results = pd.DataFrame(columns=["Name", "Cluster Algorithm", "Noise Method", "Scaler"])

for pos, cluster in enumerate(gc_objects.keys()):
    gc = gc_objects[cluster]
    algorithm, noise, scaler = get_algorithm_type(gc), get_noise_method(gc), get_scaler(gc)
    results.loc[pos] = (gc.name,algorithm, noise, scaler)
results

Unnamed: 0,Name,Cluster Algorithm,Noise Method,Scaler
0,NGC_104,HDBSCAN,,StandardScaler
1,NGC_288,GaussianMixtureModel,IsolationForest,StandardScaler
2,NGC_362,HDBSCAN,,StandardScaler
3,NGC_1261,HDBSCAN,,StandardScaler
4,NAME_E_1,DBSCAN,,StandardScaler
...,...,...,...,...
115,M_15,HDBSCAN,,StandardScaler
116,M_2,DBSCAN,,StandardScaler
117,M_30,DBSCAN,,StandardScaler
118,Cl_Pal_12,GaussianMixtureModel,IsolationForest,MinMaxScaler


In [45]:
results.groupby(["Cluster Algorithm", "Noise Method"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Scaler
Cluster Algorithm,Noise Method,Unnamed: 2_level_1,Unnamed: 3_level_1
GaussianMixtureModel,IsolationForest,12,12
GaussianMixtureModel,LocalOutlierFactor,4,3


In [42]:
results["Noise Method"].value_counts()


Noise Method
IsolationForest       12
LocalOutlierFactor     4
Name: count, dtype: int64

In [43]:
results["Scaler"].value_counts()

Scaler
StandardScaler    112
MinMaxScaler        7
Name: count, dtype: int64