## Hopkins Statistics

In [4]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mpl_scatter_density 
from matplotlib.colors import LinearSegmentedColormap
import warnings
warnings.filterwarnings('ignore')

In [43]:
def hopkins_statistic(data, n_neighbors=10):
    """
    Calculate the Hopkins statistic for a given dataset to determine if it has potential clusters or not.

    Parameters:
    -----------
    data : numpy array
        The dataset to calculate the Hopkins statistic on.
    n_neighbors : int, optional (default=10)
        The number of nearest neighbors to use when calculating the Hopkins statistic.

    Returns:
    --------
    float
        The Hopkins statistic for the given dataset.
    """
    # Calculate the number of samples in the dataset
    n = data.shape[0]

    # Generate random points in the same space as the data
    rand_points = np.random.rand(n, data.shape[1])

    # Fit a k-nearest neighbors model to the data
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(data)

    # Calculate the distance between each data point and its nearest neighbor
    data_distances, _ = knn.kneighbors(data)

    # Fit another k-nearest neighbors model to the random points
    knn_rand = NearestNeighbors(n_neighbors=n_neighbors)
    knn_rand.fit(rand_points)

    # Calculate the distance between each random point and its nearest neighbor
    rand_distances, _= knn_rand.kneighbors(rand_points)

    # Calculate the Hopkins statistic
    numerator = np.sum(data_distances)
    denominator = np.sum(data_distances) + np.sum(rand_distances)
    hopkins_stat = numerator / denominator

    print('The dataset has a prob. of', round(hopkins_stat *100, 2), '% of containing a cluster')

In [44]:
df = pd.read_csv('../data/NGC188-result.csv')

In [45]:
df = df[['ra','dec','parallax','pmra','pmdec','bp_rp','r_est', 'phot_g_mean_mag']].dropna(axis=0)

In [46]:
hopkins_statistic(data=df, n_neighbors=5)

The dataset has a prob. of 94.93 % of containing a cluster
