In [1]:
from sklearn.neighbors import BallTree
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale



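The Hopkins statistic measures the clustering tendency of a data set, i.e. how far the data depart from a uniformly random spatial distribution. It belongs to the family of sparse sampling tests. As computed below, values near 0.5 indicate data that are no more structured than uniformly random points, while values approaching 1 indicate strongly clustered data.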

In [2]:
# Number of points used for the Hopkins test; it must not exceed the number
# of rows in the data set (checked before sampling).
sampling_size = 350

# Data set under test. Alternative input (disabled):
#   pd.read_csv('./Absenteeism_at_work.csv', sep=';')
dataset = pd.read_pickle('./state_clubbed_df.pickle')

In [3]:
# Report the dimensions of the data set as loaded.
rows = dataset.shape[0]
columns = dataset.shape[1]
print("Total rows : {}\t Total columns: {}".format(rows, columns))

Total rows : 527	 Total columns: 13


In [4]:
# Preview the first rows of the data as loaded, before any preprocessing.
dataset.head()

Unnamed: 0,state,year,murder,rape,foeticide,kidnapping and abduction,abetment of suicide,exposure and abandonment,procuration of minor girls,selling girls for prostitution,prohibition of child marriage act,other crimes,total
0,andhra pradesh,2001,35.0,84.0,0.0,57.0,7.0,22.0,12.0,0.0,6.0,47.0,270.0
1,arunachal pradesh,2001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,assam,2001,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
3,bihar,2001,2.0,16.0,0.0,26.0,0.0,1.0,16.0,1.0,2.0,18.0,83.0
4,chhattisgarh,2001,14.0,150.0,5.0,46.0,1.0,15.0,0.0,0.0,0.0,354.0,585.0


In [5]:
# Scaling and Preprocessing the dataset
#
# Remove the 'state', 'year' and 'total' columns, then standardize every
# remaining column with sklearn's scale(). The drop is done without
# inplace=True so the loaded frame is never left half-processed if scaling
# fails. scale() returns a plain array, so the rebuilt frame carries default
# integer column labels (0..n-1) instead of the original column names.
#
# NOTE: `dataset` is rebound here, so the data must be reloaded before this
# cell can be run a second time.

dataset = pd.DataFrame(
    scale(dataset.drop(columns=['state', 'year', 'total']))
)

In [6]:
# Sample n observations from D : P
#
# The sample is drawn without replacement, so n cannot exceed the number of
# rows in D; fail early with a clear message instead of letting pandas raise.

if sampling_size > dataset.shape[0]:
    raise ValueError(
        'The sample size ({}) is bigger than the number of rows in D ({})'
        .format(sampling_size, dataset.shape[0]))

# NOTE: no random_state is passed, so a different sample is drawn on each run.
sample_dataset = dataset.sample(n=sampling_size)
sample_dataset.shape

(350, 10)

In [7]:
# Preview the sampled rows (the index shows which rows of `dataset` were drawn).
sample_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
251,0.205333,-0.246456,-0.288881,0.161355,-0.281905,2.47758,-0.139839,-0.183679,2.35679,0.04663
443,0.499617,0.754637,-0.288881,0.138,0.05574,-0.398377,-0.216594,-0.183679,6.372755,-0.151363
170,-0.505852,-0.55617,-0.422895,-0.387492,-0.281905,-0.435728,-0.216594,-0.183679,-0.442216,-0.38687
376,-0.407758,-0.484216,-0.422895,-0.350336,-0.281905,-0.435728,-0.216594,-0.183679,-0.442216,-0.361861
206,-0.505852,-0.553041,-0.422895,-0.390677,-0.281905,-0.435728,-0.216594,-0.183679,-0.442216,-0.38687


In [8]:
# Distances X: for every sampled point, the distance to its nearest
# neighbour in D.
#
# Two neighbours are requested because each sampled point is itself a row of
# D, so its closest match is at distance 0 (the point itself); the last of
# the two returned columns holds the distance to the nearest other row.

tree = BallTree(dataset, leaf_size=2)
dist, _ = tree.query(sample_dataset, k=2)
sample_knn_dist = dist[:, -1]

In [9]:
# Inspect the nearest-neighbour distances computed for the sampled points.
sample_knn_dist

array([6.61834747e-01, 1.77271333e+00, 3.78086878e-03, 4.63737317e-02,
       3.30363296e-03, 1.02953566e+00, 3.81836746e-01, 2.50470205e-01,
       2.14411993e-02, 1.81471642e-02, 2.38881198e-01, 5.79036287e+00,
       1.16262272e+00, 0.00000000e+00, 6.85397542e-01, 5.45582309e-01,
       2.40330236e-01, 5.44952346e-01, 3.12841782e-03, 4.30297107e+00,
       2.04115495e-02, 3.98495585e-01, 2.93591612e-02, 2.93261373e+00,
       2.31588838e-01, 3.78086878e-03, 4.90846646e+00, 5.62756423e-03,
       1.24480535e+00, 8.46560314e-03, 3.36201164e+00, 3.41071097e-01,
       2.81669321e-01, 3.38606227e-01, 1.04243160e-01, 1.77509966e-01,
       1.24380464e+00, 1.04827806e+00, 7.09143749e-02, 0.00000000e+00,
       3.72983524e-02, 2.45043610e+00, 3.38324110e-02, 4.15575684e-02,
       5.62756423e-03, 3.12836941e-02, 7.04209396e-02, 8.63783511e-01,
       5.24376437e-02, 1.91928332e-01, 7.72327120e-01, 0.00000000e+00,
       6.29781052e-01, 4.48910002e-02, 1.08853347e-01, 1.64811693e+00,
      

In [10]:
# Randomly simulate n points with the same variation as in D : Q.
#
# Every feature is drawn independently from a uniform distribution spanning
# that feature's [min, max] range in D, i.e. the points fill D's bounding box.

max_data = dataset.max()
min_data = dataset.min()

# One vectorized draw: the per-column bounds broadcast across the rows, so
# this works for any number of columns (including a single one) and does not
# depend on how the columns of D are labelled.
uniform_obsv = np.random.uniform(
    low=min_data.values,
    high=max_data.values,
    size=(sampling_size, len(max_data)),
)

# Reuse D's column labels so the simulated points line up with D's features.
uniform_obsv_df = pd.DataFrame(uniform_obsv, columns=dataset.columns)

In [11]:
# Sanity check: expect (sampling_size, number of columns in `dataset`).
uniform_obsv_df.shape

(350, 10)

In [12]:
# Preview the first simulated uniform points.
uniform_obsv_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.062445,6.51666,4.598102,-0.378325,2.786484,2.519009,2.248037,4.705813,5.202199,2.38778
1,4.140199,4.751915,4.315821,5.262096,9.780663,4.666201,12.505019,5.969092,2.979575,-0.095969
2,1.131441,3.866245,6.097364,4.391449,5.113613,3.447965,9.851231,-0.090195,3.984842,4.099522
3,5.852598,0.241065,9.05233,6.2579,1.150803,2.612642,12.463085,6.008442,8.45169,6.386666
4,4.356451,4.224753,3.052427,1.518536,0.58597,2.071665,4.532575,1.154561,4.829143,1.539132


In [13]:
# Distances Y: for every simulated uniform point, the distance to its
# nearest neighbour in D.
#
# The simulated points are not rows of D, so a single neighbour (k=1) is
# enough. query() returns one column per requested neighbour, which leaves
# the distances as an (n, 1) array rather than a flat vector.

tree = BallTree(dataset, leaf_size=2)
uniform_knn_dist, _ = tree.query(uniform_obsv_df, k=1)
dist = uniform_knn_dist

In [14]:
# Calculate the Hopkins Score
#
# H = sum(Y) / (sum(X) + sum(Y)), where X holds the nearest-neighbour
# distances of the points sampled from D and Y those of the simulated
# uniform points. H close to 1 suggests clustered data; H around 0.5
# suggests data that look uniformly random.

# np.sum reduces over all elements, so each total is a scalar whether the
# distances are stored as a flat vector or as an (n, 1) column.
x = np.sum(sample_knn_dist)
y = np.sum(uniform_knn_dist)

# Both totals are sums of non-negative distances, so the denominator is zero
# only when every distance is zero.
if x + y == 0:
    raise ValueError('The denominator of the hopkins statistics is null')

h_stat = y / (x + y)
print("The hopkins statistics measure is {}".format(h_stat))

The hopkins statistics measure is 0.9571890578451321
