In [1]:
from sklearn.neighbors import BallTree
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale



The Hopkins statistic is a way of measuring the cluster tendency of a data set. It belongs to the family of sparse sampling tests.

Dataset used : [https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work](https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work)

In [2]:
# dataset = pd.read_csv('./Absenteeism_at_work.csv', sep=';')
dataset = pd.read_pickle('./state_clubbed_df.pickle')
sampling_size = 350

In [3]:
rows, columns = dataset.shape
print ("Total rows : {}\t Total columns: {}".format(rows, columns))

Total rows : 527	 Total columns: 13


In [4]:
dataset.head()

Unnamed: 0,state,year,murder,rape,foeticide,kidnapping and abduction,abetment of suicide,exposure and abandonment,procuration of minor girls,selling girls for prostitution,prohibition of child marriage act,other crimes,total
0,andhra pradesh,2001,35.0,84.0,0.0,57.0,7.0,22.0,12.0,0.0,6.0,47.0,270.0
1,arunachal pradesh,2001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,assam,2001,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
3,bihar,2001,2.0,16.0,0.0,26.0,0.0,1.0,16.0,1.0,2.0,18.0,83.0
4,chhattisgarh,2001,14.0,150.0,5.0,46.0,1.0,15.0,0.0,0.0,0.0,354.0,585.0


In [5]:
# Scaling and Preprocessing the dataset

dataset.drop(['state', 'year', 'total'], axis=1, inplace=True)
dataset = pd.DataFrame(scale(dataset))

In [6]:
# Sample n observations from D : P

if sampling_size > dataset.shape[0]:
    raise Exception(
        'The number of sample of sample is bigger than the shape of D')

sample_dataset = dataset.sample(n=sampling_size)
sample_dataset.shape

(350, 10)

In [7]:
sample_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
447,0.315689,0.623244,-0.020852,1.081761,0.562208,-0.417052,5.112365,6.105498,4.790708,-0.201382
312,0.38926,0.404255,-0.422895,1.994737,-0.281905,0.087174,-0.205629,-0.001384,-0.442216,-0.01381
500,-0.444543,-0.468574,-0.422895,-0.132708,-0.281905,-0.136927,-0.205629,-0.183679,-0.442216,-0.382702
220,-0.481329,-0.484216,-0.422895,-0.384307,-0.281905,-0.435728,0.05753,-0.183679,0.044568,-0.361861
497,0.475093,-0.37785,-0.288881,1.295143,-0.281905,0.684775,-0.205629,-0.183679,0.653047,-0.09926


In [8]:
# Get the distance to their neirest neighbors in D : X

tree = BallTree(dataset, leaf_size=2)
dist, _ = tree.query(sample_dataset, k=2)
sample_knn_dist = dist[:, 1]

In [9]:
sample_knn_dist

array([1.98432335e+00, 1.24380464e+00, 1.65078318e-01, 2.80763145e-01,
       1.27192284e+00, 1.43961009e-01, 7.13527214e-01, 1.56073992e+00,
       7.88201973e-01, 2.54348827e+00, 6.08961683e-01, 1.01815988e-01,
       1.12090617e-01, 2.12638560e-02, 3.12836941e-02, 1.28651724e-01,
       0.00000000e+00, 1.27321961e+00, 3.12841782e-03, 2.52275265e-02,
       0.00000000e+00, 6.86162104e-01, 1.64765904e-01, 0.00000000e+00,
       8.53659731e-01, 7.76514413e-01, 2.03811404e+00, 0.00000000e+00,
       2.93217870e-01, 9.22338609e+00, 2.94403752e+00, 1.06265084e+00,
       0.00000000e+00, 7.28329460e-01, 4.46429525e-03, 2.00096472e-01,
       1.28937023e+00, 7.15415690e-01, 3.58582262e-01, 1.29281065e-01,
       5.17869113e-01, 1.82038519e-01, 2.93261373e+00, 1.31839299e-01,
       1.04272414e-01, 1.30796992e-02, 0.00000000e+00, 7.80317828e-01,
       2.53197552e-01, 8.10604407e-02, 1.51655223e-02, 7.32356156e-03,
       9.07253083e-02, 3.78379091e-02, 2.53197552e-01, 1.79859330e+00,
      

In [10]:
# Randomly simulate n points with the same variation as in D : Q.

max_data = dataset.max()
min_data = dataset.min()

uniform_sel_x = np.random.uniform(min_data[0], max_data[0], sampling_size)
uniform_sel_y = np.random.uniform(min_data[1], max_data[1], sampling_size)

uniform_obsv = np.column_stack((uniform_sel_x, uniform_sel_y))
if len(max_data) >= 2:
    for i in range(2, len(max_data)):
        temp = np.random.uniform(min_data[i], max_data[i], sampling_size)
        to_stack = (uniform_obsv, temp)
        uniform_obsv = np.column_stack(to_stack)

uniform_obsv_df = pd.DataFrame(uniform_obsv)
uniform_obsv_df.columns = dataset.columns

In [11]:
uniform_obsv_df.shape

(350, 10)

In [12]:
uniform_obsv_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.177442,4.952025,-0.025218,3.843219,3.749885,1.917612,0.08073,2.575391,1.307402,2.801802
1,5.388605,1.094842,2.422359,4.935218,9.240526,2.626178,12.884304,10.115116,2.575047,6.187084
2,1.174849,0.009273,7.35157,0.922745,4.356972,0.190388,9.314885,8.039558,7.05487,-0.194636
3,6.071437,4.231859,7.472364,0.406209,6.812481,2.34591,10.91502,3.746295,-0.043107,1.231912
4,1.001363,2.133695,3.57091,4.18102,0.005408,1.928006,5.748648,4.45478,3.51298,2.0419


In [13]:
# Get the distance to their neirest neighbors in D : Y

tree = BallTree(dataset, leaf_size=2)
dist, _ = tree.query(uniform_obsv_df, k=1)
uniform_knn_dist = dist

In [14]:
# Calculate the Hopkins Score

x = sum(sample_knn_dist)
y = sum(uniform_knn_dist)

if x + y == 0:
    raise Exception('The denominator of the hopkins statistics is null')

h_stat = (y[0] / (x + y)[0])
print ("The hopkins statistics measure is {}".format(h_stat))

The hopkins statistics measure is 0.9529207904947263
