In [1]:
from sklearn.neighbors import BallTree
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale



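The Hopkins statistic measures the clustering tendency of a data set, i.e. how far its points depart from a spatially uniform distribution. It belongs to the family of sparse sampling tests. As computed in this notebook, $H = \sum Y / (\sum X + \sum Y)$, where $X$ are the nearest-neighbour distances of points sampled from the data and $Y$ are the nearest-neighbour distances of uniformly simulated points; values near 0.5 indicate no cluster structure, while values near 1 indicate strongly clustered data.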

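Dataset originally used: [https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work](https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work). Note that the loading cell below currently reads a local pickle (`./state_clubbed_df.pickle`) instead; the CSV load for the Absenteeism dataset is commented out.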

In [2]:
# Alternative input, kept for reference: the semicolon-separated
# Absenteeism-at-work CSV.
# dataset = pd.read_csv('./Absenteeism_at_work.csv', sep=';')

# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
dataset = pd.read_pickle('./state_clubbed_df.pickle')

# Number of points drawn from the data (P) and, later, simulated uniformly (Q).
sampling_size = 350

In [3]:
# Report the dimensions of the loaded frame.
rows, columns = dataset.shape
print("Total rows : %s\t Total columns: %s" % (rows, columns))

Total rows : 527	 Total columns: 13


In [4]:
# Preview the first rows of the loaded frame.
dataset.head()

Unnamed: 0,state,year,murder,rape,foeticide,kidnapping and abduction,abetment of suicide,exposure and abandonment,procuration of minor girls,selling girls for prostitution,prohibition of child marriage act,other crimes,total
0,andhra pradesh,2001,35.0,84.0,0.0,57.0,7.0,22.0,12.0,0.0,6.0,47.0,270.0
1,arunachal pradesh,2001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,assam,2001,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
3,bihar,2001,2.0,16.0,0.0,26.0,0.0,1.0,16.0,1.0,2.0,18.0,83.0
4,chhattisgarh,2001,14.0,150.0,5.0,46.0,1.0,15.0,0.0,0.0,0.0,354.0,585.0


In [5]:
# Preprocessing: discard the 'state', 'year' and 'total' columns, then pass
# the remaining columns through sklearn's `scale`.
#
# The result is wrapped in a fresh DataFrame, so from here on `dataset` has a
# default integer index and integer column labels.

dataset = pd.DataFrame(
    scale(dataset.drop(['state', 'year', 'total'], axis=1))
)

In [6]:
# Sample n observations from D : P

# Sampling is done without replacement, so n cannot exceed the row count of D.
if sampling_size > dataset.shape[0]:
    raise ValueError(
        'The number of samples is bigger than the number of rows of D')

# A fixed random_state makes the drawn subset identical on every run, so the
# notebook gives the same result after "Restart & Run All".
sample_dataset = dataset.sample(n=sampling_size, random_state=42)
sample_dataset.shape

(350, 10)

In [7]:
# Preview the sampled rows; the index shows which rows of D were drawn.
sample_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
127,-0.49359,-0.553041,-0.288881,-0.389615,-0.281905,-0.435728,-0.216594,-0.183679,-0.442216,-0.382702
82,-0.137998,-0.340309,-0.422895,-0.367321,-0.281905,-0.417052,-0.139839,-0.183679,-0.442216,-0.286832
279,-0.505852,-0.543656,-0.422895,-0.376876,-0.113082,-0.417052,-0.216594,-0.183679,-0.442216,-0.37645
136,-0.518114,-0.553041,-0.422895,-0.391738,-0.281905,-0.417052,-0.216594,-0.183679,-0.442216,-0.38687
351,-0.518114,-0.493601,-0.422895,-0.375814,-0.281905,-0.435728,-0.216594,-0.183679,-0.442216,-0.38687


In [8]:
# Distance from each sampled point to its nearest neighbour in D : X
#
# Every sampled row is itself a member of D, so two neighbours are requested
# and only the second distance column is kept; the closest hit is expected to
# be the point itself.

tree = BallTree(dataset, leaf_size=2)
dist = tree.query(sample_dataset, k=2)[0]
sample_knn_dist = dist[:, 1]

In [9]:
# Display the nearest-neighbour distances of the sampled points (X).
sample_knn_dist

array([1.43418414e-02, 1.60404301e-01, 2.14411993e-02, 3.78086878e-03,
       8.46560314e-03, 1.05081060e+00, 5.79036287e+00, 2.81284194e+00,
       7.15177607e-01, 3.32705481e-02, 1.18437479e+00, 2.14411993e-02,
       6.85397542e-01, 2.66010591e-01, 1.28314870e-02, 1.38563474e-02,
       0.00000000e+00, 7.09143749e-02, 5.48479239e-01, 6.75342808e-01,
       2.45172932e-02, 1.05994124e+00, 8.28493101e-01, 1.01815988e-01,
       7.04209396e-02, 3.41071097e-01, 1.77865114e+00, 0.00000000e+00,
       3.38606227e-01, 1.03626282e-01, 0.00000000e+00, 1.76785313e-01,
       1.10564771e+00, 5.62756423e-03, 3.30363296e-03, 1.97506099e-02,
       1.39738131e-02, 1.44975519e+00, 0.00000000e+00, 1.19779621e+00,
       0.00000000e+00, 2.73844681e-02, 6.61834747e-01, 1.69608565e-02,
       2.83706676e-02, 2.93591612e-02, 0.00000000e+00, 2.53197552e-01,
       3.12841782e-03, 5.89102923e-02, 7.32356156e-03, 3.92000065e-01,
       2.00096472e-01, 1.05994124e+00, 0.00000000e+00, 3.79739988e-01,
      

In [10]:
# Randomly simulate n points uniformly inside the bounding box of D : Q.

# Per-feature upper and lower bounds of the data.
max_data = dataset.max()
min_data = dataset.min()

# Draw all coordinates in a single call: the per-feature bounds broadcast
# across the rows of the (sampling_size, n_features) result. Using `.values`
# makes the bounds positional, so this works for any column labels and for any
# number of features, including a single one.
# A dedicated, seeded generator keeps the simulated points reproducible.
rng = np.random.RandomState(42)
uniform_obsv = rng.uniform(
    low=min_data.values,
    high=max_data.values,
    size=(sampling_size, len(min_data)),
)

# Reuse the column labels of D so both frames line up feature by feature.
uniform_obsv_df = pd.DataFrame(uniform_obsv, columns=dataset.columns)

In [11]:
# Check the simulated frame's dimensions (rows, columns).
uniform_obsv_df.shape

(350, 10)

In [12]:
# Preview the first simulated points.
uniform_obsv_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.582426,4.603702,3.570874,-0.10813,13.164222,0.566526,12.152493,7.562518,5.104814,5.696784
1,1.978109,5.94315,9.390731,2.013481,2.231923,4.654225,0.307149,3.769595,5.839326,5.077317
2,4.891266,0.36942,9.735983,3.960046,5.121592,5.437629,6.228414,7.567991,-0.188147,2.321662
3,3.447057,1.242973,2.201363,5.605777,14.137308,1.169938,12.935832,0.950043,1.526524,6.791687
4,2.357543,1.148352,8.271596,6.478728,4.458407,1.341618,1.165756,0.188902,-0.022944,2.194953


In [13]:
# Distance from each simulated point to its nearest neighbour in D : Y
#
# Only one neighbour is requested here. The distances are kept exactly as
# `query` returns them, without selecting a column.

tree = BallTree(dataset, leaf_size=2)
uniform_knn_dist = tree.query(uniform_obsv_df, k=1)[0]
dist = uniform_knn_dist

In [14]:
# Calculate the Hopkins Score: H = sum(Y) / (sum(X) + sum(Y))

# np.sum reduces each distance array to a single number whether it is 1-D or
# has shape (n, 1); converting to float keeps the arithmetic below scalar.
x = float(np.sum(sample_knn_dist))
y = float(np.sum(uniform_knn_dist))

# Both sums are zero only if every distance is zero; the ratio is then
# undefined.
if x + y == 0:
    raise ValueError('The denominator of the hopkins statistics is null')

h_stat = y / (x + y)
print("The hopkins statistics measure is {}".format(h_stat))

The hopkins statistics measure is 0.9548781992912345
