In [1]:
%pip install hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Collecting cython>=0.27
  Using cached Cython-0.29.34-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (2.0 MB)
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
[?25h  Created wheel for hdbscan: filename=hdbscan-0.8.29-cp38-cp38-linux_x86_64.whl size=3913908 sha256=aa596a10f86dfe329b762a3e18c082a96a82d55864bd716b6a533fe59fb056d2
  Stored in directory: /home/azureuser/.cache/pip/wheels/e0/05/13

In [18]:
import numpy as np
import pandas as pd
from hdbscan import HDBSCAN

# Load the object stored in clean_data.pkl (used below as a DataFrame).
# NOTE(review): unpickling can execute arbitrary code -- only read pickle
# files that come from a trusted source.
df = pd.read_pickle("clean_data.pkl")
# Last expression of the cell: shows the available column names.
df.columns

Index(['ACCT_NBR', 'SUCCESSOR', 'MS_METER_NBR', 'BS_RATE', 'time_series',
       'label', 'XRHSH', 'VOLTAGE', 'PARNO', 'CONTRACT_CAPACITY',
       'ACCT_CONTROL', 'ACCT_WGS84_X', 'ACCT_WGS84_Y', 'SUPPLIER',
       'SUPPLIER_TO', 'REQUEST_TYPE', 'COMPL_REQUEST_STATUS'],
      dtype='object')

In [19]:
# The two coordinate columns hold text that uses a comma as the decimal
# separator: swap the comma for a point, then cast both columns to float32.
df_geo = (
    df[['ACCT_WGS84_X', 'ACCT_WGS84_Y']]
    .apply(lambda column: column.str.replace(',', '.'))
    .astype({"ACCT_WGS84_X": "float32", "ACCT_WGS84_Y": "float32"})
)

# Confirm the resulting column dtypes
print(df_geo.dtypes)


ACCT_WGS84_X    float32
ACCT_WGS84_Y    float32
dtype: object


In [20]:
import gc
# Drop the reference to the full frame so its memory can be reclaimed.
# NOTE(review): after this runs, the cell that builds df_geo from df cannot
# be re-executed without reloading df first.
del df
# Run a collection pass right away; the value shown as the cell output is
# whatever gc.collect() returns.
gc.collect()

10362

In [25]:
import numpy as np
from hdbscan import HDBSCAN

class GeoClustering:
    """Cluster geographic points chunk by chunk with HDBSCAN (haversine metric).

    The frame handed to the constructor is kept by reference and is modified
    in place by :meth:`cluster`, which writes a ``cluster_labels`` column.

    Attributes:
        data: DataFrame whose columns are coordinates in degrees (plus the
            ``cluster_labels`` column once :meth:`cluster` has run).
        hdbscan: the HDBSCAN model fitted on the most recent chunk, or
            ``None`` before :meth:`cluster` / :meth:`load` is called. Models
            of earlier chunks are not kept.
        label_offset_: value that was added to the non-noise labels of the
            model in ``hdbscan`` when they were written to ``cluster_labels``.
            It is not stored by :meth:`save`.
    """

    # Name of the column that `cluster` writes its labels into.
    LABEL_COLUMN = 'cluster_labels'

    def __init__(self, data):
        """Store the coordinate frame; no model exists until `cluster` runs."""
        self.data = data
        self.hdbscan = None
        self.label_offset_ = 0

    def deg2rad(self, degrees):
        """Return ``degrees`` converted to radians (element-wise)."""
        return np.radians(degrees)

    def cluster(self, min_samples=2, chunk_size=50000):
        """Fit one HDBSCAN model per chunk and label every row of ``self.data``.

        The rows are split into ``ceil(n_rows / chunk_size)`` nearly equal
        chunks and each chunk is clustered independently. Because every model
        numbers its clusters from 0, the labels of each chunk are shifted so
        that ids stay unique across the whole frame; noise stays ``-1``.

        Args:
            min_samples: forwarded to ``HDBSCAN(min_samples=...)``.
            chunk_size: upper bound on the number of rows per chunk.

        Returns:
            ``self.data`` (the same object), with the ``cluster_labels``
            column added or overwritten.

        Raises:
            ValueError: if ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        # Leave out labels written by a previous call so that re-running does
        # not feed them to HDBSCAN as an extra coordinate.
        coords = self.data.drop(columns=[self.LABEL_COLUMN], errors='ignore')

        # NOTE(review): the haversine metric is documented as expecting
        # (latitude, longitude) in radians -- confirm that the column order of
        # `data` matches.
        rad_coords = self.deg2rad(coords)

        n_rows = rad_coords.shape[0]
        num_chunks = int(np.ceil(n_rows / chunk_size))

        # Split row *positions* rather than the frame itself: the chunk
        # boundaries are the same, and positional writes stay correct even
        # when the index contains duplicates.
        chunk_positions = (
            np.array_split(np.arange(n_rows), num_chunks) if num_chunks else []
        )

        labels = np.full(n_rows, -1, dtype=np.int64)
        next_label = 0

        for idx, positions in enumerate(chunk_positions):
            print(f"Processing chunk {idx + 1} of {num_chunks}")

            # prediction_data=True keeps what approximate_predict needs later.
            self.hdbscan = HDBSCAN(min_samples=min_samples, metric='haversine', core_dist_n_jobs=-1, prediction_data=True)
            clusters = self.hdbscan.fit(rad_coords.iloc[positions])

            chunk_labels = np.asarray(clusters.labels_)
            clustered = chunk_labels >= 0

            # Shift this chunk's ids past those already used; keep noise at -1.
            self.label_offset_ = next_label
            labels[positions] = np.where(clustered, chunk_labels + next_label, -1)
            if clustered.any():
                next_label += int(chunk_labels.max()) + 1

        self.data[self.LABEL_COLUMN] = labels
        return self.data

    def predict(self, new_points):
        """Assign new points (in degrees) to clusters of the current model.

        The labels are in the numbering of the model held in ``self.hdbscan``
        (the last chunk fitted by `cluster`, or the model read by `load`).
        After `cluster`, add ``self.label_offset_`` to the non-negative labels
        to express them in the numbering used by ``cluster_labels``.

        Returns:
            ``(labels, strengths)`` as returned by ``approximate_predict``.

        Raises:
            ValueError: if no model has been fitted or loaded yet.
        """
        if self.hdbscan is None:
            raise ValueError("You must call .cluster() before calling .predict()")

        # approximate_predict is a module-level function of hdbscan, not an
        # attribute of the HDBSCAN class.
        from hdbscan import approximate_predict

        # Convert new_points to radians, as was done for the training data
        rad_new_points = self.deg2rad(new_points)

        labels, strengths = approximate_predict(self.hdbscan, rad_new_points)

        return labels, strengths

    def save(self, file_path):
        """Pickle the current model (only the model) to ``file_path``.

        Raises:
            ValueError: if no model has been fitted or loaded yet.
        """
        if self.hdbscan is None:
            raise ValueError("You must call .cluster() before calling .save()")

        # Imported here so the method does not rely on another cell having
        # imported pickle first.
        import pickle

        with open(file_path, 'wb') as f:
            pickle.dump(self.hdbscan, f)

    def load(self, file_path):
        """Replace the current model with the one pickled at ``file_path``.

        WARNING: unpickling can execute arbitrary code; only load files that
        come from a trusted source.
        """
        import pickle

        with open(file_path, 'rb') as f:
            self.hdbscan = pickle.load(f)

In [35]:
# Replace missing coordinates with 0 so that no NaN values are left in the frame.
# NOTE(review): every row filled this way ends up at (0, 0) -- confirm that
# this is the intended treatment of missing locations.
df_geo = df_geo.fillna(value=0)

In [36]:
import pickle

# Wrap the coordinate frame and cluster it in chunks of at most 5000 rows.
# `cluster` adds a 'cluster_labels' column to df_geo in place and returns
# that same frame.
geo_clustering = GeoClustering(data=df_geo)
geo_data = geo_clustering.cluster(chunk_size=5000)

# Persist the model held by the wrapper (the one fitted on the last chunk).
geo_clustering.save(file_path="hdbscan_model.pkl")

# Persist the labelled frame in its own pickle file.
with open('clustered_data.pkl', mode='wb') as file:
    pickle.dump(geo_data, file)



Processing chunk 1 of 307
Processing chunk 2 of 307
Processing chunk 3 of 307
Processing chunk 4 of 307
Processing chunk 5 of 307
Processing chunk 6 of 307
Processing chunk 7 of 307
Processing chunk 8 of 307
Processing chunk 9 of 307
Processing chunk 10 of 307
Processing chunk 11 of 307
Processing chunk 12 of 307
Processing chunk 13 of 307
Processing chunk 14 of 307
Processing chunk 15 of 307
Processing chunk 16 of 307
Processing chunk 17 of 307
Processing chunk 18 of 307
Processing chunk 19 of 307
Processing chunk 20 of 307
Processing chunk 21 of 307
Processing chunk 22 of 307
Processing chunk 23 of 307
Processing chunk 24 of 307
Processing chunk 25 of 307
Processing chunk 26 of 307
Processing chunk 27 of 307
Processing chunk 28 of 307
Processing chunk 29 of 307
Processing chunk 30 of 307
Processing chunk 31 of 307
Processing chunk 32 of 307
Processing chunk 33 of 307
Processing chunk 34 of 307
Processing chunk 35 of 307
Processing chunk 36 of 307
Processing chunk 37 of 307
Processing