# HDBSCAN (Unsupervised Learning)

In [1]:
"""
Created on Mon Sep 24 21:52:30 2018

@author: whitneyreiner
"""
#Import packages
import numpy as np
import pandas as pd, matplotlib.pyplot as plt, time
import seaborn as sns
import hdbscan
%matplotlib inline

In [None]:
# Data-cleaning recipes: replace the placeholder column names with your own.

# Convert a column to a numeric dtype
df['colname'] = pd.to_numeric(df['colname'])

# Convert a column that is already in an appropriate format to datetime
df['colname'] = pd.to_datetime(df['colname'])

# List-wise deletion of nulls (remove each row where the column value is null).
# The negated boolean mask has to sit inside the indexing brackets: df[~mask].
df = df[~df['colname_where_the_null_is'].isnull()]

# Check how many null values there are in two different columns of the df
df[['colname', 'colname']].isnull().sum()

In [3]:
# Consider running sklearn's train/test split before this step.

# Pick the columns whose data will make up the sample.
trainset = df.loc[:, ['col1', 'col2', 'col_n']]

# Initial cluster fitting
### https://hdbscan.readthedocs.io/en/latest/
## Visit the excellent documentation for HDBSCAN to see how to choose the *ONE* parameter you MUST set (min_cluster_size).
## There are many additional parameters you can specify if you think it is appropriate, including the distance metric.

In [None]:
# Initial cluster fitting on the sample selected in `trainset`.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)  # see below for an example of specifying a few additional params
# Fit on the raw sample: converting to radians is only needed when the
# haversine metric is used (see the commented example below).
clusterer.fit(trainset)
clusterer.labels_  # display the cluster labels

######### example of specifying a few additional parameters
######### (haversine needs latitude/longitude converted to radians):
##      clusterer = hdbscan.HDBSCAN(metric='haversine', min_cluster_size=5, min_samples=6)
##      clusterer.fit(np.radians(trainset))

In [7]:
# Wrap the label array in a one-column dataframe so it is easier to inspect.
clusterer_labels = pd.Series(clusterer.labels_, name='Cluster_no').to_frame()

In [None]:
# How many clusters are there?
# Cluster labels start at 0 (noise is labelled -1), so the number of clusters
# is the largest label plus one; this also gives 0 when every point is noise.
clusterer.labels_.max() + 1

In [None]:
# Inspect the cluster sizes (first 1000 entries); anything assigned to
# cluster no. -1 is noise.
clusterer_labels['Cluster_no'].value_counts().head(1000)

In [None]:
# Look at the probabilities reported by the fitted clusterer.
# NOTE(review): probabilities_ appears to hold one membership strength per
# data point (it is zipped with labels_ elsewhere in this notebook) rather
# than one value per cluster -- confirm against the HDBSCAN docs.
clusterer_probabilities = clusterer.probabilities_
clusterer_probabilities

In [None]:
# Check how often each probability value occurs.
# clusterer.probabilities_ is an array and cannot be indexed by a column
# name, so wrap it in a named Series before counting the values.
pd.Series(clusterer.probabilities_, name='Cluster_p').value_counts()

In [None]:
# Turn the probabilities array into a one-column dataframe.
clusterer_probabilities = pd.DataFrame({'Cluster_p': clusterer.probabilities_})

# Add the labels and probabilities to the original dataframe.
# pd.concat(axis=1) aligns rows on index labels, and df's index has gaps once
# null rows have been dropped, so both result frames are re-indexed to
# df.index first. set_index raises a length-mismatch error if the clusterer
# was fitted on a different number of rows than df currently holds.
dfclust = pd.concat(
    [
        df,
        clusterer_labels.set_index(df.index),
        clusterer_probabilities.set_index(df.index),
    ],
    axis=1,
)
dfclust.describe()

In [None]:
# If you have geographic coordinate data and want to use the haversine
# function (metric='haversine'), the coordinates must be converted to radians.
coordinates_array = df.loc[:, ['latitude', 'longitude']].values
radians_array = np.deg2rad(coordinates_array)

In [None]:
# First check out your data points (plot the original, un-clustered data).
# `coordinates_array` is the coordinate array built from df in this notebook.
fig, ax = plt.subplots(figsize=[20, 20])
plt.scatter(*coordinates_array.T, s=1, linewidth=0, c='b', alpha=0.25)

In [None]:
# Clustering and plotting the clusters.
# This cell uses the haversine metric, so the fit (and the plot below) work
# on coordinates converted to radians.
clusterer = hdbscan.HDBSCAN(metric='haversine', min_cluster_size=5)  # replace 5 with your own int
clusterer.fit(np.radians(coordinates_array))
#or clusterer.fit(radians_array)

# If you are NOT using the haversine function, fit on the raw coordinates
# instead (and plot coordinates_array below rather than radians_array):
##      clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
##      clusterer.fit(coordinates_array)

# Plot the clusters with each cluster in a different color.
# The palette is sized from the number of clusters found (largest label + 1)
# so that indexing it by label cannot run past its end; at least one color is
# requested so the call stays valid when every point is noise.
n_clusters = int(clusterer.labels_.max()) + 1
color_palette = sns.color_palette('deep', max(n_clusters, 1))
# Noise points (negative labels) are drawn in grey.
cluster_colors = [color_palette[x] if x >= 0
                  else (0.5, 0.5, 0.5)
                  for x in clusterer.labels_]
# Desaturate each point's color by its membership probability.
cluster_member_colors = [sns.desaturate(x, p) for x, p in
                         zip(cluster_colors, clusterer.probabilities_)]
fig, ax = plt.subplots(figsize=[20, 20])
plt.scatter(*radians_array.T, s=25, linewidth=0, c=cluster_member_colors, alpha=0.75)


In [40]:
# Shared seaborn styling and scatter keyword arguments for the plots below.
sns.set_style('white')
sns.set_color_codes()
plot_kwds = dict(alpha=0.5, s=25, linewidths=0)

In [None]:
# Color every point by its cluster and plot the coordinates.
# The palette holds one hue per cluster found (largest label + 1): a fixed,
# very large hls palette gives neighbouring labels almost the same hue and
# still overflows if there are more clusters than palette entries.
n_clusters = int(clusterer.labels_.max()) + 1
palette = sns.color_palette("hls", max(n_clusters, 1))
# Noise points (negative labels) are grey; clustered points are desaturated
# by their membership probability.
cluster_colors = [sns.desaturate(palette[col], sat)
                  if col >= 0 else (0.5, 0.5, 0.5) for col, sat in
                  zip(clusterer.labels_, clusterer.probabilities_)]
fig, ax = plt.subplots(figsize=[20, 20])
plt.scatter(coordinates_array.T[0], coordinates_array.T[1], c=cluster_colors, **plot_kwds)

In [4]:
# Optional: draw the condensed tree to inspect the clusters another way.
fig, ax = plt.subplots(figsize=(10, 10))
clusterer.condensed_tree_.plot(
    select_clusters=True,
    selection_palette=sns.color_palette(),
)


# Fitting new data to clustered data

In [56]:
# Array form of the new data.
new_data_array = new_data.values
# Radian versions, for use with the haversine workflow.
new_radians = np.deg2rad(new_data)
new_radians_array = new_radians.values

In [263]:
#'training' (already clustered) data = radians_array

In [None]:
# Fit with prediction_data=True so the clusterer can be used to label new
# points later.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5,prediction_data=True)
clusterer.fit(coordinates_array)

# Size the palette from the number of clusters found (largest label + 1) so
# that indexing it by label cannot run past its end; at least one color is
# requested so negative (noise) labels can still index it.
pal = sns.color_palette('deep', max(int(clusterer.labels_.max()) + 1, 1))
colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_, clusterer.probabilities_)]
fig, ax = plt.subplots(figsize=[20, 20])
plt.scatter(coordinates_array.T[0], coordinates_array.T[1], c=colors, **plot_kwds);

# if you are plotting radians because you are using Haversine function: 
# MAKE SURE YOU ARE PLOTTING BOTH AS RADS OR BOTH AS COORDINATES!!!!

In [None]:
# "test" points     = new_data_array
# "training" points = coordinates_array
fig, ax = plt.subplots(figsize=(20, 20))
colors = [
    sns.desaturate(pal[label], strength)
    for label, strength in zip(clusterer.labels_, clusterer.probabilities_)
]

# Draw the originally clustered "training" set first ...
plt.scatter(coordinates_array[:, 0], coordinates_array[:, 1], c=colors, **plot_kwds)
# ... then overlay the new "test" data in black.
plt.scatter(*new_data_array.T, c='k', s=80)

In [None]:
# Get the cluster labels (and membership strengths) for the "test" data.
# The points to label are the new-data array prepared earlier in the notebook;
# they must be in the same units as the data the clusterer was fitted on.
test_labels, strengths = hdbscan.approximate_predict(clusterer, new_data_array)
test_labels

In [None]:
# Put the predicted labels into a one-column dataframe and preview it.
test_labels_df = pd.Series(test_labels, name='test_clust_no').to_frame()
test_labels_df.head()

In [None]:
# Append the cluster labels to the "test" set dataframe.
# `new_data` is the dataframe the test points came from and `test_labels_df`
# holds their predicted labels. pd.concat(axis=1) aligns rows on index
# labels, so the labels are re-indexed to new_data's index first.
new_data_df = pd.concat([new_data, test_labels_df.set_index(new_data.index)], axis=1)
new_data_df.head()

In [None]:
# Draw the "test" points in the color of the cluster they were assigned to.

fig, ax = plt.subplots(figsize=(20, 20))

# Training points: cluster color, desaturated by membership probability.
colors = [
    sns.desaturate(pal[label], strength)
    for label, strength in zip(clusterer.labels_, clusterer.probabilities_)
]
# Test points: cluster color, or near-black when the label is negative.
test_colors = [
    (0.1, 0.1, 0.1) if label < 0 else pal[label]
    for label in test_labels
]
plt.scatter(coordinates_array[:, 0], coordinates_array[:, 1], c=colors, **plot_kwds)
plt.scatter(*new_data_array.T, c=test_colors, s=100, linewidths=1, edgecolors='k')

In [63]:
# Save everything as NumPy arrays so an already clustered data set can be
# loaded again later and used with the prediction_data=True method on new data.

# training data coordinates
np.save(file='train_datacoor', arr=coordinates_array)
# training data in radians
np.save(file='train_dataRadians', arr=radians_array)
# cluster labels
np.save(file='train_clustLabs', arr=clusterer.labels_)
# cluster probabilities
np.save(file='train_clustProbs', arr=clusterer.probabilities_)