In [1]:
import re
from collections import defaultdict

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import folium
import hdbscan
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from tqdm import tqdm


# Colour palette for cluster markers: 20 distinct hex colours, repeated ten
# times so that labels 0..199 can index straight into the list.
cols = 10 * [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8',
    '#f58231', '#911eb4', '#46f0f0', '#f032e6',
    '#bcf60c', '#fabebe', '#008080', '#e6beff',
    '#9a6324', '#fffac8', '#800000', '#aaffc3',
    '#808000', '#ffd8b1', '#000075', '#808080',
]

In [2]:
# Load the hotel table. The cells below read its `latitude` and `longitude`
# columns and append one cluster-label column per clustering run.
df = pd.read_csv('hotels.csv')

## Visualizing Geographical Data

In [3]:
# Base map centred on the mean coordinate of all rows.
# NOTE(review): recent folium releases appear to no longer bundle the
# 'Stamen Toner' tiles -- confirm against the installed folium version.
m = folium.Map(
    location=[df['latitude'].mean(), df['longitude'].mean()],
    zoom_start=9,  # folium's keyword is `zoom_start`; it was misspelled `zoom_starts`
    tiles='Stamen Toner',
)

# One circle marker per row; zipping the two columns avoids building a
# Series for every row the way iterrows() does.
for lat, lon in zip(df['latitude'], df['longitude']):
    folium.CircleMarker(location=[lat, lon], radius=5).add_to(m)


In [4]:
# Bare last expression so the notebook renders the map inline.
m

In [5]:
# Feature matrix for clustering: one (latitude, longitude) pair per row,
# copied out of the frame as float64.
X = df[['latitude', 'longitude']].to_numpy(dtype='float64', copy=True)

# Fit KMeans with a fixed seed, then label every point with its cluster.
k = 6
model = KMeans(n_clusters=k, random_state=17)
model.fit(X)
class_predictions = model.predict(X)

# Store the labels next to the coordinates, in a column named after k.
df[f'CLUSTER_kmeans{k}'] = class_predictions

In [6]:
def create_map(df, cluster_column, zoom_start=9, tiles='Stamen Toner'):
    """Draw every row of ``df`` on a folium map, coloured by cluster label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns and the column
        named by ``cluster_column``.
    cluster_column : str
        Column holding integer cluster labels. The label ``-1`` is drawn in
        black; every other label picks its colour from the module-level
        ``cols`` palette.
    zoom_start : int, optional
        Initial zoom level handed to ``folium.Map``.
    tiles : str, optional
        Tile set handed to ``folium.Map``.
        NOTE(review): recent folium releases appear to no longer bundle
        'Stamen Toner' -- confirm against the installed folium version and
        pass a different tile set if needed.

    Returns
    -------
    folium.Map
        Map centred on the mean coordinate, with one circle marker per row.
    """
    m = folium.Map(
        location=[df['latitude'].mean(), df['longitude'].mean()],
        zoom_start=zoom_start,  # was misspelled `zoom_starts`
        tiles=tiles,
    )

    # Zipping the columns keeps each column's own dtype; iterrows() would
    # upcast an all-numeric row to float and turn label 3 into 3.0.
    for lat, lon, label in zip(df['latitude'], df['longitude'], df[cluster_column]):
        label = int(label)
        if label == -1:
            cluster_colour = '#000000'
        else:
            # Wrap around the palette instead of raising IndexError when
            # there are more clusters than colours.
            cluster_colour = cols[label % len(cols)]

        folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            popup=str(label),
            color=cluster_colour,
            fill=True,
            fill_color=cluster_colour,
        ).add_to(m)

    return m

# Map the KMeans labels. The column name is derived from `k` so this cell
# keeps working when k is changed in the clustering cell (it was previously
# hard-coded to 'CLUSTER_kmeans6').
m = create_map(df, f'CLUSTER_kmeans{k}')
print(f'K={k}')
print(f'Silhouette Score: {silhouette_score(X, class_predictions)}')

K=6
Silhouette Score: 0.4056594886985069


In [7]:
# Write the current map `m` (built from the KMeans labels in the previous
# cell) to an HTML file.
m.save('kmeans_clusters.html')

In [8]:
# Second clustering run on the same coordinates, this time with HDBSCAN.
# Points labelled -1 are treated as outliers by the cells below.
# (`hdbscan` is imported in the notebook's import cell.)
model = hdbscan.HDBSCAN(
    min_cluster_size=5,              # smallest group of points kept as a cluster
    min_samples=2,                   # neighbourhood size used to decide core points
    cluster_selection_epsilon=0.01,  # clusters closer than this distance are merged
)

class_predictions = model.fit_predict(X)
df['CLUSTER_HDBSCAN'] = class_predictions

In [9]:
m = create_map(df, 'CLUSTER_HDBSCAN')

# Label -1 marks outliers; every other label is a real cluster.
is_outlier = class_predictions == -1
cluster_labels = np.unique(class_predictions[~is_outlier])

# Count clusters from the non-outlier labels directly. Subtracting 1 from the
# number of unique labels under-counts whenever no point is labelled -1.
print(f'Number of clusters found: {len(cluster_labels)}')
print(f'Number of outliers found: {np.count_nonzero(is_outlier)}')

print(f'Silhouette ignoring outliers: {silhouette_score(X[~is_outlier], class_predictions[~is_outlier])}')

# Give each outlier its own unique negative label, -(position + 2), so it is
# scored as a one-point cluster instead of being lumped into one "-1" cluster.
no_outliers = np.where(is_outlier, -(np.arange(len(class_predictions)) + 2), class_predictions)
print(f'Silhouette outliers as singletons: {silhouette_score(X, no_outliers)}')

Number of clusters found: 6
Number of outliers found: 13
Silhouette ignoring outliers: 0.48964577222551026
Silhouette outliers as singletons: 0.158696128381134


In [10]:
# Write the current map `m` (built from the HDBSCAN labels in the previous
# cell) to an HTML file.
m.save('HDBScan_clusters.html')