## Geospatial clustering

In [None]:
#!{sys.executable} -m pip install branca jinja2 requests folium

import random
import math

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import haversine_distances

import hdbscan

import folium

### Load the data

Flickr is an online photo management and sharing application developed by SmugMug. 
- Photo metadata like tags, descriptions and geospatial data is also available.

The dataset contains 20K photo records gathered from Flickr. It is limited to a geographical bounding box that covers the city of London, and the photos were taken between 2014 and 2019.

Citation: https://www.kaggle.com/datasets/amiralisa/flickr_london?select=london_20k.csv

In [None]:
# Read the Flickr London photo records into a dataframe and preview the first rows
flickr_df = pd.read_csv(filepath_or_buffer='data/geospatial/london_20k.csv')
flickr_df.head()

In [None]:
# Centre the base map on the mean latitude/longitude of the dataset
map_center = [flickr_df['lat'].mean(), flickr_df['lon'].mean()]
m = folium.Map(location=map_center, zoom_start=12)
m

In [None]:
# Overlay one tiny red marker on the base map for every location
# where a photo was taken
for photo_lat, photo_lon in zip(flickr_df['lat'], flickr_df['lon']):
    marker = folium.CircleMarker([photo_lat, photo_lon], radius=0.1, color="red")
    marker.add_to(m)
m

In [None]:
# Points can also be shown as a scatterplot.
# Longitude goes on the x-axis (east-west) and latitude on the y-axis
# (north-south) so the plot has the same orientation as the map above.
plt.figure(figsize=(10, 10))
sns.scatterplot(data=flickr_df, x='lon', y='lat', s=2)

In [None]:
def plot_spatial_clusters(clusterer, data_df, lat_key='lat', lon_key='lon'):
    """Plot the points of ``data_df`` on a folium map, coloured by cluster.

    Parameters
    ----------
    clusterer : object
        Fitted clusterer exposing ``labels_`` with one label per row of
        ``data_df``, in row order. The label -1 is treated as noise and
        drawn in black.
    data_df : pandas.DataFrame
        Frame holding the coordinates of the points. Rows are read by
        position, so the frame does not need a default integer index.
    lat_key, lon_key : str
        Names of the latitude and longitude columns of ``data_df``.
        NOTE(review): values are passed straight to folium, which
        presumably expects degrees -- confirm for non-default columns.

    Returns
    -------
    folium.Map
        Map centred on the mean coordinates of ``data_df`` with one
        coloured circle marker per row.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows.
    """
    labels = np.asarray(clusterer.labels_)
    if labels.shape[0] != data_df.shape[0]:
        raise ValueError(
            f"clusterer has {labels.shape[0]} labels but data_df has "
            f"{data_df.shape[0]} rows"
        )

    # Real clusters only; noise (-1) gets a fixed colour below
    cluster_ids = [int(label) for label in np.unique(labels) if label != -1]

    # One colour per cluster, sampled evenly from the colormap
    cmap = matplotlib.colormaps['nipy_spectral']
    colors = list(cmap(np.linspace(0, 1, len(cluster_ids))))

    # Shuffle colours to avoid similar colours being close to each other
    # on the map
    random.shuffle(colors)

    # Convert to hex once per cluster instead of once per point, and key
    # by label so non-contiguous label values are handled too
    hex_by_label = {
        cluster_id: matplotlib.colors.to_hex(color)
        for cluster_id, color in zip(cluster_ids, colors)
    }
    # Use black for noise dots
    hex_by_label[-1] = matplotlib.colors.to_hex((0, 0, 0, 1))

    # Centre the map on the data that is actually being plotted
    m_clust = folium.Map(
        location=[data_df[lat_key].mean(), data_df[lon_key].mean()],
        zoom_start=12,
    )

    # Add coloured dots to the map; iterate positionally so the frame's
    # index labels do not matter
    for lat, lon, label in zip(data_df[lat_key], data_df[lon_key], labels):
        folium.CircleMarker(
            [lat, lon],
            radius=0.1,
            color=hex_by_label[int(label)],
        ).add_to(m_clust)

    # Return map with dots for plotting
    return m_clust

In [None]:
# Transform latitude and longitude from degrees to radians in one
# vectorised step; the haversine metric used for clustering below works
# on radians
flickr_df['lat_rad'] = np.radians(flickr_df['lat'])
flickr_df['lon_rad'] = np.radians(flickr_df['lon'])
# Feature matrix for the clusterers, column order is (latitude, longitude)
clust_data = flickr_df[['lat_rad', 'lon_rad']].to_numpy()


### DBSCAN for geospatial clustering

In [None]:
# One metre expressed in the units of the haversine metric: 1 / 6371000
# NOTE(review): 6371000 is presumably the Earth's mean radius in metres -- confirm
eps_1m = 1 / 6371000
# Neighbourhood radius of 100 such units
eps = 100 * eps_1m

clusterer = DBSCAN(eps=eps, min_samples=40, metric='haversine')
clusterer.fit(clust_data)

# NOTE(review): validity_index is called without a metric argument while the
# clustering itself uses haversine -- confirm the score is computed as intended
print(hdbscan.validity_index(clust_data, clusterer.labels_))

# Cluster sizes, largest first
unique_labels = np.unique(clusterer.labels_, return_counts=True)
cluster_sizes = pd.Series(unique_labels[1], index=unique_labels[0])
print(cluster_sizes.sort_values(ascending=False).reset_index())

In [None]:
# Draw the current clustering result on a map
plot_spatial_clusters(clusterer=clusterer, data_df=flickr_df)

In [None]:
# One metre expressed in the units of the haversine metric: 1 / 6371000
# NOTE(review): 6371000 is presumably the Earth's mean radius in metres -- confirm
eps_1m = 1 / 6371000
# Neighbourhood radius of 200 such units
eps = 200 * eps_1m

clusterer = DBSCAN(eps=eps, min_samples=40, metric='haversine')
clusterer.fit(clust_data)

# NOTE(review): validity_index is called without a metric argument while the
# clustering itself uses haversine -- confirm the score is computed as intended
print(hdbscan.validity_index(clust_data, clusterer.labels_))

# Cluster sizes, largest first
unique_labels = np.unique(clusterer.labels_, return_counts=True)
cluster_sizes = pd.Series(unique_labels[1], index=unique_labels[0])
print(cluster_sizes.sort_values(ascending=False).reset_index())

In [None]:
# Draw the current clustering result on a map
plot_spatial_clusters(clusterer=clusterer, data_df=flickr_df)

### HDBSCAN clustering

In [None]:
# HDBSCAN - EOM
# (no cluster_selection_method is passed, so the library default is used)
hdbscan_model = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=40, metric='haversine')
clusterer = hdbscan_model.fit(clust_data)

# NOTE(review): validity_index is called without a metric argument while the
# clustering itself uses haversine -- confirm the score is computed as intended
print(hdbscan.validity_index(clust_data, clusterer.labels_))

# Cluster sizes, largest first
unique_labels = np.unique(clusterer.labels_, return_counts=True)
cluster_sizes = pd.Series(unique_labels[1], index=unique_labels[0])
print(cluster_sizes.sort_values(ascending=False).reset_index())

In [None]:
# Draw the current clustering result on a map
plot_spatial_clusters(clusterer=clusterer, data_df=flickr_df)

### Tag analysis

In [None]:
# Attach the cluster label of every photo, then remove the noise points
# (label -1) and keep only the columns needed for the tag analysis
flickr_df['cluster'] = clusterer.labels_
kept_columns = ['owner', 'taken', 'tags', 'lat', 'lon', 'lat_rad', 'lon_rad', 'cluster']
is_clustered = flickr_df['cluster'] != -1
flickr_df_denoised = flickr_df[is_clustered].filter(items=kept_columns)

flickr_df_denoised.head(2)

In [None]:
# Let's pretend each photo comes from a separate owner!
# (shown here: the number of distinct owners in the data)
flickr_df['owner'].nunique(dropna=True)

In [None]:
# Reshape the dataframe so that every row carries a single tag
flickr_df_denoised = (
    flickr_df_denoised
    # Rows without tags have nothing to split; drop them
    .dropna(subset=['tags'])
    # Turn the comma-separated tag string into a list of tags
    .assign(tag_list=lambda frame: frame['tags'].str.split(','))
    # Emit one row per element of the list column
    .explode('tag_list')
)

flickr_df_denoised

In [None]:
# Replace the original 'tags' column with the exploded one, renamed to 'tag',
# and renumber the rows
flickr_df_denoised = flickr_df_denoised.drop(columns=['tags'])
flickr_df_denoised = flickr_df_denoised.rename(columns={'tag_list': 'tag'})
flickr_df_denoised = flickr_df_denoised.reset_index(drop=True)

# Trim leading/trailing whitespace from the tags
flickr_df_denoised['tag'] = flickr_df_denoised['tag'].str.strip()

# Count how often each tag occurs, most frequent first
tag_freq = flickr_df_denoised['tag'].value_counts(ascending=False)
tag_freq.head(50)

#### Find all places with graffiti

In [None]:
# Rows tagged 'graffiti', counted per cluster (largest count first)
has_tag = flickr_df_denoised['tag'] == 'graffiti'
tag_df = flickr_df_denoised[has_tag]
clust_tag_count = (
    tag_df
    .groupby('cluster')['cluster']
    .count()
    .sort_values(ascending=False)
)
# Keep the clusters in which the tag occurs more than 40 times
frequent_clusters = clust_tag_count[clust_tag_count > 40]
tag_clusts = list(frequent_clusters.index)
clust_tag_count

In [None]:
# Show every photo that belongs to one of the selected clusters
in_selected_clusters = flickr_df['cluster'].isin(tag_clusts)
target_cluster = flickr_df[in_selected_clusters]

# Centre the map on the selected photos
cluster_center = [target_cluster['lat'].mean(), target_cluster['lon'].mean()]
m = folium.Map(location=cluster_center, zoom_start=12)
for photo_lat, photo_lon in zip(target_cluster['lat'], target_cluster['lon']):
    marker = folium.CircleMarker([photo_lat, photo_lon], radius=0.1, color="red")
    marker.add_to(m)

# Number of photos shown
print(target_cluster.shape[0])
m

In [None]:
# Rows tagged 'museum', counted per cluster (largest count first)
has_tag = flickr_df_denoised['tag'] == 'museum'
tag_df = flickr_df_denoised[has_tag]
clust_tag_count = (
    tag_df
    .groupby('cluster')['cluster']
    .count()
    .sort_values(ascending=False)
)
# Keep the clusters in which the tag occurs more than 40 times
frequent_clusters = clust_tag_count[clust_tag_count > 40]
tag_clusts = list(frequent_clusters.index)
clust_tag_count

In [None]:
# Show every photo that belongs to one of the selected clusters
in_selected_clusters = flickr_df['cluster'].isin(tag_clusts)
target_cluster = flickr_df[in_selected_clusters]

# Centre the map on the selected photos
cluster_center = [target_cluster['lat'].mean(), target_cluster['lon'].mean()]
m = folium.Map(location=cluster_center, zoom_start=12)
for photo_lat, photo_lon in zip(target_cluster['lat'], target_cluster['lon']):
    marker = folium.CircleMarker([photo_lat, photo_lon], radius=0.1, color="red")
    marker.add_to(m)

# Number of photos shown
print(target_cluster.shape[0])
m