In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import hdbscan
import seaborn as sns
import plotly
import plotly.express as px

In [2]:
loc_df = pd.read_csv("FL_insurance_sample_heatmap.csv")

In [3]:
loc_df.head()

Unnamed: 0,policyID,statecode,county,eq_site_limit,hu_site_limit,fl_site_limit,fr_site_limit,tiv_2011,tiv_2012,eq_site_deductible,hu_site_deductible,fl_site_deductible,fr_site_deductible,point_latitude,point_longitude,line,construction,point_granularity
0,119736,FL,CLAY COUNTY,498960.0,498960.0,498960.0,498960.0,498960.0,792148.9,0.0,9979.2,0.0,0,30.102261,-81.711777,Residential,Masonry,1
1,448094,FL,CLAY COUNTY,1322376.3,1322376.3,1322376.3,1322376.3,1322376.3,1438163.57,0.0,0.0,0.0,0,30.063936,-81.707664,Residential,Masonry,3
2,206893,FL,CLAY COUNTY,190724.4,190724.4,190724.4,190724.4,190724.4,192476.78,0.0,0.0,0.0,0,30.089579,-81.700455,Residential,Wood,1
3,333743,FL,CLAY COUNTY,0.0,79520.76,0.0,0.0,79520.76,86854.48,0.0,0.0,0.0,0,30.063236,-81.707703,Residential,Wood,3
4,172534,FL,CLAY COUNTY,0.0,254281.5,0.0,254281.5,254281.5,246144.49,0.0,0.0,0.0,0,30.060614,-81.702675,Residential,Wood,1


In [4]:
loc_df = loc_df[['point_latitude','point_longitude']]

In [5]:
loc_df

Unnamed: 0,point_latitude,point_longitude
0,30.102261,-81.711777
1,30.063936,-81.707664
2,30.089579,-81.700455
3,30.063236,-81.707703
4,30.060614,-81.702675
...,...,...
36629,28.122885,-82.770218
36630,28.080900,-82.758800
36631,28.110550,-82.766360
36632,28.089415,-82.697411


In [6]:
def get_points_inside_box(df, box):
    return df.loc[(df['point_longitude']>box[0][0]) & (df['point_longitude']<box[1][0]) & (df['point_latitude']>box[0][1]) & (df['point_latitude']<box[1][1])]

In [7]:
px.set_mapbox_access_token('pk.eyJ1Ijoic2hsb2trayIsImEiOiJjazhlMjhkemoxMjNkM2ZwZXN0Y2djaWgwIn0.ZVV36p2a-J8Z8yqbsO0r6g')

In [12]:
def get_cluster_map(locations, bounding_box):
    loc_filtered = get_points_inside_box(locations, bounding_box)
    print(loc_filtered.shape)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='haversine')
    cluster_labels = clusterer.fit_predict(np.radians(loc_filtered[['point_latitude','point_longitude']]))
    loc_filtered['cluster'] = cluster_labels
    cluster_count = pd.DataFrame(loc_filtered.groupby('cluster')['point_latitude'].agg('count')).reset_index().rename(columns={'point_latitude': 'Count'})
    cluster_lat_mean = pd.DataFrame(loc_filtered.groupby('cluster')['point_latitude'].agg('mean')).reset_index().rename(columns={'point_latitude': 'Latitude'})
    cluster_lon_mean = pd.DataFrame(loc_filtered.groupby('cluster')['point_longitude'].agg('mean')).reset_index().rename(columns={'point_longitude': 'Longitude'})
    cluster = cluster_count.merge(cluster_lat_mean, on='cluster').merge(cluster_lon_mean, on='cluster')
    
    print(cluster.head())
    
    fig = px.scatter_mapbox(cluster.loc[cluster['cluster']>-1], lat="Latitude", lon="Longitude", size='Count', color='Count',
                           color_continuous_scale=px.colors.sequential.Aggrnyl, zoom=10)
    
    return fig
    

In [13]:
florida_box = [(-88,24), (-76,30)]
miami_box = [(-81.22,25.5), (-79.8,26.2)]

In [14]:
out_map = get_cluster_map(loc_df, florida_box)
out_map.show()
plotly.offline.plot(out_map, filename='florida_heatmap.html')

(29239, 2)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



   cluster  Count   Latitude  Longitude
0       -1   8997  27.407659 -81.245352
1        0     71  24.562263 -81.774501
2        1     17  24.667279 -81.379263
3        2     18  24.717138 -81.068229
4        3     17  29.708477 -83.370367


'florida_heatmap.html'

In [15]:
out_map = get_cluster_map(loc_df, miami_box)
out_map.show()
plotly.offline.plot(out_map, filename='miami_heatmap.html')

(6402, 2)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



   cluster  Count   Latitude  Longitude
0       -1   2130  25.912110 -80.263759
1        0     52  25.723009 -80.158201
2        1     14  25.761961 -80.479041
3        2     15  25.533299 -80.493898
4        3     12  25.521729 -80.466915


'miami_heatmap.html'