### Pulling in data

In [39]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

In [40]:
# Load variables from a local .env file (if present) into the process environment,
# then build the SQLAlchemy engine from the DB_URL connection string.
load_dotenv()
db = os.getenv('DB_URL')
# os.getenv returns None when the variable is missing; fail here with a clear
# message instead of letting create_engine raise a less obvious error.
if not db:
    raise RuntimeError("DB_URL is not set; define it in the environment or in a .env file")
engine = create_engine(db)

In [41]:
# Search area: keep rows whose hav_dist(...) from this centre point is <= RADIUS.
# hav_dist is a database-side function not defined in this notebook; RADIUS is
# presumably in metres (1600 m is roughly one mile) — confirm against its definition.
CENTER_LAT = 39.9567
CENTER_LON = -75.23375
RADIUS = 1600

# Only the numeric constants above are interpolated here, never external input.
sql = f'''

select
*
from west_philly_targets
where hav_dist(latitude,longitude, {CENTER_LAT},{CENTER_LON}) <= {RADIUS}

'''

# Run the query and load the full result set into a DataFrame.
df = pd.read_sql(sql,engine)

In [42]:
# Preview the first rows of the query result.
df.head()

Unnamed: 0,address,zip_code,coordinates,market_value,opa_account_number,opa_owner,rental_license_number,most_recent_license_issue_date,license_expiration_date,license_inactive_date,...,properties_recording_date,sale_price,taxable_building,taxable_land,total_area,total_livable_area,zoning,target_type,longitude,latitude
0,141 COBBS CREEK PKWY,19139,"[-75.246952, 39.960049]",69300,31206500,"COACH ELOISE, COACH HASTINGS",703482,2016-06-01T13:00:51Z,2017-05-31T00:00:00Z,2017-07-30T00:00:00Z,...,2006-08-01T00:00:00Z,10,58910,10390,1578.99,1334.0,RM1,smaller unit,-75.246952,39.960049
1,247 COBBS CREEK PKWY,19139,"[-75.247354, 39.958344]",136800,31207800,ANDERSON CHRISTINA,727983,2020-06-17T00:00:00Z,2021-02-07T00:00:00Z,,...,2016-12-15T00:00:00Z,199000,72306,19494,1280.0,1432.0,RM1,smaller unit,-75.247354,39.958344
2,115 DEARBORN ST,19139-0000,"[-75.221855, 39.961644]",38400,441082700,BUTLER HARRIET,226456,2003-06-12T00:00:00Z,2004-02-29T00:00:00Z,,...,2004-07-08T00:00:00Z,45000,0,0,637.5,900.0,RSA5,smaller unit,-75.221855,39.961644
3,12 DEARBORN ST,19139-0000,"[-75.222452, 39.960069]",37400,441085500,CHATHAM HOLDINGS SERVICES,736197,2020-08-28T00:00:00Z,2021-04-20T00:00:00Z,,...,2017-03-09T00:00:00Z,26500,31790,5610,667.5,900.0,RSA5,smaller unit,-75.222452,39.960069
4,120 DEARBORN ST,19139-0000,"[-75.222097, 39.961756]",38400,441088800,WALKER ARCELLA,367192,2020-03-11T00:00:00Z,2021-02-28T00:00:00Z,,...,2006-05-10T00:00:00Z,1,32640,5760,637.5,900.0,RSA5,smaller unit,-75.222097,39.961756


### Clustering

In [43]:
from sklearn.cluster import KMeans

In [44]:
# Key the rows by street address. Guarded so that re-running this cell does not
# raise KeyError once 'address' has already been moved out of the columns.
if df.index.name != 'address':
    df = df.set_index('address')
# Feature columns used for clustering.
cols = ['latitude','longitude']

In [45]:
# Fit 5 clusters on the coordinate columns; random_state fixes the initialisation.
# n_init is pinned to 10 because newer scikit-learn releases changed the default
# to 'auto', which would otherwise change the resulting clusters between versions.
# NOTE(review): this clusters on raw degrees, so a degree of longitude is weighted
# the same as a degree of latitude — consider projecting first if shape matters.
kmeans = KMeans(n_clusters = 5,random_state=0,n_init=10).fit(df[cols])

In [48]:
# Assign a cluster label to every row. The features are passed explicitly (the same
# columns the model was fitted on) instead of the undefined name `X`, which only
# existed as leftover kernel state and raises NameError on a fresh Run All.
clusters = kmeans.predict(df[cols])
# Wrap the labels in a one-column frame that shares df's index.
clusters = pd.DataFrame({'cluster':list(clusters)},index=df.index)

In [52]:
# Append the cluster label as an extra column next to the original data.
final = pd.concat(objs=[df, clusters], axis='columns')

In [58]:
# Number of rows with a non-null `coordinates` value in each cluster.
final.groupby('cluster')['coordinates'].count()

cluster
0    2193
1    1896
2    2248
3    1987
4    1439
Name: coordinates, dtype: int64

In [59]:
# Preview the combined frame; the `cluster` column is appended last.
final.head()

Unnamed: 0_level_0,zip_code,coordinates,market_value,opa_account_number,opa_owner,rental_license_number,most_recent_license_issue_date,license_expiration_date,license_inactive_date,license_status,...,sale_price,taxable_building,taxable_land,total_area,total_livable_area,zoning,target_type,longitude,latitude,cluster
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141 COBBS CREEK PKWY,19139,"[-75.246952, 39.960049]",69300,31206500,"COACH ELOISE, COACH HASTINGS",703482,2016-06-01T13:00:51Z,2017-05-31T00:00:00Z,2017-07-30T00:00:00Z,Inactive,...,10,58910,10390,1578.99,1334.0,RM1,smaller unit,-75.246952,39.960049,0
247 COBBS CREEK PKWY,19139,"[-75.247354, 39.958344]",136800,31207800,ANDERSON CHRISTINA,727983,2020-06-17T00:00:00Z,2021-02-07T00:00:00Z,,Active,...,199000,72306,19494,1280.0,1432.0,RM1,smaller unit,-75.247354,39.958344,3
115 DEARBORN ST,19139-0000,"[-75.221855, 39.961644]",38400,441082700,BUTLER HARRIET,226456,2003-06-12T00:00:00Z,2004-02-29T00:00:00Z,,Inactive,...,45000,0,0,637.5,900.0,RSA5,smaller unit,-75.221855,39.961644,2
12 DEARBORN ST,19139-0000,"[-75.222452, 39.960069]",37400,441085500,CHATHAM HOLDINGS SERVICES,736197,2020-08-28T00:00:00Z,2021-04-20T00:00:00Z,,Active,...,26500,31790,5610,667.5,900.0,RSA5,smaller unit,-75.222452,39.960069,2
120 DEARBORN ST,19139-0000,"[-75.222097, 39.961756]",38400,441088800,WALKER ARCELLA,367192,2020-03-11T00:00:00Z,2021-02-28T00:00:00Z,,Active,...,1,32640,5760,637.5,900.0,RSA5,smaller unit,-75.222097,39.961756,2


In [66]:
# Shape of the subset of rows with more than 10 units.
# number_of_units is cast to float before comparing — presumably it is not
# stored as a numeric column; confirm against the table schema.
unit_counts = final['number_of_units'].astype(float)
final.loc[unit_counts.gt(10)].shape

(12, 36)

In [68]:
# Persist the labelled rows to the database. `address` was moved into the index
# earlier in the notebook and index=False does not write the index, so the index
# is reset first; otherwise the saved table would lose every row's address.
final.reset_index().to_sql(
    'west_philly_clusters_20210211',
    engine,
    if_exists='replace',  # overwrite the table left by any previous run
    index=False,
    chunksize=10000,
    method='multi',
)

### Mapping it all out

In [70]:
import keplergl
import geopandas as gp

In [74]:
# Keep only what the map needs: the coordinates and the cluster label.
points = final[['latitude','longitude','cluster']]
# Build point geometries (x = longitude, y = latitude) and attach them to the rows.
geometry = gp.points_from_xy(points['longitude'], points['latitude'])
mapped = gp.GeoDataFrame(points, geometry=geometry)
mapped.head()

Unnamed: 0_level_0,latitude,longitude,cluster,geometry
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
141 COBBS CREEK PKWY,39.960049,-75.246952,0,POINT (-75.24695 39.96005)
247 COBBS CREEK PKWY,39.958344,-75.247354,3,POINT (-75.24735 39.95834)
115 DEARBORN ST,39.961644,-75.221855,2,POINT (-75.22186 39.96164)
12 DEARBORN ST,39.960069,-75.222452,2,POINT (-75.22245 39.96007)
120 DEARBORN ST,39.961756,-75.222097,2,POINT (-75.22210 39.96176)


In [75]:
# Create the Kepler map and add one named dataset per cluster.
m = keplergl.KeplerGl()

# Loop over the labels actually present instead of repeating one call per cluster,
# so the map follows the number of clusters automatically.
# Labels are 0-based; dataset names are 1-based ("Cluster 1", "Cluster 2", ...).
for label in sorted(mapped.cluster.unique()):
    m.add_data(data=mapped.loc[mapped.cluster==label],name=f"Cluster {label + 1}")

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [79]:
# Display the map widget as this cell's output.
m

KeplerGl(config={'version': 'v1', 'config': {'visState': {'filters': [], 'layers': [{'id': 'lrwaer9', 'type': …

In [80]:
# Write the map to an HTML file; the file name is a relative path.
m.save_to_html(file_name="west_philly_turf_cluster_attempt.html")

Map saved to west_philly_turf_cluster_attempt.html!
