In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import datetime as dt
import scipy

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist

# visualization
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
import seaborn as sns

# system and utility
import warnings
import os
import io
from IPython.display import IFrame
from google.colab import files

# suppress warnings
# NOTE(review): 'ignore' with no category silences every warning for the whole
# session, including pandas' SettingWithCopyWarning — consider narrowing it.
warnings.filterwarnings('ignore')

# inline
# render matplotlib figures directly in the notebook output
%matplotlib inline

In [None]:
# Lift pandas' display limits so every row and every column is rendered.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Mount Google Drive at /content/drive so the CSV paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# data source:
# Both inputs live in the same Drive folder; the folder is kept in one constant
# so relocating the data means editing a single line.
DATA_DIR = '/content/drive/My Drive/X999'
file_path1 = f'{DATA_DIR}/evictions_covid.csv'  # eviction records
file_path2 = f'{DATA_DIR}/bbl_cleaned.csv'      # building (BBL) attributes

In [None]:
# Read the eviction CSV at file_path1 with pandas' default parsing.
evictions_covid_raw = pd.read_csv(file_path1)

In [None]:
# Work on an independent (deep) copy so the raw frame stays as loaded.
evictions_covid = evictions_covid_raw.copy(deep=True)

In [None]:
# Read the building CSV at file_path2 with pandas' default parsing.
bbl = pd.read_csv(file_path2)

In [None]:
# Independent (deep) working copy of the building frame.
bbl_df = bbl.copy(deep=True)

In [None]:
# Number of columns in the building frame (second entry of the shape tuple).
bbl_df.shape[1]
# correct

97

In [None]:
# Preview the first four building rows.
bbl_df.iloc[:4]

Unnamed: 0,borough,block,lot,community board,census tract 2010,cb2010,schooldist,council district,postcode,firecomp,policeprct,healtharea,sanitboro,sanitsub,address,zonedist1,zonedist2,zonedist3,overlay1,overlay2,spdist1,ltdheight,splitzone,bldgclass,landuse,easements,ownertype,ownername,lotarea,bldgarea,comarea,resarea,officearea,retailarea,garagearea,strgearea,factryarea,otherarea,areasource,numbldgs,numfloors,unitsres,unitstotal,lotfront,lotdepth,bldgfront,bldgdepth,ext,proxcode,irrlotcode,lottype,bsmtcode,assessland,assesstot,exempttot,yearbuilt,yearalter1,yearalter2,histdist,landmark,builtfar,residfar,commfar,facilfar,borocode,bbl,condono,tract2010,xcoord,ycoord,latitude,longitude,zonemap,zmcode,sanborn,taxmap,appbbl,appdate,plutomapid,version,sanitdistrict,healthcenterdistrict,firm07_flag,pfirm15_flag,dcpedited,building_category,building_type,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade
0,BK,8366,222,318.0,696.02,2002.0,22.0,46.0,11234.0,E323,63.0,8822.0,3.0,4E,6815 AVENUE N,R3-1,,,,,,,False,A5,1.0,0.0,,"EAST 69 AVENUE N DEVELOPMENT, LLC",2241.0,1288.0,0.0,1288.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2,1,1.0,27.0,83.0,19.0,33.0,,2.0,False,0.0,2.0,5918.0,5918.0,0.0,2019.0,0.0,0.0,,,0.57,0.5,0.0,1.0,3,3083660222,,69602.0,1008419.0,165883.0,40.621954,-73.912938,23b,,315 067,3.0,3083660000.0,04/26/2019,1,20v5,18.0,35.0,,,,single-family,post-war,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,very small,Q1 (smallest 25%),2010-2020
1,BK,2571,28,301.0,561.0,1005.0,14.0,33.0,11222.0,L106,94.0,100.0,3.0,1A,87 CALYER STREET,M1-2/R6B,,,,,MX-8,,False,A5,1.0,0.0,,85 CALYER STREET LLC,1862.0,3478.0,0.0,3478.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3,1,1.0,18.0,100.0,18.0,60.0,N,3.0,False,5.0,2.0,51000.0,66780.0,0.0,2018.0,0.0,2017.0,,,1.87,2.0,2.0,2.0,3,3025710028,,561.0,995995.0,204223.0,40.727214,-73.957625,12c,,304 033,30902.0,3025710000.0,05/09/2019,1,20v5,1.0,30.0,,,,single-family,post-war,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,medium-small,Q4 (largest 25%),2010-2020
2,BK,3197,8,304.0,429.0,1002.0,32.0,34.0,11237.0,E218,83.0,3200.0,3.0,1B,109 WILSON AVENUE,R6,,,,,,,False,S4,4.0,0.0,,SERLIN BUILDING LIMITED PARTNERSHIP,2500.0,4125.0,1375.0,2750.0,0.0,1375.0,0.0,0.0,0.0,0.0,2.0,1.0,3,4,5.0,25.0,100.0,25.0,55.0,N,0.0,False,3.0,5.0,158850.0,381150.0,45280.0,1931.0,2001.0,0.0,,,1.65,2.43,0.0,4.8,3,3031970008,,429.0,1004619.0,194842.0,40.70145,-73.926539,13b,,309 037,31102.0,,,1,20v5,4.0,34.0,,,,primarily_res_with_mixed_use,pre-war,False,low-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",3-5 units,False,medium-small,Q4 (largest 25%),1930-1939
3,QN,52,7,402.0,7.0,1000.0,30.0,26.0,11101.0,L115,108.0,720.0,4.0,2A,11-43 45 AVENUE,M1-4/R6A,,,,,LIC,,False,C1,2.0,0.0,,"TRIBECA TREASURES, LLC",2500.0,7416.0,380.0,7036.0,0.0,0.0,0.0,0.0,0.0,380.0,2.0,1.0,5,7,7.0,25.0,100.0,25.0,100.0,N,0.0,False,5.0,0.0,10350.0,977850.0,923360.0,1958.0,2007.0,2007.0,,,2.97,3.0,2.0,3.0,4,4000520007,,7.0,998601.0,211689.0,40.747702,-73.948207,9b,,401 019,40101.0,,,1,20v5,2.0,41.0,,,,walk-up,post-war,False,mid-rise,"1947–1969, rent-control","1951–1980, the International Style, Alternativ...","1946–1975, pst war economic boom",6-20 units,True,medium,Q4 (largest 25%),1950-1959


In [None]:
# List the eviction frame's column labels to look for candidate merge keys.
evictions_covid.columns

Index(['primary_key', 'court_index_number', 'docket_number',
       'eviction_address', 'eviction_apartment_number', 'executed_date',
       'borough', 'eviction_postcode', 'ejectment',
       'eviction/legal_possession', 'latitude', 'longitude', 'community_board',
       'council_district', 'census_tract', 'bin', 'bbl', 'nta', 'year',
       'month_year', 'geometry', 'average_year_eviction_count'],
      dtype='object')

In [None]:
# (rows, columns) of the eviction frame.
evictions_covid.shape
# correct, 20 - 2 + 4
# NOTE(review): presumably 20 source columns, 2 dropped, 4 added upstream — confirm.

(6564, 22)

In [None]:
# Is there a `bin` column in the building frame? (DataFrame membership tests column labels.)
'bin' in bbl_df
# therefore use bbl for merge

False

In [None]:
# only check the relevant ones
# Building attributes carried into the eviction merge, keyed by `bbl`.
BBL_FEATURE_COLUMNS = [
    'bbl', 'yearbuilt', 'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea',
    'building_type', 'building_category', 'is_condo', 'floor_category',
    'rent_era', 'architectural_style', 'economic_period',
    'residential_units_category', 'is_llc', 'building_size_category',
    'size_quartile', 'decade',
]
# .copy() makes bbl_cleaned an independent frame rather than a slice of bbl_df,
# so the later `bbl_cleaned['bbl'] = ...` assignments write to a well-defined
# object and do not raise SettingWithCopyWarning.
bbl_cleaned = bbl_df[BBL_FEATURE_COLUMNS].copy()
bbl_cleaned.head()

Unnamed: 0,bbl,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade
0,3083660222,2019.0,A5,2,1,"EAST 69 AVENUE N DEVELOPMENT, LLC",1288.0,post-war,single-family,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,very small,Q1 (smallest 25%),2010-2020
1,3025710028,2018.0,A5,3,1,85 CALYER STREET LLC,3478.0,post-war,single-family,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,medium-small,Q4 (largest 25%),2010-2020
2,3031970008,1931.0,S4,3,4,SERLIN BUILDING LIMITED PARTNERSHIP,4125.0,pre-war,primarily_res_with_mixed_use,False,low-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",3-5 units,False,medium-small,Q4 (largest 25%),1930-1939
3,4000520007,1958.0,C1,5,7,"TRIBECA TREASURES, LLC",7416.0,post-war,walk-up,False,mid-rise,"1947–1969, rent-control","1951–1980, the International Style, Alternativ...","1946–1975, pst war economic boom",6-20 units,True,medium,Q4 (largest 25%),1950-1959
4,3067140055,1931.0,C3,2,4,"RAMBOD, SHAHROKH",2112.0,pre-war,walk-up,False,low-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",3-5 units,False,small,Q3 (50-75%),1930-1939


In [None]:
# Row count of the trimmed building frame.
bbl_cleaned.shape[0]

752619

In [None]:
# Full list of column labels available in the building frame.
bbl_df.columns

Index(['borough', 'block', 'lot', 'community board', 'census tract 2010',
       'cb2010', 'schooldist', 'council district', 'postcode', 'firecomp',
       'policeprct', 'healtharea', 'sanitboro', 'sanitsub', 'address',
       'zonedist1', 'zonedist2', 'zonedist3', 'overlay1', 'overlay2',
       'spdist1', 'ltdheight', 'splitzone', 'bldgclass', 'landuse',
       'easements', 'ownertype', 'ownername', 'lotarea', 'bldgarea', 'comarea',
       'resarea', 'officearea', 'retailarea', 'garagearea', 'strgearea',
       'factryarea', 'otherarea', 'areasource', 'numbldgs', 'numfloors',
       'unitsres', 'unitstotal', 'lotfront', 'lotdepth', 'bldgfront',
       'bldgdepth', 'ext', 'proxcode', 'irrlotcode', 'lottype', 'bsmtcode',
       'assessland', 'assesstot', 'exempttot', 'yearbuilt', 'yearalter1',
       'yearalter2', 'histdist', 'landmark', 'builtfar', 'residfar', 'commfar',
       'facilfar', 'borocode', 'bbl', 'condono', 'tract2010', 'xcoord',
       'ycoord', 'latitude', 'longitude', 'z

In [None]:
# Eviction column labels, shown next to the building columns for comparison.
evictions_covid.columns

Index(['primary_key', 'court_index_number', 'docket_number',
       'eviction_address', 'eviction_apartment_number', 'executed_date',
       'borough', 'eviction_postcode', 'ejectment',
       'eviction/legal_possession', 'latitude', 'longitude', 'community_board',
       'council_district', 'census_tract', 'bin', 'bbl', 'nta', 'year',
       'month_year', 'geometry', 'average_year_eviction_count'],
      dtype='object')

In [None]:
# Merge-key diagnostics: dtype of `bbl` on both sides, then distinct-value counts
# (NaN counts as one value, matching len(Series.unique())).
# NOTE(review): the 4th entry counts distinct `bin` in the eviction frame, not `bbl`.
(
    bbl_cleaned['bbl'].dtype,
    evictions_covid['bbl'].dtype,
    bbl_cleaned['bbl'].nunique(dropna=False),
    evictions_covid['bin'].nunique(dropna=False),
    bbl_df['postcode'].nunique(dropna=False),
    bbl_df['bbl'].nunique(dropna=False),
    evictions_covid['eviction_postcode'].nunique(dropna=False),
)
# cleaned
# makes sense to use bbl as merge base

(dtype('int64'), dtype('int64'), 752619, 5326, 183, 752619, 176)

In [None]:
# Element type of the `bbl` value at index label 0 on each side.
type(evictions_covid.loc[0, 'bbl']), type(bbl_cleaned.loc[0, 'bbl'])

(numpy.int64, numpy.int64)

In [None]:
# convert both to strings
# not strictly necessary, as both columns are already int64; string keys are
# what the set comparisons and the merge below operate on
evictions_covid = evictions_covid.assign(bbl=evictions_covid['bbl'].astype(str))
# assign() returns a new frame instead of writing into bbl_cleaned in place;
# bbl_cleaned may be a slice of bbl_df, where in-place column writes trigger
# SettingWithCopyWarning.
bbl_cleaned = bbl_cleaned.assign(bbl=bbl_cleaned['bbl'].astype(str))
type(evictions_covid['bbl'][0]), type(bbl_cleaned['bbl'][0])

(str, str)

In [71]:
# Distinct BBLs on each side, and the ones present in both.
eviction_bbls = set(evictions_covid['bbl'])
building_bbls = set(bbl_cleaned['bbl'])
common_bbls = eviction_bbls & building_bbls

In [75]:
# (label, value) pairs describing the BBL overlap, split into two parallel
# lists for the summary table.
summary_rows = [
    ('evictions data unique BBLs', len(eviction_bbls)),
    ('building data unique BBLs', len(building_bbls)),
    ('number of common BBLs', len(common_bbls)),
    ('percentage of eviction BBLs in building data',
     f"{len(common_bbls)/len(eviction_bbls)*100:.2f}%"),
]
data = {
    'columns': [label for label, _value in summary_rows],
    'value': [value for _label, value in summary_rows],
}

In [76]:
# Tabulate the overlap summary (dict keys become the column labels).
df = pd.DataFrame.from_dict(data)
df
# pretty good overlapping rate, 95.33%

Unnamed: 0,columns,value
0,evictions data unique BBLs,5140
1,building data unique BBLs,752619
2,number of common BBLs,4900
3,percentage of eviction BBLs in building data,95.33%


In [None]:
# Eyeball the first ten keys from each side.
eviction_sample = evictions_covid['bbl'].iloc[:10].tolist()
building_sample = bbl_cleaned['bbl'].iloc[:10].tolist()
print("eveiction bbls: ", eviction_sample)
print("building bbls: ", building_sample)
# same length and same type of bbls.

eveiction bbls:  ['2032140141', '4031560133', '3051370021', '3011850034', '1010907501', '4033220043', '1021330013', '2025180001', '3042920061', '2024080052']
building bbls:  ['3083660222', '3025710028', '3031970008', '4000520007', '3067140055', '3032840022', '3079650040', '4101330113', '3082137501', '4007030017']


In [None]:
# Frequency of BBL string lengths on each side.
eviction_length_counts = evictions_covid['bbl'].str.len().value_counts()
building_length_counts = bbl_cleaned['bbl'].str.len().value_counts()
print("evictions BBL string length:", eviction_length_counts)
print("building BBL string length:", building_length_counts)

evictions BBL string length: bbl
10    6564
Name: count, dtype: int64
building BBL string length: bbl
10    752619
Name: count, dtype: int64


In [None]:
# Look for one-character BBLs (such as a '0' placeholder) among the evictions.
evictions_covid['bbl'] = evictions_covid['bbl'].astype(str)
is_single_char = evictions_covid['bbl'].str.len().eq(1)
length_1_bbls = evictions_covid.loc[is_single_char]
length_1_bbls
# no bbl is 0 in covid dataframe

Unnamed: 0,primary_key,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,year,month_year,geometry,average_year_eviction_count


In [None]:
# Keep only rows whose BBL is not the literal string '0'.
evictions_covid = evictions_covid.loc[evictions_covid['bbl'].ne('0')]

In [None]:
# no longer necessary
# def clean_bbl(bbl_val):
#     bbl_as_string = str(bbl_val)
#     digits_only = ""
#     for character in bbl_as_string:
#         if character.isdigit():
#             digits_only = digits_only + character
#     first_ten_digits = digits_only[:10]
#     final_bbl = first_ten_digits.zfill(10)
#     return final_bbl

In [None]:
# Count missing `bbl` values on each side of the planned merge.
evictions_covid['bbl'].isna().sum(), bbl_cleaned['bbl'].isna().sum()

(np.int64(0), np.int64(0))

In [None]:
# Recompute the key overlap after cleaning.
eviction_bbls_set = set(evictions_covid['bbl'])
building_bbls_set = set(bbl_cleaned['bbl'])
common_bbls_clean = eviction_bbls_set.intersection(building_bbls_set)

# Share of eviction BBLs found in the building data, derived from the sets
# rather than hand-typed counts so it cannot drift from the data.
matched_share = (
    len(common_bbls_clean) / len(eviction_bbls_set)
    if eviction_bbls_set else float('nan')
)

(
    f"number of common BBLs after thorough cleaning: {len(common_bbls_clean)}",
    len(bbl_cleaned['bbl'].unique()),
    len(evictions_covid['bbl'].unique()),
    len(eviction_bbls_set),
    len(building_bbls_set),
    matched_share,  # 95.33% common bbls, pretty good ratio of matched bbls in eviction data
)

('number of common BBLs after thorough cleaning: 4900',
 752619,
 5140,
 5140,
 752619,
 0.953307392996109)

In [None]:
# easy cleaning without functions: cast both keys to str and trim whitespace.
# assign() builds new frames instead of writing into existing ones, which
# avoids chained-assignment writes on a frame that may be a slice of another.
evictions_covid = evictions_covid.assign(
    bbl=evictions_covid['bbl'].astype(str).str.strip()
)
bbl_cleaned = bbl_cleaned.assign(
    bbl=bbl_cleaned['bbl'].astype(str).str.strip()
)

In [None]:
# Key overlap once more, after the strip() standardization.
eviction_bbls_std = set(evictions_covid['bbl'])
building_bbls_std = set(bbl_cleaned['bbl'])
common_bbls_std = eviction_bbls_std & building_bbls_std
print(f"number of common bbl after standardization: {len(common_bbls_std)}")

number of common bbl after standardization: 4900


In [None]:
# Distinct non-null postcodes on each side (nunique() excludes NaN by default).
bbl_df.postcode.nunique(), evictions_covid.eviction_postcode.nunique()

(182, 176)

In [None]:
# Postcode dtype on each side, followed by the missing-value count on each side.
(
    bbl_df['postcode'].dtype,
    evictions_covid['eviction_postcode'].dtype,
    bbl_df['postcode'].isna().sum(),
    evictions_covid['eviction_postcode'].isna().sum(),
)

(dtype('float64'), dtype('int64'), np.int64(44), np.int64(0))

### **We could also merge on ZIP code, but BBL is the better key: it is a more granular unit (an individual lot rather than a whole ZIP code area), and merging on it keeps more records. After the merge we still have the ZIP code column from the `evictions_covid` dataframe, so it is safe to ignore the fact that `bbl_df` has some NaN ZIP codes. See the next few cells (before the merge).**

In [None]:
# Scratch (deep) copy of the building frame for the postcode NaN check.
bbl_df_copy = bbl_df.copy(deep=True)

In [None]:
# Keep only the rows of the scratch copy that have a postcode.
bbl_df_copy = bbl_df_copy[bbl_df_copy['postcode'].notna()]

In [None]:
# Shapes with and without the NaN-postcode rows, then whether keeping all BBL
# rows retains more records. The comparison uses the live row counts instead of
# hand-typed numbers.
(
    bbl_df_copy.shape,
    bbl_cleaned.shape,
    len(bbl_cleaned) > len(bbl_df_copy),
)

((752575, 97), (752619, 19), True)

In [None]:
# Attach building attributes to every eviction record.
# how='left' keeps every eviction row; evictions with no matching BBL get NaN
# in the building columns.
# validate='many_to_one' makes pandas raise MergeError if a BBL appears more
# than once in the building data, which would otherwise silently duplicate
# eviction rows.
cleaned_merge = pd.merge(
    evictions_covid,
    bbl_cleaned[['bbl', 'yearbuilt', 'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea','building_type', 'building_category',
            'is_condo', 'floor_category','rent_era', 'architectural_style', 'economic_period', 'residential_units_category', 'is_llc',
                 'building_size_category', 'size_quartile', 'decade']],
    on='bbl',
    how='left',
    validate='many_to_one')

In [None]:
# Matched-row count plus the three frame lengths.
non_null_building_types = cleaned_merge['building_type'].notna().sum()
(
    f"number of non null building_type values in cleaned merge: {non_null_building_types}",
    len(cleaned_merge),
    len(bbl_cleaned),
    len(evictions_covid),
)
# there are way more buildings than the ones that had the evictions as expected
# and the merged df keeps the same records as the eviction ones.

('number of non null building_type values in cleaned merge: 6218',
 6564,
 752619,
 6564)

In [None]:
# Preview the first five merged rows (eviction columns followed by building columns).
display(cleaned_merge.head())
# shows all the columns

Unnamed: 0,primary_key,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,year,month_year,geometry,average_year_eviction_count,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade
0,004123/20_209969,004123/20,209969,2541 A GRAND AVE,ROOM 3B,2022-08-22,BRONX,10468,Not an Ejectment,Possession,40.865396,-73.901317,7.0,14.0,265.0,2113173,2032140141,Kingsbridge Heights,2022,2022-08,POINT (-73.901317 40.865396),0.2,2004.0,C0,3.0,3.0,MONJU SARKER,3420.0,post-war,walk-up,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","1991–2008, modern economic growth",3-5 units,False,medium-small,Q4 (largest 25%),2000-2009
1,0050153/20_106030,0050153/20,106030,98-05 67TH AVENUE,12F,2022-04-14,QUEENS,11375,Not an Ejectment,Possession,40.724241,-73.855552,6.0,29.0,71306.0,4074666,4031560133,Forest Hills,2022,2022-04,POINT (-73.855552 40.724241),0.2,1960.0,D3,13.0,181.0,MARSEILLES LEASING LIMITED PARTNERSHIP,177710.0,post-war,elevator,False,high-rise,"1947–1969, rent-control","1951–1980, the International Style, Alternativ...","1946–1975, pst war economic boom",100+ units,False,mega,Q4 (largest 25%),1960-1969
2,0052002/19_101926,0052002/19,101926,199 VERONICA PLACE,1ST FLOOR,2020-03-02,BROOKLYN,11226,Not an Ejectment,Possession,40.645404,-73.952578,17.0,40.0,792.0,3117969,3051370021,Erasmus,2020,2020-03,POINT (-73.952578 40.645404),0.6,1920.0,B3,2.0,2.0,"AANS, LLC.",1496.0,pre-war,two-family,False,low-rise,"Pre-1947, pre-rent-control","1900–1920, Beaux-Arts","Pre-1929, pre-great depression",2-unit,True,very small,Q2 (25-50%),1920-1929
3,0057757/18_100889,0057757/18,100889,302 EASTERN PARKWAY,4B,2020-02-03,BROOKLYN,11225,Not an Ejectment,Possession,40.670832,-73.958843,9.0,35.0,213.0,3029673,3011850034,Crown Heights South,2020,2020-02,POINT (-73.958843 40.670832),0.8,1923.0,D1,6.0,48.0,302 EASTERN CORP,42984.0,pre-war,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,False,very large,Q4 (largest 25%),1920-1929
4,0058466/19_104327,0058466/19,104327,635 WEST 42ND STREET,UNIT 18B,2020-03-12,MANHATTAN,10036,Not an Ejectment,Possession,40.761463,-73.999816,4.0,3.0,129.0,1087539,1010907501,Clinton,2020,2020-03,POINT (-73.999816 40.761463),0.2,,,,,,,,,,,,,,,,,,


In [None]:
# (rows, columns) of the merged frame.
cleaned_merge.shape

(6564, 40)

In [None]:
# check nan:
null_counts = cleaned_merge.isna().sum()
# Share of records with at least one missing value, computed from the frame
# rather than a hand-typed count so it stays correct if the inputs change.
rows_with_nan_share = float(cleaned_merge.isna().any(axis=1).mean())
null_counts, rows_with_nan_share
# 5.27% of the records have nans

(primary_key                      0
 court_index_number               0
 docket_number                    0
 eviction_address                 0
 eviction_apartment_number        0
 executed_date                    0
 borough                          0
 eviction_postcode                0
 ejectment                        0
 eviction/legal_possession        0
 latitude                         0
 longitude                        0
 community_board                  0
 council_district                 0
 census_tract                     0
 bin                              0
 bbl                              0
 nta                              0
 year                             0
 month_year                       0
 geometry                         0
 average_year_eviction_count      0
 yearbuilt                      346
 bldgclass                      346
 numfloors                      346
 unitsres                       346
 ownername                      346
 bldgarea                   

In [None]:
# Per-column missing-value counts, largest first.
cleaned_merge.isna().sum().sort_values(ascending=False)

Unnamed: 0,0
rent_era,346
architectural_style,346
economic_period,346
residential_units_category,346
is_llc,346
building_size_category,346
yearbuilt,346
bldgclass,346
size_quartile,346
decade,346


## **At this stage, we keep the NaN values carried over from bbl.csv. For analyses that use building features, we will drop the affected rows. For analyses of evictions alone, or of evictions combined with other relevant data, we will work only with the columns that contain no NaNs.**

In [None]:
# Count rows that are exact duplicates of an earlier row across all columns.
cleaned_merge.duplicated().sum()
# no duplicates

np.int64(0)

In [None]:
# Persist the merged frame to Drive; index=False omits the row index from the file.
cleaned_merge.to_csv('/content/drive/My Drive/X999/bbl_evictions_merged_covid.csv', index=False)

In [None]:
# Path of the merged CSV written above.
# NOTE(review): this rebinds file_path1, which earlier pointed at the raw eviction
# CSV — re-running the eviction load cell after this one would read the merged
# file instead. Consider a distinct name.
file_path1 = '/content/drive/My Drive/X999/bbl_evictions_merged_covid.csv'

In [None]:
# Read the merged CSV back to verify the round trip.
# NOTE(review): `df` previously held the BBL-overlap summary table; it is rebound here.
# NOTE(review): `bbl` was a string column when written; read_csv re-infers dtypes
# (presumably back to int64) — pass dtype={'bbl': str} if string keys are needed.
df = pd.read_csv(file_path1)

In [None]:
# Column labels and (rows, columns) of the reloaded frame, for comparison with cleaned_merge.
df.columns, df.shape
# correct

(Index(['primary_key', 'court_index_number', 'docket_number',
        'eviction_address', 'eviction_apartment_number', 'executed_date',
        'borough', 'eviction_postcode', 'ejectment',
        'eviction/legal_possession', 'latitude', 'longitude', 'community_board',
        'council_district', 'census_tract', 'bin', 'bbl', 'nta', 'year',
        'month_year', 'geometry', 'average_year_eviction_count', 'yearbuilt',
        'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea',
        'building_type', 'building_category', 'is_condo', 'floor_category',
        'rent_era', 'architectural_style', 'economic_period',
        'residential_units_category', 'is_llc', 'building_size_category',
        'size_quartile', 'decade'],
       dtype='object'),
 (6564, 40))