In [3]:
import pandas as pd
import geopandas as gpd
import numpy as np
import datetime as dt
import scipy

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist

# visualization
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
import seaborn as sns

# system and utility
import warnings
import os
import io
from IPython.display import IFrame
from google.colab import files

# suppress warnings
warnings.filterwarnings('ignore')

# inline
%matplotlib inline

In [118]:
# Lift pandas' display truncation for both rows and columns so that
# wide/long frames are rendered in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [4]:
# Colab-only: mount Google Drive at /content/drive; the CSV paths used by the
# following cells live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# data source:
#   file_path1 -> evictions table, file_path2 -> building (bbl) table
file_path1, file_path2 = (
    '/content/drive/My Drive/X999/evictions_pre_post_covid.csv',
    '/content/drive/My Drive/X999/bbl_cleaned.csv',
)

In [6]:
evictions_pre_post_raw = pd.read_csv(file_path1)

In [7]:
evictions_pre_post = evictions_pre_post_raw.copy()

In [8]:
bbl = pd.read_csv(file_path2)

In [9]:
bbl_df = bbl.copy()

In [10]:
len(list(bbl_df.columns))
# correct

97

In [11]:
bbl_df.head(4)

Unnamed: 0,borough,block,lot,community board,census tract 2010,cb2010,schooldist,council district,postcode,firecomp,...,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade
0,BK,8366,222,318.0,696.02,2002.0,22.0,46.0,11234.0,E323,...,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,very small,Q1 (smallest 25%),2010-2020
1,BK,2571,28,301.0,561.0,1005.0,14.0,33.0,11222.0,L106,...,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,medium-small,Q4 (largest 25%),2010-2020
2,BK,3197,8,304.0,429.0,1002.0,32.0,34.0,11237.0,E218,...,False,low-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",3-5 units,False,medium-small,Q4 (largest 25%),1930-1939
3,QN,52,7,402.0,7.0,1000.0,30.0,26.0,11101.0,L115,...,False,mid-rise,"1947–1969, rent-control","1951–1980, the International Style, Alternativ...","1946–1975, pst war economic boom",6-20 units,True,medium,Q4 (largest 25%),1950-1959


In [None]:
# list(bbl_df.columns)

In [15]:
evictions_pre_post.shape
# correct 20 - 2 + 4

(76718, 22)

In [17]:
evictions_pre_post.columns

Index(['primary_key', 'court_index_number', 'docket_number',
       'eviction_address', 'eviction_apartment_number', 'executed_date',
       'borough', 'eviction_postcode', 'ejectment',
       'eviction/legal_possession', 'latitude', 'longitude', 'community_board',
       'council_district', 'census_tract', 'bin', 'bbl', 'nta', 'year',
       'month_year', 'geometry', 'average_year_eviction_count'],
      dtype='object')

In [18]:
'bin' in bbl_df.columns
# therefore use bbl for merge

False

In [19]:
# Keep only the building attributes that are relevant for the eviction analysis.
# .copy() makes bbl_cleaned an independent frame: later cells assign to
# bbl_cleaned['bbl'], and doing that on a bare column slice of bbl_df raises
# pandas' SettingWithCopyWarning (hidden here by the global warnings filter).
bbl_cleaned = bbl_df[['bbl', 'yearbuilt', 'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea','building_type', 'building_category', 'is_condo', 'floor_category',
           'rent_era', 'architectural_style', 'economic_period',
           'residential_units_category', 'is_llc', 'building_size_category', 'size_quartile', 'decade']].copy()
bbl_cleaned.head()

Unnamed: 0,bbl,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade
0,3083660222,2019.0,A5,2,1,"EAST 69 AVENUE N DEVELOPMENT, LLC",1288.0,post-war,single-family,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,very small,Q1 (smallest 25%),2010-2020
1,3025710028,2018.0,A5,3,1,85 CALYER STREET LLC,3478.0,post-war,single-family,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,medium-small,Q4 (largest 25%),2010-2020
2,3031970008,1931.0,S4,3,4,SERLIN BUILDING LIMITED PARTNERSHIP,4125.0,pre-war,primarily_res_with_mixed_use,False,low-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",3-5 units,False,medium-small,Q4 (largest 25%),1930-1939
3,4000520007,1958.0,C1,5,7,"TRIBECA TREASURES, LLC",7416.0,post-war,walk-up,False,mid-rise,"1947–1969, rent-control","1951–1980, the International Style, Alternativ...","1946–1975, pst war economic boom",6-20 units,True,medium,Q4 (largest 25%),1950-1959
4,3067140055,1931.0,C3,2,4,"RAMBOD, SHAHROKH",2112.0,pre-war,walk-up,False,low-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",3-5 units,False,small,Q3 (50-75%),1930-1939


In [20]:
len(bbl_cleaned)

752619

In [30]:
bbl_df.columns

Index(['borough', 'block', 'lot', 'community board', 'census tract 2010',
       'cb2010', 'schooldist', 'council district', 'postcode', 'firecomp',
       'policeprct', 'healtharea', 'sanitboro', 'sanitsub', 'address',
       'zonedist1', 'zonedist2', 'zonedist3', 'overlay1', 'overlay2',
       'spdist1', 'ltdheight', 'splitzone', 'bldgclass', 'landuse',
       'easements', 'ownertype', 'ownername', 'lotarea', 'bldgarea', 'comarea',
       'resarea', 'officearea', 'retailarea', 'garagearea', 'strgearea',
       'factryarea', 'otherarea', 'areasource', 'numbldgs', 'numfloors',
       'unitsres', 'unitstotal', 'lotfront', 'lotdepth', 'bldgfront',
       'bldgdepth', 'ext', 'proxcode', 'irrlotcode', 'lottype', 'bsmtcode',
       'assessland', 'assesstot', 'exempttot', 'yearbuilt', 'yearalter1',
       'yearalter2', 'histdist', 'landmark', 'builtfar', 'residfar', 'commfar',
       'facilfar', 'borocode', 'bbl', 'condono', 'tract2010', 'xcoord',
       'ycoord', 'latitude', 'longitude', 'z

In [34]:
evictions_pre_post.columns

Index(['primary_key', 'court_index_number', 'docket_number',
       'eviction_address', 'eviction_apartment_number', 'executed_date',
       'borough', 'eviction_postcode', 'ejectment',
       'eviction/legal_possession', 'latitude', 'longitude', 'community_board',
       'council_district', 'census_tract', 'bin', 'bbl', 'nta', 'year',
       'month_year', 'geometry', 'average_year_eviction_count'],
      dtype='object')

In [36]:
# Compare dtypes and cardinalities of the candidate merge keys.
# nunique(dropna=False) counts NaN as a distinct value, i.e. the same number
# as len(series.unique()).
(
    bbl_cleaned['bbl'].dtype,
    evictions_pre_post['bbl'].dtype,
    bbl_cleaned['bbl'].nunique(dropna=False),
    evictions_pre_post['bin'].nunique(dropna=False),
    bbl_df['postcode'].nunique(dropna=False),
    bbl_df['bbl'].nunique(dropna=False),
    evictions_pre_post['eviction_postcode'].nunique(dropna=False),
)
# cleaned
# makes sense to use bbl as merge base

(dtype('int64'), dtype('int64'), 752619, 36029, 183, 752619, 204)

In [37]:
type(evictions_pre_post['bbl'][0]), type(bbl_cleaned['bbl'][0])

(numpy.int64, numpy.int64)

In [77]:
# convert both bbl columns to strings
# (not strictly necessary for the merge itself, as both columns were already
#  int64; the string form is what the .str length checks in later cells use)
evictions_pre_post['bbl'] = evictions_pre_post['bbl'].astype(str)
bbl_cleaned['bbl'] = bbl_cleaned['bbl'].astype(str)
# element type of the value at index label 0 in each column after the cast
type(evictions_pre_post['bbl'][0]), type(bbl_cleaned['bbl'][0])

(str, str)

In [78]:
# Overlap of bbl keys between the two tables, computed on unique values.
eviction_bbls = set(evictions_pre_post['bbl'])
building_bbls = set(bbl_cleaned['bbl'])
common_bbls = eviction_bbls & building_bbls

print(f"evictions data unique bbls: {len(eviction_bbls)}")
print(f"building data unqiue bbls: {len(building_bbls)}")
print(f"number of common bbls: {len(common_bbls)}")
print(f"percentage of eviction bbls found in building data: {len(common_bbls)/len(eviction_bbls)*100:.2f}%")
# NOTE(review): an earlier remark here said there were no common bbls at first;
# the saved output shows that 95.89% of the eviction bbls are matched.

evictions data unique bbls: 33766
building data unqiue bbls: 752619
number of common bbls: 32379
percentage of eviction bbls found in building data: 95.89%


In [79]:
# Eyeball the first ten keys of each table side by side.
print("eveiction bbls: ", evictions_pre_post['bbl'].iloc[:10].tolist())
print("building bbls: ", bbl_cleaned['bbl'].iloc[:10].tolist())
# both samples show bbls of the same length and the same type.

eveiction bbls:  ['3037420029', '3057940012', '3057820030', '2032510420', '2025770038', '2031770041', '2033020071', '2030500008', '3040560040', '3087110015']
building bbls:  ['3083660222', '3025710028', '3031970008', '4000520007', '3067140055', '3032840022', '3079650040', '4101330113', '3082137501', '4007030017']


In [80]:
print("evictions BBL string length:", evictions_pre_post['bbl'].str.len().value_counts())
print("building BBL string length:", bbl_cleaned['bbl'].str.len().value_counts())

evictions BBL string length: bbl
10    76715
Name: count, dtype: int64
building BBL string length: bbl
10    752619
Name: count, dtype: int64


In [81]:
# Look for evictions whose bbl string is a single character.
# The cast is repeated here so the .str accessor works even if this cell is
# run on its own.
evictions_pre_post['bbl'] = evictions_pre_post['bbl'].astype(str)
length_1_bbls = evictions_pre_post[evictions_pre_post['bbl'].str.len() == 1]
length_1_bbls
# these evictions' bbl is 0. Will remove them, as they were not properly recorded and there were only 3 of them.
# NOTE(review): the saved output of this cell is empty, presumably because the
# removal in the next cell had already been applied in an earlier run -- confirm
# with Restart & Run All.

Unnamed: 0,primary_key,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,...,community_board,council_district,census_tract,bin,bbl,nta,year,month_year,geometry,average_year_eviction_count


In [82]:
# Drop the evictions whose bbl was recorded as '0'.
# .copy() makes the filtered result an independent frame, so the later
# `evictions_pre_post['bbl'] = ...` assignment does not write into a filtered
# slice (which raises SettingWithCopyWarning).
evictions_pre_post = evictions_pre_post[evictions_pre_post['bbl'] != '0'].copy()

In [83]:
# no longer necessary
# def clean_bbl(bbl_val):
#     bbl_as_string = str(bbl_val)
#     digits_only = ""
#     for character in bbl_as_string:
#         if character.isdigit():
#             digits_only = digits_only + character
#     first_ten_digits = digits_only[:10]
#     final_bbl = first_ten_digits.zfill(10)
#     return final_bbl

In [84]:
evictions_pre_post['bbl'].isna().sum(), bbl_cleaned['bbl'].isna().sum()

(np.int64(0), np.int64(0))

In [85]:
# Re-check the key overlap after the '0' bbls were removed.
# (The clean_bbl helper is no longer applied; bbl values are compared as-is.)
eviction_bbls_set = set(evictions_pre_post['bbl'])
building_bbls_set = set(bbl_cleaned['bbl'])
common_bbls_clean = eviction_bbls_set.intersection(building_bbls_set)

# Tuple displayed as the cell output:
#   message, unique building bbls, unique eviction bbls,
#   size of the eviction set, size of the building set,
#   share of eviction bbls that also exist in the building data.
# The share is computed from the sets instead of being typed in by hand, so it
# stays correct when the inputs change.
(
    f"number of common BBLs after thorough cleaning: {len(common_bbls_clean)}",
    len(bbl_cleaned['bbl'].unique()),
    len(evictions_pre_post['bbl'].unique()),
    len(eviction_bbls_set),
    len(building_bbls_set),
    len(common_bbls_clean) / len(eviction_bbls_set),
)

('number of common BBLs after thorough cleaning: 32379',
 752619,
 33766,
 33766,
 752619,
 0.9589231771604573)

In [75]:
# len(evictions_pre_post['bbl']), \
# len(bbl_cleaned['bbl']), \
# # 33766 should be the number we aim for common bbls

In [87]:
# easy cleaning without functions:
# cast each table's bbl column to str and trim surrounding whitespace
for frame in (evictions_pre_post, bbl_cleaned):
    frame['bbl'] = frame['bbl'].astype(str).str.strip()

In [89]:
# Key overlap once more, now on the whitespace-stripped string keys.
eviction_bbls_std = set(evictions_pre_post['bbl'])
building_bbls_std = set(bbl_cleaned['bbl'])
common_bbls_std = eviction_bbls_std & building_bbls_std
print(f"number of common bbl after standardization: {len(common_bbls_std)}")

number of common bbl after standardization: 32379


In [107]:
bbl_df.postcode.nunique(), evictions_pre_post.eviction_postcode.nunique()

(182, 204)

In [100]:
bbl_df.postcode.dtype, evictions_pre_post.eviction_postcode.dtype, \
bbl_df.postcode.isna().sum(), evictions_pre_post.eviction_postcode.isna().sum()

(dtype('float64'), dtype('int64'), np.int64(44), np.int64(0))

### **We could also merge on zip codes, but bbl is the better merge key: it is a more granular unit of the records, and merging on it keeps more records. After the merge we still have the zip code column from the evictions_pre_post dataframe, so it is safe to ignore the NaN zip codes in bbl_df. See the next few cells (before the merge).**

In [101]:
bbl_df_copy = bbl_df.copy()

In [102]:
bbl_df_copy = bbl_df_copy.dropna(subset=['postcode'])

In [104]:
# NOTE(review): this repeats the previous cell. Dropping rows with a missing
# postcode a second time removes nothing, so this cell is a no-op.
bbl_df_copy = bbl_df_copy.dropna(subset=['postcode'])

In [109]:
# Compare the building table with and without rows that lack a postcode.
# The last element checks that bbl_cleaned (no postcode filter) keeps more
# buildings than bbl_df_copy; it is computed from the frames rather than from
# hand-typed row counts.
bbl_df_copy.shape, bbl_cleaned.shape, \
len(bbl_cleaned) > len(bbl_df_copy)

((752575, 97), (752619, 19), True)

In [112]:
# Attach building attributes to every eviction record.
#   how='left'              keeps every eviction row, matched or not; unmatched
#                           rows get NaN in the building columns.
#   validate='many_to_one'  makes pandas raise a MergeError if a bbl occurs more
#                           than once in the building table, which would
#                           otherwise silently duplicate eviction rows.
cleaned_merge = pd.merge(
    evictions_pre_post,
    bbl_cleaned[['bbl', 'yearbuilt', 'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea','building_type', 'building_category',
            'is_condo', 'floor_category','rent_era', 'architectural_style', 'economic_period', 'residential_units_category', 'is_llc',
                 'building_size_category', 'size_quartile', 'decade']],
    on='bbl',
    how='left',
    validate='many_to_one')

In [115]:
# Sanity check on the merge: non-null building_type count, then the row counts
# of the merged frame, the building table and the evictions table.
(
    f"number of non null building_type values in cleaned merge: {cleaned_merge['building_type'].count()}",
    len(cleaned_merge),
    len(bbl_cleaned),
    len(evictions_pre_post),
)
# there are way more buildings than the ones that had the evictions as expected
# and the merged df keeps the same records as the eviction ones.

('number of non null building_type values in cleaned merge: 72551',
 76715,
 752619,
 76715)

In [119]:
display(cleaned_merge.head())
# shows all the columns

Unnamed: 0,primary_key,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,year,month_year,geometry,average_year_eviction_count,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade
0,*308072/22_5865,*308072/22,5865,356 MILLER AVE,1 AND BASEMENT,2024-12-04,BROOKLYN,11207,Not an Ejectment,Possession,40.672121,-73.891105,5.0,37.0,1152.0,3083989,3037420029,East New York,2024,2024-12,POINT (-73.891105 40.672121),0.8,1930.0,C0,3.0,3.0,356 MILLER LLC,2700.0,pre-war,walk-up,False,low-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","1930-1945, great depression and WWII",3-5 units,True,small,Q3 (50-75%),1930-1939
1,*313639/23_5202,*313639/23,5202,710 61ST STREET,2ND FLOOR,2024-03-04,BROOKLYN,11220,Not an Ejectment,Possession,40.635941,-74.011883,7.0,38.0,118.0,3143881,3057940012,Sunset Park East,2024,2024-03,POINT (-74.011883 40.635941),0.6,1920.0,B2,2.0,2.0,"A.R.M. PARKING, LLC",1204.0,pre-war,two-family,False,low-rise,"Pre-1947, pre-rent-control","1900–1920, Beaux-Arts","Pre-1929, pre-great depression",2-unit,True,very small,Q1 (smallest 25%),1920-1929
2,*324973/22_5308,*324973/22,5308,462 60TH STREET,FOURTH FLOOR APT AKA,2024-08-13,BROOKLYN,11220,Not an Ejectment,Possession,40.640008,-74.017068,7.0,38.0,122.0,3143435,3057820030,Sunset Park West,2024,2024-08,POINT (-74.017068 40.640008),0.6,1907.0,C3,4.0,4.0,"LIN, RONG LAN",4800.0,pre-war,walk-up,False,mid-rise,"Pre-1947, pre-rent-control","1900–1920, Beaux-Arts","Pre-1929, pre-great depression",3-5 units,False,medium-small,Q4 (largest 25%),1900-1909
3,*53336/16_170279,*53336/16,170279,3400 PAUL AVENUE,15D,2018-10-17,BRONX,10468,Not an Ejectment,Possession,40.87719,-73.889569,7.0,11.0,409.0,2015444,2032510420,Van Cortlandt Village,2018,2018-10,POINT (-73.889569 40.87719),0.8,1967.0,D4,21.0,352.0,SCOTT TOWER HOUSING CO INC,381213.0,post-war,condo-co-op,True,high-rise,"1947–1969, rent-control","1951–1980, the International Style, Alternativ...","1946–1975, pst war economic boom",100+ units,False,mega,Q4 (largest 25%),1960-1969
4,*5990/17_2703,*5990/17,2703,480 CONCORD AVENUE,4E,2019-08-30,BRONX,10455,Not an Ejectment,Possession,40.811197,-73.90881,1.0,8.0,35.0,2003900,2025770038,Mott Haven-Port Morris,2019,2019-08,POINT (-73.90881 40.811197),1.6,1928.0,D7,6.0,65.0,480 CONCORD AVE OWNER LLC,69102.0,pre-war,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,True,very large,Q4 (largest 25%),1920-1929


In [133]:
cleaned_merge.shape

(76715, 40)

In [122]:
# check nan:
# Share of merged records whose building_type is missing after the merge.
# The count is taken from the frame instead of a hand-typed number, so the
# ratio stays correct if the inputs change. (Per-column NaN counts are shown
# by the next cell.)
missing_building_rows = cleaned_merge['building_type'].isna().sum()
float(missing_building_rows / cleaned_merge.shape[0])
# on the saved run: 4164 / 76715, i.e. 5.4278% of the records have nans

0.054278824219513785

In [126]:
cleaned_merge.isna().sum().sort_values(ascending=False)

Unnamed: 0,0
rent_era,4164
architectural_style,4164
economic_period,4164
residential_units_category,4164
is_llc,4164
building_size_category,4164
yearbuilt,4164
bldgclass,4164
size_quartile,4164
decade,4164


## **At this stage, we keep the NaN values in the building columns that came in through the merge with bbl.csv. For analyses that use building features, we will drop the rows with NaNs. For analyses of evictions alone, or of evictions with other relevant data, we will work only with the columns that contain no NaNs.**

In [127]:
cleaned_merge.duplicated().sum()
# no duplicates

np.int64(0)

In [128]:
cleaned_merge.to_csv('/content/drive/My Drive/X999/bbl_evictions_merged.csv', index=False)

In [129]:
# NOTE: this rebinds file_path1 (earlier the raw evictions CSV) to the merged
# file written by the previous cell; re-running the raw-load cell after this
# point would read the merged file instead.
file_path1 = '/content/drive/My Drive/X999/bbl_evictions_merged.csv'

In [130]:
# Read the merged file back to check the round trip.
# NOTE(review): read_csv infers dtypes, so bbl (a string column when written)
# presumably comes back as int64 -- confirm before joining on it again.
df = pd.read_csv(file_path1)

In [134]:
df.columns, df.shape
# correct

(Index(['primary_key', 'court_index_number', 'docket_number',
        'eviction_address', 'eviction_apartment_number', 'executed_date',
        'borough', 'eviction_postcode', 'ejectment',
        'eviction/legal_possession', 'latitude', 'longitude', 'community_board',
        'council_district', 'census_tract', 'bin', 'bbl', 'nta', 'year',
        'month_year', 'geometry', 'average_year_eviction_count', 'yearbuilt',
        'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea',
        'building_type', 'building_category', 'is_condo', 'floor_category',
        'rent_era', 'architectural_style', 'economic_period',
        'residential_units_category', 'is_llc', 'building_size_category',
        'size_quartile', 'decade'],
       dtype='object'),
 (76715, 40))