In [3]:
# !pip install geopandas folium matplotlib seaborn scipy
# !pip install esda
# !pip install splot
# # for google colab, had to reinstall some packages.

In [4]:
import pandas as pd
import geopandas as gpd
import numpy as np
import datetime as dt
import scipy

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist

# visualization
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
import seaborn as sns
import folium
from folium.plugins import HeatMap
from folium import Marker
from folium.plugins import MarkerCluster
import plotly.express as px
import plotly.io as pio

# spatial statistics
from esda.moran import Moran
from esda.getisord import G_Local
from libpysal.weights import Queen, Rook

# system and utility
import warnings
import os
import io
from IPython.display import IFrame
from google.colab import files

from libpysal.weights import Queen, Rook
from esda.moran import Moran
import matplotlib.pyplot as plt
from splot.esda import moran_scatterplot

# suppress warnings
warnings.filterwarnings('ignore')

# inline
%matplotlib inline

In [5]:
# Mount Google Drive at /content/drive (Colab-only); the CSV paths defined in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [30]:
# data source: all three input CSVs sit in the same Drive folder
drive_folder = '/content/drive/My Drive/X999'
file_path1 = drive_folder + '/evictions_pre_post_covid.csv'
file_path2 = drive_folder + '/evictions_covid.csv'
file_path3 = drive_folder + '/bbl_cleaned.csv'

In [7]:
# read both eviction extracts (file_path1, then file_path2) into their *_raw frames
evictions_pre_post_raw, evictions_covid_raw = (
    pd.read_csv(csv_path) for csv_path in (file_path1, file_path2)
)

In [8]:
# work on copies so the *_raw frames stay exactly as loaded
evictions_pre_post, evictions_covid = (
    evictions_pre_post_raw.copy(),
    evictions_covid_raw.copy(),
)

In [9]:
# evictions_pre_post.columns, \
# evictions_covid.shape

In [10]:
# load the building table (one row per bbl) from file_path3
bbl = pd.read_csv(file_path3)

In [11]:
# work on a copy so the frame as loaded (bbl) stays untouched
bbl_df = bbl.copy()

In [12]:
# number of columns in the building table
len(bbl_df.columns)
# correct

110

In [13]:
# preview the first four rows of the building table
bbl_df.head(4)

Unnamed: 0,borough,block,lot,community board,census tract 2010,cb2010,schooldist,council district,postcode,firecomp,...,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile
0,BK,8366,222,318.0,696.02,2002.0,22.0,46.0,11234.0,E323,...,single-family,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,very small,Q1 (smallest 25%)
1,SI,2088,7501,502.0,273.01,1003.0,31.0,50.0,10314.0,E166,...,condo-co-op,True,low-rise,"Pre-1947, pre-rent-control",Pre-1900,"Pre-1929, pre-great depression",6-20 units,False,very large,Q4 (largest 25%)
2,QN,15932,7501,414.0,964.0,2015.0,27.0,31.0,11692.0,L121,...,condo-co-op,True,low-rise,"Pre-1947, pre-rent-control",Pre-1900,"Pre-1929, pre-great depression",6-20 units,False,large,Q4 (largest 25%)
3,BK,2571,28,301.0,561.0,1005.0,14.0,33.0,11222.0,L106,...,single-family,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,medium-small,Q4 (largest 25%)


In [14]:
# list(bbl_df.columns)

In [15]:
# list the columns available in the eviction data
evictions_pre_post.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough',
       'eviction_postcode', 'ejectment', 'eviction/legal_possession',
       'latitude', 'longitude', 'community_board', 'council_district',
       'census_tract', 'bin', 'bbl', 'nta', 'geometry', 'eviction_count',
       'year', 'average_year_eviction_count'],
      dtype='object')

In [16]:
# only check the relevant ones
bbl_cleaned = bbl_df[['bbl', 'yearbuilt', 'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea','building_type', 'building_category', 'is_condo', 'floor_category',
           'rent_era', 'architectural_style', 'economic_period',
           'residential_units_category', 'is_llc', 'building_size_category', 'size_quartile']]
bbl_cleaned.head()

Unnamed: 0,bbl,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile
0,3083660222,2019.0,A5,2.0,1.0,"EAST 69 AVENUE N DEVELOPMENT, LLC",1288.0,post-war,single-family,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,very small,Q1 (smallest 25%)
1,5020887501,0.0,R3,0.0,16.0,BAYBERRY WOODS CONDOMINIUM,26400.0,pre-war,condo-co-op,True,low-rise,"Pre-1947, pre-rent-control",Pre-1900,"Pre-1929, pre-great depression",6-20 units,False,very large,Q4 (largest 25%)
2,4159327501,0.0,R3,0.0,18.0,UNAVAILABLE OWNER,16599.0,pre-war,condo-co-op,True,low-rise,"Pre-1947, pre-rent-control",Pre-1900,"Pre-1929, pre-great depression",6-20 units,False,large,Q4 (largest 25%)
3,3025710028,2018.0,A5,3.0,1.0,85 CALYER STREET LLC,3478.0,post-war,single-family,False,low-rise,"1994–Present, vacancy decontrol","2001-present, New Architecture","2009–present, post-financial crisis",single-unit,True,medium-small,Q4 (largest 25%)
4,3015147501,0.0,R2,0.0,12.0,UNAVAILABLE OWNER,11134.0,pre-war,condo-co-op,True,low-rise,"Pre-1947, pre-rent-control",Pre-1900,"Pre-1929, pre-great depression",6-20 units,False,large,Q4 (largest 25%)


In [17]:
# number of rows in the trimmed building table
len(bbl_cleaned)

753110

In [18]:
# compare the type of the first bbl value in each frame before using bbl as a join key
eviction_bbl_type = type(evictions_pre_post['bbl'][0])
building_bbl_type = type(bbl_cleaned['bbl'][0])
eviction_bbl_type, building_bbl_type

(numpy.float64, numpy.int64)

In [19]:
# convert both to strings
evictions_pre_post['bbl'] = evictions_pre_post['bbl'].astype(str)
bbl_cleaned['bbl'] = bbl_cleaned['bbl'].astype(str)
type(evictions_pre_post['bbl'][0]), type(bbl_cleaned['bbl'][0])

(str, str)

In [20]:
# use set to check common ones
eviction_bbls = set(evictions_pre_post['bbl'])
building_bbls = set(bbl_cleaned['bbl'])
common_bbls = eviction_bbls.intersection(building_bbls)
print(f"evictions data unique bbls: {len(eviction_bbls)}")
print(f"building data unqiue bbls: {len(building_bbls)}")
print(f"number of common bbls: {len(common_bbls)}")
print(f"percentage of eviction bbls found in building data: {len(common_bbls)/len(eviction_bbls)*100:.2f}%")
# so, at first, there were no common ones

evictions data unique bbls: 32856
building data unqiue bbls: 753110
number of common bbls: 0
percentage of eviction bbls found in building data: 0.00%


In [21]:
# print the first ten bbl strings of each frame side by side
for label, frame in (("eveiction bbls: ", evictions_pre_post), ("building bbls: ", bbl_cleaned)):
    print(label, frame['bbl'].head(10).tolist())
# building values are plain 10-digit strings,
# while the eviction values carry a trailing ".0" from the float column

eveiction bbls:  ['2047200001.0', '2029990111.0', '1016160001.0', '3046980037.0', '2026110033.0', '4098287501.0', '2030710025.0', '4084430001.0', '5012720011.0', '1010907502.0']
building bbls:  ['3083660222', '5020887501', '4159327501', '3025710028', '3015147501', '3015057501', '5054397501', '4010077501', '3031970008', '5020877501']


In [22]:
# distribution of bbl string lengths in each frame
eviction_bbl_lengths = evictions_pre_post['bbl'].str.len().value_counts()
print("evictions BBL string length:", eviction_bbl_lengths)
building_bbl_lengths = bbl_cleaned['bbl'].str.len().value_counts()
print("building BBL string length:", building_bbl_lengths)
# so, the individual string lengths and the total row counts differ between the two frames

evictions BBL string length: bbl
12    74079
3         3
Name: count, dtype: int64
building BBL string length: bbl
10    753110
Name: count, dtype: int64


In [23]:
def clean_bbl(bbl_val):
    """Normalise a BBL value to a zero-padded 10-digit string.

    Parameters
    ----------
    bbl_val : int, float, str or None
        Raw BBL as found in either frame, e.g. ``3083660222``,
        ``2047200001.0`` or ``'2047200001.0'``.

    Returns
    -------
    str or None
        The first ten digits of the value, left-padded with zeros to length 10.
        ``None`` when the value is missing or contains no digits at all
        (for example the string ``'nan'`` produced by ``astype(str)``).
    """
    if pd.isna(bbl_val):
        return None
    bbl_as_string = str(bbl_val).strip()

    # Drop a float artifact such as ".0" before collecting digits; otherwise the
    # fractional zero is counted as a BBL digit and shifts values shorter than 10 digits.
    whole_part, dot, fraction = bbl_as_string.rpartition('.')
    if dot and fraction.strip('0') == '':
        bbl_as_string = whole_part

    digits_only = "".join(character for character in bbl_as_string if character.isdigit())
    if not digits_only:
        # No digits means there is no BBL; do not invent the key '0000000000'.
        return None
    return digits_only[:10].zfill(10)

In [24]:
# derive the normalised 10-digit key in both frames
evictions_pre_post['bbl_clean'] = evictions_pre_post['bbl'].apply(clean_bbl)
bbl_cleaned['bbl_clean'] = bbl_cleaned['bbl'].apply(clean_bbl)

eviction_bbls_clean = set(evictions_pre_post['bbl_clean'].dropna())
building_bbls_clean = set(bbl_cleaned['bbl_clean'].dropna())
common_bbls_clean = eviction_bbls_clean & building_bbls_clean

# The match ratio is computed from the data rather than typed in by hand,
# so it cannot go stale when the inputs change.
unique_eviction_bbls = len(set(evictions_pre_post['bbl']))
(
    f"number of common BBLs after thorough cleaning: {len(common_bbls_clean)}",
    len(bbl_cleaned),
    len(evictions_pre_post),
    unique_eviction_bbls,
    len(set(bbl_cleaned['bbl'])),
    len(common_bbls_clean) / unique_eviction_bbls,  # pretty good ratio of matched bbl in eviction data
)

('number of common BBLs after thorough cleaning: 31545',
 753110,
 74082,
 32856,
 753110,
 0.9600986121256392)

In [25]:
# distinct cleaned keys in each frame
eviction_bbls_std = set(evictions_pre_post['bbl_clean'])
building_bbls_std = set(bbl_cleaned['bbl_clean'])

# row counts of the raw and cleaned key columns, followed by the distinct-key counts
(
    len(evictions_pre_post['bbl']),
    len(evictions_pre_post['bbl_clean']),
    len(bbl_cleaned['bbl_clean']),
    len(bbl_cleaned['bbl']),
    len(eviction_bbls_std),
    len(building_bbls_std),
)
# 32856 should be the number we aim for common bbls

(74082, 74082, 753110, 753110, 32856, 753110)

In [26]:
# force the cleaned key to str and trim surrounding whitespace in both frames
for frame in (evictions_pre_post, bbl_cleaned):
    frame['bbl_clean'] = frame['bbl_clean'].astype(str).str.strip()

In [27]:
# recompute the key overlap now that both bbl_clean columns are stripped strings
eviction_bbls_std = set(evictions_pre_post['bbl_clean'])
building_bbls_std = set(bbl_cleaned['bbl_clean'])
common_bbls_std = eviction_bbls_std & building_bbls_std
common_count = len(common_bbls_std)
print(f"number of common bbl after standardization: {common_count}")

number of common bbl after standardization: 31545


In [28]:
# join key plus the building attributes to attach to every eviction record
building_columns = [
    'bbl_clean', 'yearbuilt', 'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea',
    'building_type', 'building_category', 'is_condo', 'floor_category',
    'rent_era', 'architectural_style', 'economic_period',
    'residential_units_category', 'is_llc', 'building_size_category', 'size_quartile',
]

# The left join keeps every eviction row. validate='many_to_one' makes pandas raise a
# MergeError if bbl_clean is duplicated on the building side, which would otherwise
# silently multiply eviction rows.
cleaned_merge = pd.merge(
    evictions_pre_post,
    bbl_cleaned[building_columns],
    on='bbl_clean',
    how='left',
    validate='many_to_one',
)

(
    f"number of non null building_type values in cleaned merge: {cleaned_merge['building_type'].notna().sum()}",
    len(cleaned_merge),
    len(bbl_cleaned),
    len(evictions_pre_post),
)
# there are way more buildings than the ones that had the evictions.
# there are way more buildings than the ones that had the evictions.

('number of non null building_type values in cleaned merge: 70312',
 74082,
 753110,
 74082)

In [29]:
display(cleaned_merge.head())
# rich preview of the first rows of the merged frame; the rendered table elides the middle columns with "..."

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,...,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile
0,34859/16,53416,3476 SEYMOUR AVENUE,3-B,2017-01-03,BRONX,10469,Not an Ejectment,Possession,40.87762,...,walk-up,False,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",100+ units,True,mega,Q4 (largest 25%)
1,B57808/16,74242,1426 BRYANT AVENUE,10 AKA 2ND FL UNIT,2017-01-03,BRONX,10459,Not an Ejectment,Possession,40.830691,...,two-family,False,low-rise,"1994–Present, vacancy decontrol","1981–2000, Post-Modernism","1991–2008, modern economic growth",2-unit,False,small,Q3 (50-75%)
2,N069212/14,355977,1309 5TH AVENUE,24H,2017-01-03,MANHATTAN,10029,Not an Ejectment,Possession,40.797309,...,elevator,False,high-rise,"1970–1993, deregularization","1951–1980, the International Style, Alternativ...","1946–1975, pst war economic boom",100+ units,False,mega,Q4 (largest 25%)
3,K065455/16,367441,458 EAST 51 STREET,6A,2017-01-03,BROOKLYN,11203,Not an Ejectment,Possession,40.650624,...,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",21-100 units,True,very large,Q4 (largest 25%)
4,33992/16,458984,580 EAST 168TH STREE T,*,2017-01-03,BRONX,10456,Not an Ejectment,Possession,40.830494,...,,,,,,,,,,


In [31]:
# write the merged eviction + building table to Drive; index=False leaves the row index out of the file
cleaned_merge.to_csv('/content/drive/My Drive/X999/bbl_evictions_merged.csv', index=False)