In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import os
import io
import geopandas as gpd
import seaborn as sns

# suppress warning
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# get them back from the cloud
# the reduced version should only have 10 columns
saved_2017 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2017_reduced.csv"
saved_2018 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2018_reduced.csv"
saved_2019 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2019_reduced.csv"
saved_2020 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2020_reduced.csv"
saved_2021 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2021_reduced.csv"
saved_2022 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2022_reduced.csv"
saved_2023 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2023_reduced.csv"
saved_2024 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2024_reduced.csv"


In [None]:
df_2017 = pd.read_csv(saved_2017)
df_2018 = pd.read_csv(saved_2018)
df_2019 = pd.read_csv(saved_2019)
df_2020 = pd.read_csv(saved_2020)
df_2021 = pd.read_csv(saved_2021)
df_2022 = pd.read_csv(saved_2022)
df_2023 = pd.read_csv(saved_2023)
df_2024 = pd.read_csv(saved_2024)

In [None]:
df_2017.shape, df_2018.shape, df_2019.shape, df_2020.shape, df_2021.shape, df_2022.shape, df_2023.shape, df_2024.shape

((1032946, 10),
 (1066249, 10),
 (996996, 10),
 (1263463, 10),
 (1355688, 10),
 (1433295, 10),
 (1412135, 10),
 (1527906, 10))

In [None]:
covid_311_df = pd.concat([df_2020, df_2021, df_2022])

In [None]:
covid_311_df.shape, covid_311_df.columns

((4052446, 10),
 Index(['unique_key', 'created_date', 'closed_date', 'complaint_type',
        'incident_zip', 'incident_address', 'bbl', 'borough', 'latitude',
        'longitude'],
       dtype='object'))

In [None]:
covid_311_df.bbl[0].dtype

dtype('int64')

In [None]:
covid_311_df.bbl = covid_311_df.bbl.astype('int64')

In [None]:
saved_bbl_evictions = "/content/drive/My Drive/X999/bbl_evictions_merged_covid.csv"

In [None]:
bbl_evictions = pd.read_csv(saved_bbl_evictions)

In [None]:
bbl_evictions.shape

(6338, 40)

In [None]:
bbl_evictions.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough',
       'eviction_postcode', 'ejectment', 'eviction/legal_possession',
       'latitude', 'longitude', 'community_board', 'council_district',
       'census_tract', 'bin', 'bbl', 'nta', 'geometry', 'eviction_count',
       'year', 'average_year_eviction_count', 'bbl_clean', 'yearbuilt',
       'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea',
       'building_type', 'building_category', 'is_condo', 'floor_category',
       'rent_era', 'architectural_style', 'economic_period',
       'residential_units_category', 'is_llc', 'building_size_category',
       'size_quartile', 'decade'],
      dtype='object')

In [None]:
bbl_evictions.bbl_clean[0].dtype, covid_311_df.bbl[0].dtype, covid_311_df.bbl[0]

(dtype('int64'),
 dtype('int64'),
 0    2029820027
 0    2027030018
 0    1021650063
 Name: bbl, dtype: int64)

In [None]:
covid_311_df.bbl = covid_311_df.bbl.astype('int64')

In [None]:
covid_311_df.bbl[0].dtype

dtype('int64')

In [None]:
covid_311_df.bbl, bbl_evictions.bbl_clean

(0          2029820027
 1          1015290018
 2          2031990003
 3          1004530034
 4          1022217501
               ...    
 1433290    3082240022
 1433291    1018240016
 1433292    2024550014
 1433293    1018810032
 1433294    3068667501
 Name: bbl, Length: 4052446, dtype: int64,
 0       2028200035
 1       1016447502
 2       4031500075
 3       2037760056
 4       4007200032
            ...    
 6333    3074220101
 6334    3050990021
 6335    3072800188
 6336    3088150042
 6337    3051680049
 Name: bbl_clean, Length: 6338, dtype: int64)

In [None]:
# pandas.Series.isin() returns a boolean mask of values in the first series that also appear in the second
overlap_mask = bbl_evictions.bbl_clean.isin(covid_311_df.bbl)
overlapping_values = bbl_evictions.bbl_clean[overlap_mask]
num_overlaps = overlap_mask.sum()
num_overlaps
# pretty good, given the bbl_evictions_df.shape is (6338, 40)

np.int64(5602)

In [None]:
covid_311_df.head()

Unnamed: 0,unique_key,created_date,closed_date,complaint_type,incident_zip,incident_address,bbl,borough,latitude,longitude
0,48538697,2020-12-31 23:59:55,2021-01-01 01:07:04,Noise - Vehicle,10460.0,1569 HOE AVENUE,2029820027,BRONX,40.83582,-73.887516
1,48536596,2020-12-31 23:59:28,2021-01-01 01:33:12,Noise - Residential,10028.0,235 EAST 83 STREET,1015290018,MANHATTAN,40.776503,-73.954525
2,48536500,2020-12-31 23:58:55,2021-01-01 00:24:54,Noise - Residential,10468.0,2380 GRAND AVENUE,2031990003,BRONX,40.861553,-73.904168
3,48542024,2020-12-31 23:58:45,2021-01-14 16:49:17,Noise - Helicopter,10003.0,195 1 AVENUE,1004530034,MANHATTAN,40.729916,-73.983616
4,48543542,2020-12-31 23:58:39,2021-01-01 00:13:47,Noise - Residential,10034.0,571 ACADEMY STREET,1022217501,MANHATTAN,40.863565,-73.923221


### 68.66% of the buildings that had evictions also had 311 complaints

In [None]:
# or use sets, but this only shows the unique overlapped bbl numbers
# (faster for large datasets)
set1 = set(bbl_evictions.bbl_clean)
set2 = set(covid_311_df.bbl)
unique_overlap = set1.intersection(set2)
len(unique_overlap), 4352/6338 # 68.67 evictions during covid had 311 complaints

(4352, 0.6866519406752919)

In [None]:
# numpy.intersect1d()
overlapping_values = np.intersect1d(bbl_evictions.bbl_clean, covid_311_df.bbl)
len(overlapping_values)

4352

In [None]:
bbl_evictions.head()

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,...,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade
0,35484/19,494073,184 MT. EDEN PARKWAY,UNIT 1D,2020-01-02,BRONX,10457,Not an Ejectment,Possession,40.842943,...,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,True,large,Q4 (largest 25%),1920-1929
1,251388/19,117473,160 EAST 117 STREET,3-B,2020-01-02,MANHATTAN,10035,Not an Ejectment,Possession,40.799094,...,,,,,,,,,,
2,66822/19,25733,66-07 ALDERTON ST,unknown,2020-01-02,QUEENS,11374,Not an Ejectment,Possession,40.719316,...,False,low-rise,"1994–Present, vacancy decontrol","1981–2000, Post-Modernism","1991–2008, modern economic growth",3-5 units,True,small,Q4 (largest 25%),1990-1999
3,68501/18,91505,1245 STRATFORD AVE,D12,2020-01-02,BRONX,10472,Not an Ejectment,Possession,40.830623,...,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,True,very large,Q4 (largest 25%),1920-1929
4,68498/19,26147,28-16 47TH STREET,1-L,2020-01-02,QUEENS,11103,Not an Ejectment,Possession,40.76239,...,False,low-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",6-20 units,True,medium,Q4 (largest 25%),1920-1929


### Buildings that received most frequent complaints

In [None]:
complaint_counts_by_bbl = covid_311_df.groupby('bbl').size().reset_index(name='complaint_count')
complaint_counts_by_bbl = complaint_counts_by_bbl.sort_values('complaint_count', ascending=False)

In [None]:
complaint_counts_by_bbl.head(1)

Unnamed: 0,bbl,complaint_count
64307,2048330080,115560


In [5]:
saved_bbl = "/content/drive/My Drive/X999/bbl_cleaned.csv"

In [8]:
bbl_df = pd.read_csv(saved_bbl)

In [10]:
building_info = bbl_df[bbl_df['bbl'] == 2048330080]
building_info[['numfloors', 'unitsres', 'yearbuilt', 'building_size_category']]

Unnamed: 0,numfloors,unitsres,yearbuilt,building_size_category
59543,2.0,2.0,1920.0,medium-small


In [11]:
display(building_info)

Unnamed: 0,borough,block,lot,community board,census tract 2010,cb2010,schooldist,council district,postcode,firecomp,...,floor_category,building_age,decade,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile
59543,BX,4833,80,212.0,420.0,3000.0,11.0,12.0,10466.0,E063,...,low-rise,105.0,1920-1929,"Pre-1947, pre-rent-control","1900–1920, Beaux-Arts","Pre-1929, pre-great depression",2-unit,False,medium-small,Q4 (largest 25%)


Only 2 units 2-floor (possibly previously factory building illegally expanded or renovated into residential building), about top 25% large building (size-wise).

In [None]:
complaint_stats = complaint_counts_by_bbl['complaint_count'].describe()
complaint_stats

Unnamed: 0,complaint_count
count,282098.0
mean,14.365384
std,259.999719
min,1.0
25%,1.0
50%,2.0
75%,7.0
max,115560.0


In [None]:
additional_stats = {
    'median': f"{complaint_counts_by_bbl['complaint_count'].median():.0f}",
    'mode': f"{complaint_counts_by_bbl['complaint_count'].mode()[0]:.0f}",
    'range': f"{complaint_counts_by_bbl['complaint_count'].max() - complaint_counts_by_bbl['complaint_count'].min():.0f}",
    'iqr': f"{complaint_counts_by_bbl['complaint_count'].quantile(0.75) - complaint_counts_by_bbl['complaint_count'].quantile(0.25):.0f}",
    'skew': f"{complaint_counts_by_bbl['complaint_count'].skew():.2f}",
    'kurtosis': f"{complaint_counts_by_bbl['complaint_count'].kurtosis():.2f}",
    'sum': f"{complaint_counts_by_bbl['complaint_count'].sum():.0f}",
    'variance': f"{complaint_counts_by_bbl['complaint_count'].var():.0f}"
}
additional_stats

{'median': '2',
 'mode': '1',
 'range': '115559',
 'iqr': '6',
 'skew': '359.40',
 'kurtosis': '148949.83',
 'sum': '4052446',
 'variance': '67600'}

In [None]:
top_complaint_buildings = complaint_counts_by_bbl.head(400)
top_complaint_buildings

Unnamed: 0,bbl,complaint_count
64307,2048330080,115560
64293,2048330028,60661
221106,4068290001,17739
64308,2048330083,16570
46449,2031430130,15514
...,...,...
237830,4098510052,634
49627,2033040035,630
32200,1021270015,630
42542,2028790263,629


In [None]:
bbl_evictions.bbl.dtype, bbl_evictions.bbl_clean.dtype, covid_311_df.bbl.dtype

(dtype('float64'), dtype('int64'), dtype('int64'))

In [None]:
bbl_evictions = bbl_evictions.drop(columns=['bbl'])
bbl_evictions = bbl_evictions.rename(columns={'bbl_clean': 'bbl'})

In [None]:
bbl_evictions['bbl'] = bbl_evictions['bbl'].astype('int64')
covid_311_df['bbl'] = covid_311_df['bbl'].astype('int64')

In [None]:
complaints_by_bbl = covid_311_df.groupby('bbl').size().reset_index(name='complaint_count')

In [None]:
complaints_by_bbl.head()

Unnamed: 0,bbl,complaint_count
0,0,459
1,1000010010,12
2,1000010101,5
3,1000010201,1
4,1000020001,10


In [None]:
bbl_evcitions_311_df = pd.merge(
    bbl_evictions,
    complaints_by_bbl,
    on='bbl',
    how='left'
)

In [None]:
bbl_evcitions_311_df.head()

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,...,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade,complaint_count
0,35484/19,494073,184 MT. EDEN PARKWAY,UNIT 1D,2020-01-02,BRONX,10457,Not an Ejectment,Possession,40.842943,...,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,True,large,Q4 (largest 25%),1920-1929,21.0
1,251388/19,117473,160 EAST 117 STREET,3-B,2020-01-02,MANHATTAN,10035,Not an Ejectment,Possession,40.799094,...,,,,,,,,,,7.0
2,66822/19,25733,66-07 ALDERTON ST,unknown,2020-01-02,QUEENS,11374,Not an Ejectment,Possession,40.719316,...,low-rise,"1994–Present, vacancy decontrol","1981–2000, Post-Modernism","1991–2008, modern economic growth",3-5 units,True,small,Q4 (largest 25%),1990-1999,2.0
3,68501/18,91505,1245 STRATFORD AVE,D12,2020-01-02,BRONX,10472,Not an Ejectment,Possession,40.830623,...,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,True,very large,Q4 (largest 25%),1920-1929,346.0
4,68498/19,26147,28-16 47TH STREET,1-L,2020-01-02,QUEENS,11103,Not an Ejectment,Possession,40.76239,...,low-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",6-20 units,True,medium,Q4 (largest 25%),1920-1929,1.0


In [None]:
nan_count = bbl_evcitions_311_df['complaint_count'].isna().sum()
nan_count, \
print(f"percentage of rows with missing complaint_count: {nan_count / len(bbl_evcitions_311_df) * 100:.2f}%")
# about 6.12 of buildings do not have any 311 complaints in those five years.
# safe to ignore them and just fillna

percentage of rows with missing complaint_count: 11.61%


(np.int64(736), None)

In [None]:
bbl_evcitions_311_df['complaint_count'] = bbl_evcitions_311_df['complaint_count'].fillna(0)
# fill the nan cells with 0

In [None]:
# correlation between eviction counts and complaint counts
eviction_complaint_corr = bbl_evcitions_311_df.groupby('bbl').agg({
    'court_index_number': 'count',
    'complaint_count': 'first'
}).corr()
print("Correlation between evictions and complaints:")
print(eviction_complaint_corr)

Correlation between evictions and complaints:
                    court_index_number  complaint_count
court_index_number            1.000000         0.115233
complaint_count               0.115233         1.000000


In [None]:
bbl_evcitions_311_df.to_csv('/content/drive/My Drive/X999/bbl_evcitions_311_df_groupby_bbl_covid.csv', index=False)

## To preserve all the complaint type details and find their correlations between the complaint types and the number of evictions, we need to do a simple merge that will result with most details.

In [None]:
covid_311_df.shape, bbl_evictions.shape

((4052446, 10), (6338, 39))

In [None]:
merged_df = covid_311_df.merge(bbl_evictions, on='bbl', how='inner')

In [None]:
# too large, decided not to save
# merged_df.to_csv('/content/drive/My Drive/X999/bbl_evcitions_311_df_outside_covid.csv', index=False)

In [None]:
merged_df.shape

(1154173, 48)

In [None]:
evictions_per_bbl = merged_df.groupby('bbl')['court_index_number'].nunique().reset_index(name='eviction_count')
evictions_per_bbl

Unnamed: 0,bbl,eviction_count
0,1000160015,1
1,1000160100,3
2,1000167501,1
3,1000167511,1
4,1000170029,2
...,...,...
4347,5073560241,1
4348,5075900054,1
4349,5079080030,1
4350,5079140063,1


In [None]:
# create counts of each complaint type per bbl
complaint_types = merged_df[['bbl', 'complaint_type']].copy()
complaint_dummies = pd.get_dummies(complaint_types, columns=['complaint_type'])
complaint_counts = complaint_dummies.groupby('bbl').sum()

In [None]:
analysis_df = evictions_per_bbl.set_index('bbl').join(complaint_counts)

In [None]:
dict_of_correlations = {}
for column in complaint_counts.columns:
    correlation = analysis_df['eviction_count'].corr(analysis_df[column])
    dict_of_correlations[column.replace('complaint_type_', '')] = correlation

In [None]:
sorted_correlations = dict(sorted(dict_of_correlations.items(), key=lambda item: item[1], reverse=True))
for complaint_type, correlation in sorted_correlations.items():
    print(f"Complaint type: {complaint_type}, correlation with evictions: {correlation:.3f}")

Complaint type: FLOORING/STAIRS, correlation with evictions: 0.656
Complaint type: ELECTRIC, correlation with evictions: 0.644
Complaint type: PAINT/PLASTER, correlation with evictions: 0.643
Complaint type: UNSANITARY CONDITION, correlation with evictions: 0.632
Complaint type: GENERAL, correlation with evictions: 0.632
Complaint type: DOOR/WINDOW, correlation with evictions: 0.618
Complaint type: WATER LEAK, correlation with evictions: 0.610
Complaint type: Rodent, correlation with evictions: 0.606
Complaint type: Indoor Air Quality, correlation with evictions: 0.599
Complaint type: SAFETY, correlation with evictions: 0.578
Complaint type: Animal-Abuse, correlation with evictions: 0.566
Complaint type: General Construction/Plumbing, correlation with evictions: 0.562
Complaint type: PLUMBING, correlation with evictions: 0.538
Complaint type: Elevator, correlation with evictions: 0.537
Complaint type: Illegal Fireworks, correlation with evictions: 0.537
Complaint type: OUTSIDE BUILDING

## Compare these findings with the normal time correlations:

**Similarities**

**Building infrastructure issues** still dominate the top correlations in both datasets:

FLOORING/STAIRS remains at the top (0.656 now vs. 0.732 previously)\
ELECTRIC (0.644 now vs. 0.696)\
PAINT/PLASTER (0.643 now vs. 0.692)\
DOOR/WINDOW (0.618 now vs. 0.707)


**Indoor living conditions** continue to show strong correlation with evictions, reinforcing the connection between habitability concerns and displacement.

**Differences:**

**Rodent complaints** show a much stronger correlation in this dataset:

Now at 0.606 (in the top 10) vs. only 0.219 in the non-covid dataset
This suggests a potentially stronger relationship between pest infestations and evictions in this sample


**Noise residential** shows a much weaker correlation:

Now at 0.255 vs. 0.603 in the previous dataset
This is a substantial drop for a common tenant complaint


**General Construction/Plumbing** has a lower correlation:

Covid at 0.562 vs. 0.688 previously


New entries in this dataset include during covid:

COVID-19 Non-essential Construction (0.274)
Homeless Street Condition (0.096)

In summary, the correlations generally appear slightly lower in this covid dataset, with the top correlation at 0.656 compared to 0.732 in the previous data.