In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import os
import io
import geopandas as gpd
import seaborn as sns

# suppress warning
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [36]:
# get them back from the cloud
# the reduced version should only have 10 columns
saved_2017 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2017_reduced.csv"
saved_2018 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2018_reduced.csv"
saved_2019 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2019_reduced.csv"
saved_2020 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2020_reduced.csv"
saved_2021 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2021_reduced.csv"
saved_2022 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2022_reduced.csv"
saved_2023 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2023_reduced.csv"
saved_2024 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2024_reduced.csv"


In [37]:
df_2017 = pd.read_csv(saved_2017)
df_2018 = pd.read_csv(saved_2018)
df_2019 = pd.read_csv(saved_2019)
df_2020 = pd.read_csv(saved_2020)
df_2021 = pd.read_csv(saved_2021)
df_2022 = pd.read_csv(saved_2022)
df_2023 = pd.read_csv(saved_2023)
df_2024 = pd.read_csv(saved_2024)

In [38]:
df_2017.shape, df_2018.shape, df_2019.shape, df_2020.shape, df_2021.shape, df_2022.shape, df_2023.shape, df_2024.shape

((1032946, 10),
 (1066249, 10),
 (996996, 10),
 (1263463, 10),
 (1355688, 10),
 (1433295, 10),
 (1412135, 10),
 (1527906, 10))

In [68]:
all_years_df = pd.concat([df_2017, df_2018, df_2019, df_2023, df_2024])

In [69]:
covid_311_df = pd.concat([df_2020, df_2021, df_2022])

In [70]:
all_years_df.shape, all_years_df.columns

((6036232, 10),
 Index(['unique_key', 'created_date', 'closed_date', 'complaint_type',
        'incident_zip', 'incident_address', 'bbl', 'borough', 'latitude',
        'longitude'],
       dtype='object'))

In [71]:
all_years_df.bbl[0].dtype

dtype('float64')

In [72]:
all_years_df.bbl = all_years_df.bbl.astype('int64')

In [43]:
saved_bbl_evictions = "/content/drive/My Drive/X999/bbl_evictions_merged.csv"

In [44]:
bbl_evictions = pd.read_csv(saved_bbl_evictions)

In [45]:
bbl_evictions.shape

(74082, 40)

In [46]:
bbl_evictions.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough',
       'eviction_postcode', 'ejectment', 'eviction/legal_possession',
       'latitude', 'longitude', 'community_board', 'council_district',
       'census_tract', 'bin', 'bbl', 'nta', 'geometry', 'eviction_count',
       'year', 'average_year_eviction_count', 'bbl_clean', 'yearbuilt',
       'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea',
       'building_type', 'building_category', 'is_condo', 'floor_category',
       'rent_era', 'architectural_style', 'economic_period',
       'residential_units_category', 'is_llc', 'building_size_category',
       'size_quartile', 'decade'],
      dtype='object')

In [73]:
bbl_evictions.bbl_clean[0].dtype, all_years_df.bbl[0].dtype, all_years_df.bbl[0]

(dtype('int64'),
 dtype('int64'),
 0    1019200007
 0    2024830015
 0    3088440010
 0    4021460028
 0    1020570056
 Name: bbl, dtype: int64)

In [74]:
all_years_df.bbl = all_years_df.bbl.astype('int64')

In [75]:
all_years_df.bbl[0].dtype

dtype('int64')

In [76]:
all_years_df.bbl, bbl_evictions.bbl_clean

(0          1019200007
 1          1017540155
 2          2050540041
 3          3066870049
 4          3050860041
               ...    
 1527901    4090210002
 1527902    4013110066
 1527903    4091250055
 1527904    3043710001
 1527905    2032367501
 Name: bbl, Length: 6036232, dtype: int64,
 0        2047200001
 1        2029990111
 2        1016160001
 3        3046980037
 4        2026110033
             ...    
 74077    1020510138
 74078    1010420061
 74079    3013810057
 74080    1002060004
 74081    1021390329
 Name: bbl_clean, Length: 74082, dtype: int64)

In [77]:
# pandas.Series.isin() returns a boolean mask of values in the first series that also appear in the second
overlap_mask = bbl_evictions.bbl_clean.isin(all_years_df.bbl)
overlapping_values = bbl_evictions.bbl_clean[overlap_mask]
num_overlaps = overlap_mask.sum()
num_overlaps
# pretty good, given the bbl_evictions_df.shape is (74082, 40)
# (71149 / 74082)) # 96%

np.int64(69551)

In [78]:
all_years_df.head()

Unnamed: 0,unique_key,created_date,closed_date,complaint_type,incident_zip,incident_address,bbl,borough,latitude,longitude
0,38070156,2017-12-31 23:59:35,2018-01-04 19:27:02,HEAT/HOT WATER,10030,181 WEST 135 STREET,1019200007,MANHATTAN,41,-74
1,38067146,2017-12-31 23:59:34,2018-01-01 00:57:19,Noise - Residential,10035,2048 MADISON AVENUE,1017540155,MANHATTAN,41,-74
2,38066214,2017-12-31 23:59:15,2018-01-01 02:48:23,Noise - Residential,10466,1902 NEREID AVENUE,2050540041,BRONX,41,-74
3,38067041,2017-12-31 23:58:38,2018-01-01 02:53:28,Noise - Street/Sidewalk,11230,1201 AVENUE H,3066870049,BROOKLYN,41,-74
4,38068229,2017-12-31 23:58:33,2018-01-08 13:30:58,HEAT/HOT WATER,11226,70 LINDEN BOULEVARD,3050860041,BROOKLYN,41,-74


### 96% of the buildings that had evictions also had 311 complaints

In [79]:
# or use sets, but this only shows the unique overlapped bbl numbers
# (faster for large datasets)
set1 = set(bbl_evictions.bbl_clean)
set2 = set(all_years_df.bbl)
unique_overlap = set1.intersection(set2)
len(unique_overlap)

29168

In [80]:
# numpy.intersect1d()
overlapping_values = np.intersect1d(bbl_evictions.bbl_clean, all_years_df.bbl)
len(overlapping_values)

29168

In [81]:
bbl_evictions.head()

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,...,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade
0,34859/16,53416,3476 SEYMOUR AVENUE,3-B,2017-01-03,BRONX,10469,Not an Ejectment,Possession,41,...,False,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",100+ units,True,mega,Q4 (largest 25%),1930-1939
1,B57808/16,74242,1426 BRYANT AVENUE,10 AKA 2ND FL UNIT,2017-01-03,BRONX,10459,Not an Ejectment,Possession,41,...,False,low-rise,"1994–Present, vacancy decontrol","1981–2000, Post-Modernism","1991–2008, modern economic growth",2-unit,False,small,Q3 (50-75%),1990-1999
2,N069212/14,355977,1309 5TH AVENUE,24H,2017-01-03,MANHATTAN,10029,Not an Ejectment,Possession,41,...,False,high-rise,"1970–1993, deregularization","1951–1980, the International Style, Alternativ...","1946–1975, pst war economic boom",100+ units,False,mega,Q4 (largest 25%),1970-1979
3,K065455/16,367441,458 EAST 51 STREET,6A,2017-01-03,BROOKLYN,11203,Not an Ejectment,Possession,41,...,False,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",21-100 units,True,very large,Q4 (largest 25%),1940-1949
4,33992/16,458984,580 EAST 168TH STREE T,*,2017-01-03,BRONX,10456,Not an Ejectment,Possession,41,...,,,,,,,,,,


### Buildings that received most frequent complaints

In [82]:
complaint_counts_by_bbl = all_years_df.groupby('bbl').size().reset_index(name='complaint_count')
complaint_counts_by_bbl = complaint_counts_by_bbl.sort_values('complaint_count', ascending=False)

In [84]:
complaint_counts_by_bbl.head(1)

Unnamed: 0,bbl,complaint_count
71563,2048330028,56960


A multi family 2 floor building in the Bronx

In [85]:
complaint_stats = complaint_counts_by_bbl['complaint_count'].describe()
complaint_stats

Unnamed: 0,complaint_count
count,342961
mean,18
std,132
min,1
25%,1
50%,2
75%,8
max,56960


In [88]:
additional_stats = {
    'median': f"{complaint_counts_by_bbl['complaint_count'].median():.0f}",
    'mode': f"{complaint_counts_by_bbl['complaint_count'].mode()[0]:.0f}",
    'range': f"{complaint_counts_by_bbl['complaint_count'].max() - complaint_counts_by_bbl['complaint_count'].min():.0f}",
    'iqr': f"{complaint_counts_by_bbl['complaint_count'].quantile(0.75) - complaint_counts_by_bbl['complaint_count'].quantile(0.25):.0f}",
    'skew': f"{complaint_counts_by_bbl['complaint_count'].skew():.2f}",
    'kurtosis': f"{complaint_counts_by_bbl['complaint_count'].kurtosis():.2f}",
    'sum': f"{complaint_counts_by_bbl['complaint_count'].sum():.0f}",
    'variance': f"{complaint_counts_by_bbl['complaint_count'].var():.0f}"
}
additional_stats

{'median': '2',
 'mode': '1',
 'range': '56959',
 'iqr': '7',
 'skew': '243.52',
 'kurtosis': '100599.53',
 'sum': '6036232',
 'variance': '17503'}

In [89]:
top_complaint_buildings = complaint_counts_by_bbl.head(400)
top_complaint_buildings

Unnamed: 0,bbl,complaint_count
71563,2048330028,56960
40164,2025110068,10091
226265,4015110001,9836
264459,4068290001,9662
46457,2029020036,8165
...,...,...
51831,2032140030,923
32724,1020360038,923
54547,2033280171,922
35440,1021490093,922


In [92]:
bbl_evictions.bbl.dtype, bbl_evictions.bbl_clean.dtype, all_years_df.bbl.dtype

(dtype('float64'), dtype('int64'), dtype('int64'))

In [93]:
bbl_evictions = bbl_evictions.drop(columns=['bbl'])
bbl_evictions = bbl_evictions.rename(columns={'bbl_clean': 'bbl'})

In [94]:
bbl_evictions['bbl'] = bbl_evictions['bbl'].astype('int64')
all_years_df['bbl'] = all_years_df['bbl'].astype('int64')

In [128]:
complaints_by_bbl = all_years_df.groupby('bbl').size().reset_index(name='complaint_count')

In [129]:
complaints_by_bbl.head()

Unnamed: 0,bbl,complaint_count
0,0,1335
1,144969020,2
2,1000010010,25
3,1000010101,1
4,1000020001,5


In [130]:
bbl_evcitions_311_df = pd.merge(
    bbl_evictions,
    complaints_by_bbl,
    on='bbl',
    how='left'
)

In [131]:
bbl_evcitions_311_df.head()

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,...,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade,complaint_count
0,34859/16,53416,3476 SEYMOUR AVENUE,3-B,2017-01-03,BRONX,10469,Not an Ejectment,Possession,40.88,...,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",100+ units,True,mega,Q4 (largest 25%),1930-1939,191.0
1,B57808/16,74242,1426 BRYANT AVENUE,10 AKA 2ND FL UNIT,2017-01-03,BRONX,10459,Not an Ejectment,Possession,40.83,...,low-rise,"1994–Present, vacancy decontrol","1981–2000, Post-Modernism","1991–2008, modern economic growth",2-unit,False,small,Q3 (50-75%),1990-1999,2.0
2,N069212/14,355977,1309 5TH AVENUE,24H,2017-01-03,MANHATTAN,10029,Not an Ejectment,Possession,40.8,...,high-rise,"1970–1993, deregularization","1951–1980, the International Style, Alternativ...","1946–1975, pst war economic boom",100+ units,False,mega,Q4 (largest 25%),1970-1979,1724.0
3,K065455/16,367441,458 EAST 51 STREET,6A,2017-01-03,BROOKLYN,11203,Not an Ejectment,Possession,40.65,...,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",21-100 units,True,very large,Q4 (largest 25%),1940-1949,224.0
4,33992/16,458984,580 EAST 168TH STREE T,*,2017-01-03,BRONX,10456,Not an Ejectment,Possession,40.83,...,,,,,,,,,,1.0


In [133]:
nan_count = bbl_evcitions_311_df['complaint_count'].isna().sum()
nan_count, \
print(f"percentage of rows with missing complaint_count: {nan_count / len(bbl_evcitions_311_df) * 100:.2f}%")
# about 6.12 of buildings do not have any 311 complaints in those five years.
# safe to ignore them and just fillna

percentage of rows with missing complaint_count: 6.12%


(np.int64(4531), None)

In [134]:
bbl_evcitions_311_df['complaint_count'] = bbl_evcitions_311_df['complaint_count'].fillna(0)
# fill the nan cells with 0

In [104]:
# correlation between eviction counts and complaint counts
eviction_complaint_corr = bbl_evcitions_311_df.groupby('bbl').agg({
    'court_index_number': 'count',
    'complaint_count': 'first'
}).corr()
print("Correlation between evictions and complaints:")
print(eviction_complaint_corr)

Correlation between evictions and complaints:
                    court_index_number  complaint_count
court_index_number                1.00             0.32
complaint_count                   0.32             1.00


In [162]:
bbl_evcitions_311_df.to_csv('/content/drive/My Drive/X999/bbl_evcitions_311_df_groupby_bbl.csv', index=False)

## To preserve all the complaint type details and find their correlations between the complaint types and the number of evictions, we need to do a simple merge that will result with most details.

In [135]:
all_years_df.shape, bbl_evictions.shape

((6036232, 10), (74082, 39))

In [142]:
merged_df = all_years_df.merge(bbl_evictions, on='bbl', how='inner')

In [165]:
# too large, decided not to save
# merged_df.to_csv('/content/drive/My Drive/X999/bbl_evcitions_311_df_outside_covid.csv', index=False)

In [166]:
merged_df.shape

(23687576, 48)

In [157]:
evictions_per_bbl = merged_df.groupby('bbl')['court_index_number'].nunique().reset_index(name='eviction_count')
evictions_per_bbl

Unnamed: 0,bbl,eviction_count
0,0,3
1,1000077501,2
2,1000157501,4
3,1000157502,3
4,1000160015,1
...,...,...
29163,5080220054,1
29164,5080260130,2
29165,5080280074,1
29166,5080460052,1


In [158]:
# create counts of each complaint type per bbl
complaint_types = merged_df[['bbl', 'complaint_type']].copy()
complaint_dummies = pd.get_dummies(complaint_types, columns=['complaint_type'])
complaint_counts = complaint_dummies.groupby('bbl').sum()

In [159]:
analysis_df = evictions_per_bbl.set_index('bbl').join(complaint_counts)

In [160]:
dict_of_correlations = {}
for column in complaint_counts.columns:
    correlation = analysis_df['eviction_count'].corr(analysis_df[column])
    dict_of_correlations[column.replace('complaint_type_', '')] = correlation

In [161]:
sorted_correlations = dict(sorted(dict_of_correlations.items(), key=lambda item: item[1], reverse=True))
for complaint_type, correlation in sorted_correlations.items():
    print(f"Complaint type: {complaint_type}, correlation with evictions: {correlation:.3f}")

Complaint type: FLOORING/STAIRS, correlation with evictions: 0.732
Complaint type: DOOR/WINDOW, correlation with evictions: 0.707
Complaint type: ELECTRIC, correlation with evictions: 0.696
Complaint type: PAINT/PLASTER, correlation with evictions: 0.692
Complaint type: General Construction/Plumbing, correlation with evictions: 0.688
Complaint type: SAFETY, correlation with evictions: 0.677
Complaint type: Animal Abuse, correlation with evictions: 0.670
Complaint type: UNSANITARY CONDITION, correlation with evictions: 0.667
Complaint type: Indoor Air Quality, correlation with evictions: 0.659
Complaint type: GENERAL, correlation with evictions: 0.640
Complaint type: Plumbing, correlation with evictions: 0.623
Complaint type: Animal-Abuse, correlation with evictions: 0.615
Complaint type: Noise, correlation with evictions: 0.611
Complaint type: PLUMBING, correlation with evictions: 0.604
Complaint type: Noise - Residential, correlation with evictions: 0.603
Complaint type: Unsanitary Co