In [3]:
# !pip install geopandas folium matplotlib seaborn scipy
# !pip install esda
# !pip install splot
# # for google colab, had to reinstall some pacakges.

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import os
import io
import geopandas as gpd
import seaborn as sns

# suppress warning
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
file_path1 = '/content/drive/My Drive/X999/bbl_evictions_merged.csv'

In [6]:
file_path2 = '/content/drive/My Drive/X999/svi_simplified.csv'

In [7]:
bbl_evictions = pd.read_csv(file_path1)

In [8]:
svi_simplified_df = pd.read_csv(file_path2)

In [9]:
bbl_evictions.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough',
       'eviction_postcode', 'ejectment', 'eviction/legal_possession',
       'latitude', 'longitude', 'community_board', 'council_district',
       'census_tract', 'bin', 'bbl', 'nta', 'geometry', 'eviction_count',
       'year', 'average_year_eviction_count', 'bbl_clean', 'yearbuilt',
       'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea',
       'building_type', 'building_category', 'is_condo', 'floor_category',
       'rent_era', 'architectural_style', 'economic_period',
       'residential_units_category', 'is_llc', 'building_size_category',
       'size_quartile', 'decade'],
      dtype='object')

In [10]:
svi_simplified_df.columns

Index(['FIPS', 'E_TOTPOP', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3',
       'RPL_THEME4', 'RPL_THEMES', 'EP_POV150', 'EP_UNEMP', 'EP_NOHSDP',
       'EP_UNINSUR', 'EP_AGE65', 'EP_AGE17', 'EP_DISABL', 'EP_LIMENG',
       'EP_NOVEH', 'EP_CROWD', 'EP_HBURD', 'EP_AFAM', 'EP_HISP', 'EP_ASIAN',
       'EP_AIAN', 'EP_NHPI', 'EP_TWOMORE', 'EP_OTHERRACE', 'EP_MINRTY',
       'EP_WHITE'],
      dtype='object')

In [11]:
svi_simplified_df = svi_simplified_df.rename(columns={'FIPS': 'zipcodes'})

In [12]:
bbl_evictions = bbl_evictions.rename(columns={'eviction_postcode': 'zipcodes'})

In [13]:
bbl_evictions = bbl_evictions.drop(columns=['bbl_clean'])

In [14]:
# merge based on bbl_evictions to just add svi columns/attributes to the bbl_evictions columns
merged_df = bbl_evictions.merge(
    svi_simplified_df,
    on='zipcodes',
    how='left'
)

In [15]:
saved_2017 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2017_reduced.csv"
saved_2018 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2018_reduced.csv"
saved_2019 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2019_reduced.csv"
saved_2020 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2020_reduced.csv"
saved_2021 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2021_reduced.csv"
saved_2022 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2022_reduced.csv"
saved_2023 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2023_reduced.csv"
saved_2024 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2024_reduced.csv"

In [16]:
df_2017 = pd.read_csv(saved_2017)
df_2018 = pd.read_csv(saved_2018)
df_2019 = pd.read_csv(saved_2019)
df_2020 = pd.read_csv(saved_2020)
df_2021 = pd.read_csv(saved_2021)
df_2022 = pd.read_csv(saved_2022)
df_2023 = pd.read_csv(saved_2023)
df_2024 = pd.read_csv(saved_2024)

In [17]:
normal_times_311_df = pd.concat([df_2017, df_2018, df_2019, df_2023, df_2024])

In [18]:
normal_times_311_df.head()

Unnamed: 0,unique_key,created_date,closed_date,complaint_type,incident_zip,incident_address,bbl,borough,latitude,longitude
0,38070156,2017-12-31 23:59:35,2018-01-04 19:27:02,HEAT/HOT WATER,10030.0,181 WEST 135 STREET,1019200000.0,MANHATTAN,40.815127,-73.943252
1,38067146,2017-12-31 23:59:34,2018-01-01 00:57:19,Noise - Residential,10035.0,2048 MADISON AVENUE,1017540000.0,MANHATTAN,40.808655,-73.938532
2,38066214,2017-12-31 23:59:15,2018-01-01 02:48:23,Noise - Residential,10466.0,1902 NEREID AVENUE,2050540000.0,BRONX,40.8987,-73.848528
3,38067041,2017-12-31 23:58:38,2018-01-01 02:53:28,Noise - Street/Sidewalk,11230.0,1201 AVENUE H,3066870000.0,BROOKLYN,40.629675,-73.964939
4,38068229,2017-12-31 23:58:33,2018-01-08 13:30:58,HEAT/HOT WATER,11226.0,70 LINDEN BOULEVARD,3050860000.0,BROOKLYN,40.652289,-73.956328


In [19]:
normal_times_311_df.columns

Index(['unique_key', 'created_date', 'closed_date', 'complaint_type',
       'incident_zip', 'incident_address', 'bbl', 'borough', 'latitude',
       'longitude'],
      dtype='object')

In [20]:
merged_df.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough', 'zipcodes',
       'ejectment', 'eviction/legal_possession', 'latitude', 'longitude',
       'community_board', 'council_district', 'census_tract', 'bin', 'bbl',
       'nta', 'geometry', 'eviction_count', 'year',
       'average_year_eviction_count', 'yearbuilt', 'bldgclass', 'numfloors',
       'unitsres', 'ownername', 'bldgarea', 'building_type',
       'building_category', 'is_condo', 'floor_category', 'rent_era',
       'architectural_style', 'economic_period', 'residential_units_category',
       'is_llc', 'building_size_category', 'size_quartile', 'decade',
       'E_TOTPOP', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4',
       'RPL_THEMES', 'EP_POV150', 'EP_UNEMP', 'EP_NOHSDP', 'EP_UNINSUR',
       'EP_AGE65', 'EP_AGE17', 'EP_DISABL', 'EP_LIMENG', 'EP_NOVEH',
       'EP_CROWD', 'EP_HBURD', 'EP_AFAM', 'EP_HISP', 'EP_ASIAN', 'EP_AIAN',
      

In [21]:
merged_df.head()

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,zipcodes,ejectment,eviction/legal_possession,latitude,...,EP_HBURD,EP_AFAM,EP_HISP,EP_ASIAN,EP_AIAN,EP_NHPI,EP_TWOMORE,EP_OTHERRACE,EP_MINRTY,EP_WHITE
0,34859/16,53416,3476 SEYMOUR AVENUE,3-B,2017-01-03,BRONX,10469,Not an Ejectment,Possession,40.87762,...,36.5,52.2,29.5,5.5,0.6,0.0,2.3,1.0,91.2,8.8
1,B57808/16,74242,1426 BRYANT AVENUE,10 AKA 2ND FL UNIT,2017-01-03,BRONX,10459,Not an Ejectment,Possession,40.830691,...,52.4,28.9,67.3,0.2,0.3,0.0,1.3,0.1,98.1,1.9
2,N069212/14,355977,1309 5TH AVENUE,24H,2017-01-03,MANHATTAN,10029,Not an Ejectment,Possession,40.797309,...,48.5,25.0,45.7,10.8,0.1,0.0,2.0,1.6,85.1,14.9
3,K065455/16,367441,458 EAST 51 STREET,6A,2017-01-03,BROOKLYN,11203,Not an Ejectment,Possession,40.650624,...,41.1,80.3,6.9,1.8,0.1,0.0,5.5,0.4,95.0,5.0
4,33992/16,458984,580 EAST 168TH STREE T,*,2017-01-03,BRONX,10456,Not an Ejectment,Possession,40.830494,...,54.9,38.2,56.3,0.7,0.2,0.0,1.8,0.4,97.6,2.4


In [22]:
merged_df.shape
# lovely, all necessary features, but not too large

(74082, 65)

In [23]:
# bbl_complaints = normal_times_311_df.groupby(['bbl', 'complaint_type']).size().reset_index(name='complaint_count')

In [24]:
# bbl_complaints.shape

In [25]:
# bbl_complaints['bbl'] = bbl_complaints['bbl'].astype('Int64')

In [26]:
# bbl_complaints

In [27]:
# count_zero_bbl = (bbl_complaints['bbl'] == 0).sum()
# count_zero_bbl

In [28]:
# bbl_complaints = bbl_complaints[bbl_complaints['bbl'] != 0]
# bbl_complaints.reset_index(drop=True, inplace=True)

In [29]:
# bbl_complaints

In [30]:
# # first, create pivot table to have complaint types as columns
# complaint_pivot = pd.pivot_table(
#     bbl_complaints,
#     values='complaint_count',
#     index='bbl',
#     columns='complaint_type',
#     aggfunc='sum',
#     fill_value=0
# )

In [31]:
# complaint_pivot

In [32]:
# complaint_pivot['total_complaints'] = complaint_pivot.sum(axis=1)

In [33]:
# complaint_pivot

In [34]:
# complaint_pivot = complaint_pivot.reset_index()
# # so that bbl is a column, not the index

In [35]:
# final_merged_df = merged_df.merge(
#     complaint_pivot,
#     on='bbl',
#     how='left'
# )

In [36]:
# final_merged_df
# # merge this with the merged_df to connect BBLs with court_index_numbers

In [37]:
# final_pivot = final_merged_df.set_index('court_index_number')
# final_pivot
# # set court_index_number as the primary key (index)

In [38]:
# for col in complaint_pivot.columns:
#     if col != 'bbl':
#         final_pivot[col] = final_pivot[col].fillna(0)
# # fillna with 0

### It turns out, we do need a **pivot table**, but need to groupby first to make the merge process more seamless

In [39]:
merged_df.shape

(74082, 65)

In [40]:
merged_df.bbl = merged_df.bbl.astype('Int64')
normal_times_311_df.bbl = normal_times_311_df.bbl.astype('Int64')

In [41]:
normal_times_311_df.columns

Index(['unique_key', 'created_date', 'closed_date', 'complaint_type',
       'incident_zip', 'incident_address', 'bbl', 'borough', 'latitude',
       'longitude'],
      dtype='object')

In [42]:
merged_df.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough', 'zipcodes',
       'ejectment', 'eviction/legal_possession', 'latitude', 'longitude',
       'community_board', 'council_district', 'census_tract', 'bin', 'bbl',
       'nta', 'geometry', 'eviction_count', 'year',
       'average_year_eviction_count', 'yearbuilt', 'bldgclass', 'numfloors',
       'unitsres', 'ownername', 'bldgarea', 'building_type',
       'building_category', 'is_condo', 'floor_category', 'rent_era',
       'architectural_style', 'economic_period', 'residential_units_category',
       'is_llc', 'building_size_category', 'size_quartile', 'decade',
       'E_TOTPOP', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4',
       'RPL_THEMES', 'EP_POV150', 'EP_UNEMP', 'EP_NOHSDP', 'EP_UNINSUR',
       'EP_AGE65', 'EP_AGE17', 'EP_DISABL', 'EP_LIMENG', 'EP_NOVEH',
       'EP_CROWD', 'EP_HBURD', 'EP_AFAM', 'EP_HISP', 'EP_ASIAN', 'EP_AIAN',
      

In [43]:
court_bbl_map = merged_df[['court_index_number', 'bbl']].drop_duplicates()
court_bbl_map.shape
# there are actually no duplicates, court_index_number is indeed good enough to be the sole primary key for the soon-to-be mega merged table

(74082, 2)

In [44]:
# # court_bbl_map.bbl = court_bbl_map.bbl.astype('Int64')
# normal_times_311_df.bbl = normal_times_311_df.bbl.astype('Int64')

In [45]:
# court_bbl_map

In [46]:
# complaints_pivot = pd.pivot_table(
#     normal_times_311_df,
#     index='bbl',
#     columns='complaint_type',
#     values='unique_key',
#     aggfunc='count',
#     fill_value=0
# )

In [47]:
# complaints_pivot

In [48]:
# complaints_pivot['total_complaints'] = complaints_pivot.sum(axis=1)

In [49]:
# court_complaints = court_bbl_map.merge(
#     complaints_pivot,
#     on='bbl',
#     how='left'
# ).fillna(0)

In [53]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
# display all columns

In [54]:
def categorize_complaint(complaint_type):
    complaint = complaint_type.lower().strip()

    # building systems and utilities stuff
    if 'heat' in complaint or 'hot water' in complaint:
        return 'heat_hot_water'
    elif any(term in complaint for term in ['water leak', 'plumbing', 'sewage']):
        return 'plumbing_issues'
    elif 'electric' in complaint:
        return 'electrical_issues'
    elif 'elevator' in complaint:
        return 'elevator_issues'

    # building structure and maintenance
    elif 'door' in complaint or 'window' in complaint:
        return 'doors_windows'
    elif any(term in complaint for term in ['paint', 'plaster', 'mold']):
        return 'walls_ceilings'
    elif 'floor' in complaint or 'stair' in complaint:
        return 'floors_stairs'
    elif 'outside building' in complaint:
        return 'building_exterior'
    elif 'appliance' in complaint:
        return 'appliances'

    # health and environmental impact
    elif 'unsanitary' in complaint or 'condition' in complaint:
        return 'sanitation_issues'
    elif any(pest in complaint for pest in ['rodent', 'mosquito', 'bee', 'wasp', 'pigeon']):
        return 'pest_issues'
    elif 'air' in complaint or 'asbestos' in complaint or 'smoking' in complaint:
        return 'air_quality'

    # noise (all noise complaints together)
    elif 'noise' in complaint:
        return 'noise_complaints'

    # public space influences and nuances
    elif 'homeless' in complaint or 'encampment' in complaint:
        return 'homeless_issues'
    elif 'graffiti' in complaint or 'advertisement' in complaint:
        return 'graffiti_posting'
    elif any(nuisance in complaint for nuisance in ['disorderly', 'panhandling', 'drinking', 'urinating', 'fireworks']):
        return 'public_nuisance'

    # living safety and services
    elif 'safety' in complaint:
        return 'safety_concerns'
    elif 'animal' in complaint or 'abuse' in complaint:
        return 'animal_issues'
    elif 'police' in complaint:
        return 'police_matters'

    # miscellaneous
    elif 'general' in complaint:
        return 'general_complaints'
    else:
        return 'other_issues'

In [55]:
# use categories, instead of real complaint types
# first we re-group the complaint type and then we merge
# the size would be smaller and better for merge later
normal_times_311_df['complaint_category'] = normal_times_311_df['complaint_type'].apply(categorize_complaint)

In [68]:
normal_times_311_df

Unnamed: 0,unique_key,created_date,closed_date,complaint_type,incident_zip,incident_address,bbl,borough,latitude,longitude,complaint_category
0,38070156,2017-12-31 23:59:35,2018-01-04 19:27:02,HEAT/HOT WATER,10030.0,181 WEST 135 STREET,1019200007,MANHATTAN,40.815127,-73.943252,heat_hot_water
1,38067146,2017-12-31 23:59:34,2018-01-01 00:57:19,Noise - Residential,10035.0,2048 MADISON AVENUE,1017540155,MANHATTAN,40.808655,-73.938532,noise_complaints
2,38066214,2017-12-31 23:59:15,2018-01-01 02:48:23,Noise - Residential,10466.0,1902 NEREID AVENUE,2050540041,BRONX,40.898700,-73.848528,noise_complaints
3,38067041,2017-12-31 23:58:38,2018-01-01 02:53:28,Noise - Street/Sidewalk,11230.0,1201 AVENUE H,3066870049,BROOKLYN,40.629675,-73.964939,noise_complaints
4,38068229,2017-12-31 23:58:33,2018-01-08 13:30:58,HEAT/HOT WATER,11226.0,70 LINDEN BOULEVARD,3050860041,BROOKLYN,40.652289,-73.956328,heat_hot_water
...,...,...,...,...,...,...,...,...,...,...,...
1527901,59889054,2024-01-01 00:00:51,2024-01-01 00:29:32,Noise - Street/Sidewalk,11416.0,85-04 95 AVENUE,4090210002,QUEENS,40.684738,-73.855753,noise_complaints
1527902,59892651,2024-01-01 00:00:48,2024-01-01 00:48:34,Noise - Street/Sidewalk,11377.0,41-52 72 STREET,4013110066,QUEENS,40.743761,-73.893186,noise_complaints
1527903,59891528,2024-01-01 00:00:46,2024-01-01 01:07:17,Illegal Fireworks,11417.0,106-23 75 STREET,4091250055,QUEENS,40.677497,-73.861919,public_nuisance
1527904,59888940,2024-01-01 00:00:43,2024-01-01 00:56:45,Noise - Residential,11207.0,640 STANLEY AVENUE,3043710001,BROOKLYN,40.658932,-73.884863,noise_complaints


In [56]:
# count each category for each bbl
# group the complaints by bbl and categories and then count them
bbl_category_counts = normal_times_311_df.groupby(['bbl', 'complaint_category']).size().reset_index(name='count')

In [67]:
bbl_category_counts

Unnamed: 0,bbl,complaint_category,count
0,0,animal_issues,2
1,0,appliances,27
2,0,doors_windows,54
3,0,electrical_issues,22
4,0,elevator_issues,56
...,...,...,...
872716,5200429999,noise_complaints,5
872717,5270000501,plumbing_issues,2
872718,5270000508,plumbing_issues,2
872719,5270000511,noise_complaints,1


### necessary to use a bit pivot table transformation here, because we want this table to have a "wide" format so that:

- each row represents a single building (bbl)
- each complaint category becomes its own column
- the values show the count for each category

In [57]:
# use a bit pivot table here, to make this a wide format with categories as columns
# pivot to have categories as columns
bbl_complaints_wide = bbl_category_counts.pivot(
    index='bbl',
    columns='complaint_category',
    values='count'
).fillna(0).reset_index()

In [66]:
bbl_complaints_wide

complaint_category,bbl,air_quality,animal_issues,appliances,building_exterior,doors_windows,electrical_issues,elevator_issues,floors_stairs,general_complaints,graffiti_posting,heat_hot_water,homeless_issues,noise_complaints,other_issues,pest_issues,plumbing_issues,police_matters,public_nuisance,safety_concerns,sanitation_issues,walls_ceilings,total_complaints
0,0,0.0,2.0,27.0,0.0,54.0,22.0,56.0,18.0,39.0,1.0,241.0,4.0,431.0,0.0,45.0,170.0,6.0,1.0,89.0,72.0,57.0,1335.0
1,144969020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
2,1000010010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,22.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,25.0
3,1000010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1000020001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342956,5200429999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
342957,5270000501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
342958,5270000508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
342959,5270000511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [58]:
all_categories = [
    'heat_hot_water', 'plumbing_issues', 'electrical_issues', 'elevator_issues',
    'doors_windows', 'walls_ceilings', 'floors_stairs', 'building_exterior',
    'appliances', 'sanitation_issues', 'pest_issues', 'air_quality',
    'noise_complaints', 'homeless_issues', 'graffiti_posting', 'public_nuisance',
    'safety_concerns', 'animal_issues', 'police_matters', 'general_complaints',
    'other_issues'
]

In [99]:
# fill na with 0
for category in all_categories:
    if category not in bbl_complaints_wide.columns:
        bbl_complaints_wide[category] = 0

#  it's making sure that all possible complaint categories exist as columns, even if there were no complaints of that type
# in the entire dataset.
# for example, if no buildings had any "elevator_issues", the pivot operation wouldn't create an "elevator_issues" column at all.
# having another loop check ensures that every category in our predefined list exists as a column, even if it's all zeros.

In [100]:
# add a total column
bbl_complaints_wide['total_complaints'] = bbl_complaints_wide[all_categories].sum(axis=1)

In [65]:
bbl_complaints_wide
# so far, we do have the 311 complaint part figure out

complaint_category,bbl,air_quality,animal_issues,appliances,building_exterior,doors_windows,electrical_issues,elevator_issues,floors_stairs,general_complaints,graffiti_posting,heat_hot_water,homeless_issues,noise_complaints,other_issues,pest_issues,plumbing_issues,police_matters,public_nuisance,safety_concerns,sanitation_issues,walls_ceilings,total_complaints
0,0,0.0,2.0,27.0,0.0,54.0,22.0,56.0,18.0,39.0,1.0,241.0,4.0,431.0,0.0,45.0,170.0,6.0,1.0,89.0,72.0,57.0,1335.0
1,144969020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
2,1000010010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,22.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,25.0
3,1000010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1000020001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342956,5200429999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
342957,5270000501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
342958,5270000508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
342959,5270000511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [131]:
merged_with_complaints = merged_df.merge(
    bbl_complaints_wide,
    on='bbl',
    how='left'
)
# the final merge with bbl, evictions, svi with 311 complaints

In [132]:
merged_with_complaints

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,zipcodes,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,geometry,eviction_count,year,average_year_eviction_count,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade,E_TOTPOP,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4,RPL_THEMES,EP_POV150,EP_UNEMP,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,EP_LIMENG,EP_NOVEH,EP_CROWD,EP_HBURD,EP_AFAM,EP_HISP,EP_ASIAN,EP_AIAN,EP_NHPI,EP_TWOMORE,EP_OTHERRACE,EP_MINRTY,EP_WHITE,total_complaints,heat_hot_water,plumbing_issues,electrical_issues,elevator_issues,doors_windows,walls_ceilings,floors_stairs,building_exterior,appliances,sanitation_issues,pest_issues,air_quality,noise_complaints,homeless_issues,graffiti_posting,public_nuisance,safety_concerns,animal_issues,police_matters,general_complaints,other_issues
0,34859/16,53416,3476 SEYMOUR AVENUE,3-B,2017-01-03,BRONX,10469,Not an Ejectment,Possession,40.877620,-73.849806,12.0,12.0,386.0,2117041.0,2047200001,Eastchester-Edenwald-Baychester,POINT (-73.849806 40.87762),13,2017,2.600000,1935.0,C1,4.0,158.0,EASTCHESTER HEIGHTS PROPERTY OWNER LLC,148800.0,pre-war,walk-up,False,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",100+ units,True,mega,Q4 (largest 25%),1930-1939,71862.0,0.9255,0.9259,0.9746,0.8724,0.9507,22.6,8.4,17.5,4.8,17.0,22.2,12.0,6.2,34.7,6.9,36.5,52.2,29.5,5.5,0.6,0.0,2.3,1.0,91.2,8.8,2047200383.0,75.0,7.0,4.0,0.0,10.0,11.0,2.0,0.0,4.0,23.0,3.0,0.0,43.0,0.0,0.0,1.0,0.0,0.0,2.0,6.0,0.0
1,B57808/16,74242,1426 BRYANT AVENUE,10 AKA 2ND FL UNIT,2017-01-03,BRONX,10459,Not an Ejectment,Possession,40.830691,-73.888555,3.0,17.0,123.0,2099901.0,2029990111,Crotona Park East,POINT (-73.888555 40.830691),1,2017,1.000000,1995.0,B1,3.0,2.0,"BAYRON, AIDA L.",2520.0,post-war,two-family,False,low-rise,"1994–Present, vacancy decontrol","1981–2000, Post-Modernism","1991–2008, modern economic growth",2-unit,False,small,Q3 (50-75%),1990-1999,51964.0,0.9925,0.9846,0.9949,0.9333,0.9943,43.4,13.9,31.7,7.1,10.8,26.3,18.2,16.5,65.7,13.7,52.4,28.9,67.3,0.2,0.3,0.0,1.3,0.1,98.1,1.9,2029990115.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,N069212/14,355977,1309 5TH AVENUE,24H,2017-01-03,MANHATTAN,10029,Not an Ejectment,Possession,40.797309,-73.948901,11.0,9.0,17402.0,1078884.0,1016160001,East Harlem South,POINT (-73.948901 40.797309),20,2017,4.000000,1974.0,D7,34.0,600.0,HERITAGE HOLDINGS HOUSING DEVELOPMENT FU ND,680000.0,post-war,elevator,False,high-rise,"1970–1993, deregularization","1951–1980, the International Style, Alternative Modernism","1946–1975, pst war economic boom",100+ units,False,mega,Q4 (largest 25%),1970-1979,75614.0,0.9851,0.9094,0.9606,0.9476,0.9788,44.3,8.6,23.5,6.7,16.3,18.0,15.6,10.9,84.8,7.7,48.5,25.0,45.7,10.8,0.1,0.0,2.0,1.6,85.1,14.9,1016163449.0,329.0,193.0,47.0,163.0,102.0,86.0,43.0,1.0,22.0,167.0,6.0,2.0,490.0,0.0,0.0,3.0,29.0,4.0,1.0,36.0,0.0
3,K065455/16,367441,458 EAST 51 STREET,6A,2017-01-03,BROOKLYN,11203,Not an Ejectment,Possession,40.650624,-73.929261,17.0,45.0,862.0,3102875.0,3046980037,Rugby-Remsen Village,POINT (-73.929261 40.650624),12,2017,2.666667,1940.0,D1,6.0,53.0,458 EAST 51ST PARTNERS LLC,43020.0,pre-war,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",21-100 units,True,very large,Q4 (largest 25%),1940-1949,78506.0,0.8956,0.7886,0.9814,0.9595,0.9386,21.2,6.9,12.1,5.9,19.7,18.7,10.5,2.4,49.7,6.7,41.1,80.3,6.9,1.8,0.1,0.0,5.5,0.4,95.0,5.0,3046980485.0,42.0,44.0,4.0,2.0,23.0,26.0,4.0,2.0,10.0,32.0,1.0,0.0,23.0,0.0,0.0,0.0,3.0,0.0,0.0,8.0,0.0
4,33992/16,458984,580 EAST 168TH STREE T,*,2017-01-03,BRONX,10456,Not an Ejectment,Possession,40.830494,-73.904108,3.0,16.0,185.0,2004234.0,2026110033,Morrisania-Melrose,POINT (-73.904108 40.830494),3,2017,3.000000,,,,,,,,,,,,,,,,,,,88575.0,0.9960,0.9903,0.9910,0.9972,0.9994,49.1,14.7,33.4,7.3,11.3,27.1,19.3,14.7,76.4,11.1,54.9,38.2,56.3,0.7,0.2,0.0,1.8,0.4,97.6,2.4,2026110035.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74077,308123/24,23218,45 BRADHURST AVENUE APT 1,1,2024-12-26,MANHATTAN,10030,Not an Ejectment,Possession,40.822848,-73.943318,9.0,9.0,227.0,1061113.0,1020510138,Hamilton Heights,POINT (-73.943318 40.822848),1,2024,1.000000,1901.0,B1,4.0,2.0,"RUIGOMEZ, ALFRED J",3815.0,pre-war,two-family,False,mid-rise,"Pre-1947, pre-rent-control","1900–1920, Beaux-Arts","Pre-1929, pre-great depression",2-unit,False,medium-small,Q4 (largest 25%),1900-1909,31268.0,0.9891,0.8245,0.9741,0.9897,0.9828,39.0,10.1,21.7,8.1,11.9,18.5,15.3,8.7,77.8,7.2,46.0,53.8,29.7,3.6,0.0,0.0,3.0,1.0,91.1,8.9,,,,,,,,,,,,,,,,,,,,,,
74078,314456/24,22735,370 WEST 52ND STREET APT 3-B,3-B,2024-12-26,MANHATTAN,10019,Not an Ejectment,Possession,40.764435,-73.987336,4.0,3.0,133.0,1078889.0,1010420061,Clinton,POINT (-73.987336 40.764435),4,2024,1.333333,1910.0,C7,5.0,27.0,370 OPERATING LLC,9764.0,pre-war,walk-up,False,mid-rise,"Pre-1947, pre-rent-control","1900–1920, Beaux-Arts","Pre-1929, pre-great depression",21-100 units,True,medium,Q4 (largest 25%),1910-1919,42766.0,0.5396,0.3795,0.8462,0.9681,0.7792,17.5,6.1,3.6,3.0,20.3,6.5,13.1,3.4,84.6,4.7,29.1,4.5,16.9,18.8,0.0,0.2,3.7,0.3,44.4,55.6,1010420203.0,13.0,8.0,7.0,0.0,3.0,3.0,2.0,0.0,0.0,5.0,0.0,0.0,20.0,0.0,0.0,0.0,4.0,0.0,3.0,3.0,0.0
74079,308611/20,21515,1629 ST JOHNS PLACE APT. 4F,4F,2024-12-27,BROOKLYN,11233,Not an Ejectment,Possession,40.670179,-73.923408,8.0,41.0,359.0,3036936.0,3013810057,Crown Heights North,POINT (-73.923408 40.670179),4,2024,1.500000,1925.0,C1,5.0,10.0,1629-1631 ST JOHNS PL LLC,6840.0,pre-war,walk-up,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",6-20 units,True,medium,Q4 (largest 25%),1920-1929,83125.0,0.9541,0.7613,0.9673,0.8866,0.9438,33.0,7.3,14.9,6.9,12.2,20.9,12.4,2.4,63.0,5.0,41.3,66.7,15.7,1.3,0.5,0.0,3.6,0.2,88.1,11.9,3013810161.0,6.0,7.0,3.0,0.0,3.0,2.0,0.0,0.0,3.0,4.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
74080,306966/24,22953,119 BAXTER STREET APARTMENT 8R,8R,2024-12-27,MANHATTAN,10013,Not an Ejectment,Possession,40.718065,-73.998975,2.0,1.0,41.0,1079542.0,1002060004,SoHo-TriBeCa-Civic Center-Little Italy,POINT (-73.998975 40.718065),1,2024,1.000000,1910.0,C7,6.0,22.0,119 BAXTER STREET CC LLC,11880.0,pre-war,walk-up,False,mid-rise,"Pre-1947, pre-rent-control","1900–1920, Beaux-Arts","Pre-1929, pre-great depression",21-100 units,True,large,Q4 (largest 25%),1910-1919,29453.0,0.6261,0.3499,0.8507,0.9442,0.7873,14.4,4.4,12.6,4.1,17.2,14.1,8.2,12.8,70.6,6.2,23.7,2.8,8.3,29.7,0.2,0.0,4.6,0.6,46.2,53.8,1002060090.0,3.0,9.0,0.0,0.0,4.0,8.0,0.0,0.0,0.0,6.0,3.0,0.0,6.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0


In [133]:
zero_bbl_count = (merged_with_complaints['bbl'] == 0).sum()
zero_bbl_count

np.int64(3)

In [134]:
# see if these rows to see if there's a pattern
zero_bbl_rows = merged_with_complaints[merged_with_complaints['bbl'] == 0]
display(zero_bbl_rows.head())

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,zipcodes,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,geometry,eviction_count,year,average_year_eviction_count,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade,E_TOTPOP,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4,RPL_THEMES,EP_POV150,EP_UNEMP,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,EP_LIMENG,EP_NOVEH,EP_CROWD,EP_HBURD,EP_AFAM,EP_HISP,EP_ASIAN,EP_AIAN,EP_NHPI,EP_TWOMORE,EP_OTHERRACE,EP_MINRTY,EP_WHITE,total_complaints,heat_hot_water,plumbing_issues,electrical_issues,elevator_issues,doors_windows,walls_ceilings,floors_stairs,building_exterior,appliances,sanitation_issues,pest_issues,air_quality,noise_complaints,homeless_issues,graffiti_posting,public_nuisance,safety_concerns,animal_issues,police_matters,general_complaints,other_issues
2753,K64855/16,70651,783 MONROE STREET,unknown,2017-02-23,BROOKLYN,11221,Not an Ejectment,Possession,40.688384,-73.925284,3.0,41.0,385.0,3044744.0,0,Stuyvesant Heights,POINT (-73.925284 40.688384),3,2017,1.0,,,,,,,,,,,,,,,,,,,91236.0,0.9599,0.4593,0.9521,0.9624,0.9352,30.0,8.5,15.5,8.4,10.2,17.3,9.8,6.6,62.7,6.2,35.3,39.7,31.0,5.1,0.0,0.1,3.7,0.6,80.2,19.8,2670.0,241.0,170.0,22.0,56.0,54.0,57.0,18.0,0.0,27.0,72.0,45.0,0.0,431.0,4.0,1.0,1.0,89.0,2.0,6.0,39.0,0.0
60354,305838/23,367663,960 PROSPECT AVENUE AKA 961 REV. JAMES POLITE AVE,W-208,2023-12-18,BRONX,10459,Not an Ejectment,Possession,40.822745,-73.900336,2.0,17.0,12901.0,2129270.0,0,Longwood,POINT (-73.900336 40.822745),3,2023,1.0,,,,,,,,,,,,,,,,,,,51964.0,0.9925,0.9846,0.9949,0.9333,0.9943,43.4,13.9,31.7,7.1,10.8,26.3,18.2,16.5,65.7,13.7,52.4,28.9,67.3,0.2,0.3,0.0,1.3,0.1,98.1,1.9,2670.0,241.0,170.0,22.0,56.0,54.0,57.0,18.0,0.0,27.0,72.0,45.0,0.0,431.0,4.0,1.0,1.0,89.0,2.0,6.0,39.0,0.0
67777,343006/23,33978,960 PROSPECT AVENUE,W-103,2024-07-25,BRONX,10459,Not an Ejectment,Possession,40.822745,-73.900336,2.0,17.0,12901.0,2129270.0,0,Longwood,POINT (-73.900336 40.822745),3,2024,1.0,,,,,,,,,,,,,,,,,,,51964.0,0.9925,0.9846,0.9949,0.9333,0.9943,43.4,13.9,31.7,7.1,10.8,26.3,18.2,16.5,65.7,13.7,52.4,28.9,67.3,0.2,0.3,0.0,1.3,0.1,98.1,1.9,2670.0,241.0,170.0,22.0,56.0,54.0,57.0,18.0,0.0,27.0,72.0,45.0,0.0,431.0,4.0,1.0,1.0,89.0,2.0,6.0,39.0,0.0


In [139]:
all_columns = list(merged_with_complaints.columns),
# len(all_columns)
# all_columns
type(all_columns) # wierd, have to use list comprehension, as remove() does not work

tuple

In [145]:
# annoying that bbl is somewhere in the middle
# all_columns = merged_with_complaints.columns.tolist()
# print(all_columns)
# if 'court_index_number' in all_columns:
#     print("yes, court_index_number")
#     all_columns.remove('court_index_number')
# if 'bbl' in all_columns:
#     print("yes, bbl")
#     all_columns.remove('bbl')
# all_columns
remaining_columns = [col for col in all_columns if col not in ['court_index_number', 'bbl']]
remaining_columns = remaining_columns[0]
print(len(remaining_columns))
remaining_columns.remove('court_index_number')
remaining_columns.remove('bbl')


87


In [147]:
len(remaining_columns)
# good

85

In [148]:
new_column_order = ['court_index_number', 'bbl'] + remaining_columns

In [149]:
# new order in place
merged_with_complaints = merged_with_complaints[new_column_order]

In [151]:
display(merged_with_complaints.head())
# amazing

Unnamed: 0,court_index_number,bbl,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,zipcodes,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,nta,geometry,eviction_count,year,average_year_eviction_count,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade,E_TOTPOP,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4,RPL_THEMES,EP_POV150,EP_UNEMP,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,EP_LIMENG,EP_NOVEH,EP_CROWD,EP_HBURD,EP_AFAM,EP_HISP,EP_ASIAN,EP_AIAN,EP_NHPI,EP_TWOMORE,EP_OTHERRACE,EP_MINRTY,EP_WHITE,total_complaints,heat_hot_water,plumbing_issues,electrical_issues,elevator_issues,doors_windows,walls_ceilings,floors_stairs,building_exterior,appliances,sanitation_issues,pest_issues,air_quality,noise_complaints,homeless_issues,graffiti_posting,public_nuisance,safety_concerns,animal_issues,police_matters,general_complaints,other_issues
0,34859/16,2047200001,53416,3476 SEYMOUR AVENUE,3-B,2017-01-03,BRONX,10469,Not an Ejectment,Possession,40.87762,-73.849806,12.0,12.0,386.0,2117041.0,Eastchester-Edenwald-Baychester,POINT (-73.849806 40.87762),13,2017,2.6,1935.0,C1,4.0,158.0,EASTCHESTER HEIGHTS PROPERTY OWNER LLC,148800.0,pre-war,walk-up,False,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",100+ units,True,mega,Q4 (largest 25%),1930-1939,71862.0,0.9255,0.9259,0.9746,0.8724,0.9507,22.6,8.4,17.5,4.8,17.0,22.2,12.0,6.2,34.7,6.9,36.5,52.2,29.5,5.5,0.6,0.0,2.3,1.0,91.2,8.8,2047200383.0,75.0,7.0,4.0,0.0,10.0,11.0,2.0,0.0,4.0,23.0,3.0,0.0,43.0,0.0,0.0,1.0,0.0,0.0,2.0,6.0,0.0
1,B57808/16,2029990111,74242,1426 BRYANT AVENUE,10 AKA 2ND FL UNIT,2017-01-03,BRONX,10459,Not an Ejectment,Possession,40.830691,-73.888555,3.0,17.0,123.0,2099901.0,Crotona Park East,POINT (-73.888555 40.830691),1,2017,1.0,1995.0,B1,3.0,2.0,"BAYRON, AIDA L.",2520.0,post-war,two-family,False,low-rise,"1994–Present, vacancy decontrol","1981–2000, Post-Modernism","1991–2008, modern economic growth",2-unit,False,small,Q3 (50-75%),1990-1999,51964.0,0.9925,0.9846,0.9949,0.9333,0.9943,43.4,13.9,31.7,7.1,10.8,26.3,18.2,16.5,65.7,13.7,52.4,28.9,67.3,0.2,0.3,0.0,1.3,0.1,98.1,1.9,2029990115.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,N069212/14,1016160001,355977,1309 5TH AVENUE,24H,2017-01-03,MANHATTAN,10029,Not an Ejectment,Possession,40.797309,-73.948901,11.0,9.0,17402.0,1078884.0,East Harlem South,POINT (-73.948901 40.797309),20,2017,4.0,1974.0,D7,34.0,600.0,HERITAGE HOLDINGS HOUSING DEVELOPMENT FU ND,680000.0,post-war,elevator,False,high-rise,"1970–1993, deregularization","1951–1980, the International Style, Alternative Modernism","1946–1975, pst war economic boom",100+ units,False,mega,Q4 (largest 25%),1970-1979,75614.0,0.9851,0.9094,0.9606,0.9476,0.9788,44.3,8.6,23.5,6.7,16.3,18.0,15.6,10.9,84.8,7.7,48.5,25.0,45.7,10.8,0.1,0.0,2.0,1.6,85.1,14.9,1016163449.0,329.0,193.0,47.0,163.0,102.0,86.0,43.0,1.0,22.0,167.0,6.0,2.0,490.0,0.0,0.0,3.0,29.0,4.0,1.0,36.0,0.0
3,K065455/16,3046980037,367441,458 EAST 51 STREET,6A,2017-01-03,BROOKLYN,11203,Not an Ejectment,Possession,40.650624,-73.929261,17.0,45.0,862.0,3102875.0,Rugby-Remsen Village,POINT (-73.929261 40.650624),12,2017,2.666667,1940.0,D1,6.0,53.0,458 EAST 51ST PARTNERS LLC,43020.0,pre-war,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1931–1950, Manhattan Modern","1930-1945, great depression and WWII",21-100 units,True,very large,Q4 (largest 25%),1940-1949,78506.0,0.8956,0.7886,0.9814,0.9595,0.9386,21.2,6.9,12.1,5.9,19.7,18.7,10.5,2.4,49.7,6.7,41.1,80.3,6.9,1.8,0.1,0.0,5.5,0.4,95.0,5.0,3046980485.0,42.0,44.0,4.0,2.0,23.0,26.0,4.0,2.0,10.0,32.0,1.0,0.0,23.0,0.0,0.0,0.0,3.0,0.0,0.0,8.0,0.0
4,33992/16,2026110033,458984,580 EAST 168TH STREE T,*,2017-01-03,BRONX,10456,Not an Ejectment,Possession,40.830494,-73.904108,3.0,16.0,185.0,2004234.0,Morrisania-Melrose,POINT (-73.904108 40.830494),3,2017,3.0,,,,,,,,,,,,,,,,,,,88575.0,0.996,0.9903,0.991,0.9972,0.9994,49.1,14.7,33.4,7.3,11.3,27.1,19.3,14.7,76.4,11.1,54.9,38.2,56.3,0.7,0.2,0.0,1.8,0.4,97.6,2.4,2026110035.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
merged_with_complaints.shape

(74082, 87)

In [155]:
# remove rows with BBL = 0
merged_with_complaints_clean = merged_with_complaints[merged_with_complaints['bbl'] != 0] # good
len(merged_with_complaints_clean) # removed 3

74079

In [159]:
# fillna with 0
for category in all_categories + ['total_complaints']:
    if category in merged_with_complaints_clean.columns:
        merged_with_complaints_clean[category] = merged_with_complaints_clean[category].fillna(0)

In [161]:
# convert counts to integers
for col in all_categories + ['total_complaints']:
    if col in merged_with_complaints_clean.columns:
        merged_with_complaints_clean[col] = merged_with_complaints_clean[col].astype(int)

In [164]:
merged_with_complaints_clean.info(), \
merged_with_complaints_clean.shape

<class 'pandas.core.frame.DataFrame'>
Index: 74079 entries, 0 to 74081
Data columns (total 87 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   court_index_number           74079 non-null  object 
 1   bbl                          74079 non-null  Int64  
 2   docket_number                74079 non-null  int64  
 3   eviction_address             74079 non-null  object 
 4   eviction_apartment_number    74079 non-null  object 
 5   executed_date                74079 non-null  object 
 6   borough                      74079 non-null  object 
 7   zipcodes                     74079 non-null  int64  
 8   ejectment                    74079 non-null  object 
 9   eviction/legal_possession    74079 non-null  object 
 10  latitude                     74079 non-null  float64
 11  longitude                    74079 non-null  float64
 12  community_board              74079 non-null  float64
 13  council_district     

(None, (74079, 87))

In [165]:
complaint_cols = ['bbl'] + all_categories + ['total_complaints']
existing_cols = [col for col in complaint_cols if col in merged_with_complaints_clean.columns]
existing_cols

['bbl',
 'heat_hot_water',
 'plumbing_issues',
 'electrical_issues',
 'elevator_issues',
 'doors_windows',
 'walls_ceilings',
 'floors_stairs',
 'building_exterior',
 'appliances',
 'sanitation_issues',
 'pest_issues',
 'air_quality',
 'noise_complaints',
 'homeless_issues',
 'graffiti_posting',
 'public_nuisance',
 'safety_concerns',
 'animal_issues',
 'police_matters',
 'general_complaints',
 'other_issues',
 'total_complaints']

In [166]:
# just take a look at the ones related to the 311 complaint part
display(merged_with_complaints_clean[['court_index_number'] + existing_cols].head())

Unnamed: 0,court_index_number,bbl,heat_hot_water,plumbing_issues,electrical_issues,elevator_issues,doors_windows,walls_ceilings,floors_stairs,building_exterior,appliances,sanitation_issues,pest_issues,air_quality,noise_complaints,homeless_issues,graffiti_posting,public_nuisance,safety_concerns,animal_issues,police_matters,general_complaints,other_issues,total_complaints
0,34859/16,2047200001,75,7,4,0,10,11,2,0,4,23,3,0,43,0,0,1,0,0,2,6,0,2047200383
1,B57808/16,2029990111,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2029990115
2,N069212/14,1016160001,329,193,47,163,102,86,43,1,22,167,6,2,490,0,0,3,29,4,1,36,0,1016163449
3,K065455/16,3046980037,42,44,4,2,23,26,4,2,10,32,1,0,23,0,0,0,3,0,0,8,0,3046980485
4,33992/16,2026110033,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2026110035


In [167]:
# count how many buildings have each type of complaint
buildings_with_complaints_clean = {col: (merged_with_complaints[col] > 0).sum() for col in existing_cols[1:]}
# sorted_counts = sorted(buildings_with_complaints.items(), key=lambda x: x[1], reverse=True)
# this is just a list
complaint_counts_df = pd.DataFrame(list(buildings_with_complaints_clean.items()),
                                  columns=['complaint_category', 'building_count'])

In [168]:
complaint_counts_df = complaint_counts_df.sort_values('building_count', ascending=False)
complaint_counts_df = complaint_counts_df.reset_index(drop=True)
complaint_counts_df

Unnamed: 0,complaint_category,building_count
0,total_complaints,69551
1,noise_complaints,61827
2,plumbing_issues,58592
3,heat_hot_water,57767
4,sanitation_issues,55160
5,doors_windows,49015
6,walls_ceilings,48093
7,general_complaints,42970
8,electrical_issues,42030
9,pest_issues,40371


In [169]:
merged_with_complaints_clean.to_csv('/content/drive/My Drive/X999/bbl_evictions_311_svi_with_categories.csv', index=False)
# good, not too big, with all the necessary information
# could be used merely for retrival purpose

### there was some nan value issues.

In [173]:
nan_analysis = pd.DataFrame({
    'column': merged_with_complaints_clean.columns,
    'dtype': merged_with_complaints_clean.dtypes,
    'nan_count': merged_with_complaints_clean.isna().sum(),
})

In [174]:
nan_analysis['nan_percentage'] = (nan_analysis['nan_count'] / len(merged_with_complaints_clean) * 100).round(2)

In [175]:
nan_columns = nan_analysis[nan_analysis['nan_count'] > 0].sort_values('nan_count', ascending=False)

In [176]:
display(nan_columns)

Unnamed: 0,column,dtype,nan_count,nan_percentage
decade,decade,object,3770,5.09
yearbuilt,yearbuilt,float64,3767,5.09
numfloors,numfloors,float64,3767,5.09
bldgclass,bldgclass,object,3767,5.09
ownername,ownername,object,3767,5.09
bldgarea,bldgarea,float64,3767,5.09
building_type,building_type,object,3767,5.09
building_category,building_category,object,3767,5.09
is_condo,is_condo,object,3767,5.09
floor_category,floor_category,object,3767,5.09


In [177]:
display(nan_columns.groupby('dtype')['column'].count().reset_index().rename(columns={'column': 'count'}))


Unnamed: 0,dtype,count
0,float64,30
1,object,14


In [178]:
rows_with_nan = merged_with_complaints_clean.isna().any(axis=1)
nan_row_count = rows_with_nan.sum()
total_rows = len(merged_with_complaints_clean)
nan_row_percentage = (nan_row_count / total_rows) * 100

In [179]:
nan_row_count, total_rows, nan_row_percentage
# could have deleted them, but could also have just fillna with reasonable strategies

(np.int64(5383), 74079, np.float64(7.266566773309576))