In [2]:
# !pip install geopandas folium matplotlib seaborn scipy
# !pip install esda
# !pip install splot
# # for google colab, had to reinstall some packages.

In [3]:
import pandas as pd
import numpy as np
from scipy import stats
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import os
import io
import geopandas as gpd
import seaborn as sns

# suppress warning
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
file_path1 = '/content/drive/My Drive/X999/bbl_evictions_merged_covid.csv'

In [6]:
file_path2 = '/content/drive/My Drive/X999/svi_simplified.csv'

In [7]:
bbl_evictions = pd.read_csv(file_path1)

In [8]:
svi_simplified_df = pd.read_csv(file_path2)

In [9]:
bbl_evictions.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough',
       'eviction_postcode', 'ejectment', 'eviction/legal_possession',
       'latitude', 'longitude', 'community_board', 'council_district',
       'census_tract', 'bin', 'bbl', 'nta', 'geometry', 'eviction_count',
       'year', 'average_year_eviction_count', 'bbl_clean', 'yearbuilt',
       'bldgclass', 'numfloors', 'unitsres', 'ownername', 'bldgarea',
       'building_type', 'building_category', 'is_condo', 'floor_category',
       'rent_era', 'architectural_style', 'economic_period',
       'residential_units_category', 'is_llc', 'building_size_category',
       'size_quartile', 'decade'],
      dtype='object')

In [10]:
svi_simplified_df.columns

Index(['FIPS', 'E_TOTPOP', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3',
       'RPL_THEME4', 'RPL_THEMES', 'EP_POV150', 'EP_UNEMP', 'EP_NOHSDP',
       'EP_UNINSUR', 'EP_AGE65', 'EP_AGE17', 'EP_DISABL', 'EP_LIMENG',
       'EP_NOVEH', 'EP_CROWD', 'EP_HBURD', 'EP_AFAM', 'EP_HISP', 'EP_ASIAN',
       'EP_AIAN', 'EP_NHPI', 'EP_TWOMORE', 'EP_OTHERRACE', 'EP_MINRTY',
       'EP_WHITE'],
      dtype='object')

In [11]:
svi_simplified_df = svi_simplified_df.rename(columns={'FIPS': 'zipcodes'})

In [12]:
bbl_evictions = bbl_evictions.rename(columns={'eviction_postcode': 'zipcodes'})

In [13]:
bbl_evictions = bbl_evictions.drop(columns=['bbl_clean'])

In [14]:
# left-join the SVI attributes onto the eviction records, matching rows on 'zipcodes';
# every eviction row is kept and simply gains the SVI columns
merged_df = pd.merge(bbl_evictions, svi_simplified_df, how='left', on='zipcodes')

In [15]:
saved_2017 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2017_reduced.csv"
saved_2018 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2018_reduced.csv"
saved_2019 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2019_reduced.csv"
saved_2020 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2020_reduced.csv"
saved_2021 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2021_reduced.csv"
saved_2022 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2022_reduced.csv"
saved_2023 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2023_reduced.csv"
saved_2024 = "/content/drive/My Drive/X999/311_different_years/filtered_df_2024_reduced.csv"

In [16]:
df_2017 = pd.read_csv(saved_2017)
df_2018 = pd.read_csv(saved_2018)
df_2019 = pd.read_csv(saved_2019)
df_2020 = pd.read_csv(saved_2020)
df_2021 = pd.read_csv(saved_2021)
df_2022 = pd.read_csv(saved_2022)
df_2023 = pd.read_csv(saved_2023)
df_2024 = pd.read_csv(saved_2024)

In [20]:
# stack the 2020-2022 complaint frames into one; ignore_index=True gives the combined
# frame a fresh 0..n-1 index instead of repeating each yearly frame's own row labels
covid_311_df = pd.concat([df_2020, df_2021, df_2022], ignore_index=True)

In [21]:
covid_311_df.head()

Unnamed: 0,unique_key,created_date,closed_date,complaint_type,incident_zip,incident_address,bbl,borough,latitude,longitude
0,48538697,2020-12-31 23:59:55,2021-01-01 01:07:04,Noise - Vehicle,10460.0,1569 HOE AVENUE,2029820000.0,BRONX,40.83582,-73.887516
1,48536596,2020-12-31 23:59:28,2021-01-01 01:33:12,Noise - Residential,10028.0,235 EAST 83 STREET,1015290000.0,MANHATTAN,40.776503,-73.954525
2,48536500,2020-12-31 23:58:55,2021-01-01 00:24:54,Noise - Residential,10468.0,2380 GRAND AVENUE,2031990000.0,BRONX,40.861553,-73.904168
3,48542024,2020-12-31 23:58:45,2021-01-14 16:49:17,Noise - Helicopter,10003.0,195 1 AVENUE,1004530000.0,MANHATTAN,40.729916,-73.983616
4,48543542,2020-12-31 23:58:39,2021-01-01 00:13:47,Noise - Residential,10034.0,571 ACADEMY STREET,1022218000.0,MANHATTAN,40.863565,-73.923221


In [22]:
covid_311_df.columns

Index(['unique_key', 'created_date', 'closed_date', 'complaint_type',
       'incident_zip', 'incident_address', 'bbl', 'borough', 'latitude',
       'longitude'],
      dtype='object')

In [23]:
merged_df.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough', 'zipcodes',
       'ejectment', 'eviction/legal_possession', 'latitude', 'longitude',
       'community_board', 'council_district', 'census_tract', 'bin', 'bbl',
       'nta', 'geometry', 'eviction_count', 'year',
       'average_year_eviction_count', 'yearbuilt', 'bldgclass', 'numfloors',
       'unitsres', 'ownername', 'bldgarea', 'building_type',
       'building_category', 'is_condo', 'floor_category', 'rent_era',
       'architectural_style', 'economic_period', 'residential_units_category',
       'is_llc', 'building_size_category', 'size_quartile', 'decade',
       'E_TOTPOP', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4',
       'RPL_THEMES', 'EP_POV150', 'EP_UNEMP', 'EP_NOHSDP', 'EP_UNINSUR',
       'EP_AGE65', 'EP_AGE17', 'EP_DISABL', 'EP_LIMENG', 'EP_NOVEH',
       'EP_CROWD', 'EP_HBURD', 'EP_AFAM', 'EP_HISP', 'EP_ASIAN', 'EP_AIAN',
      

In [24]:
merged_df.head()

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,zipcodes,ejectment,eviction/legal_possession,latitude,...,EP_HBURD,EP_AFAM,EP_HISP,EP_ASIAN,EP_AIAN,EP_NHPI,EP_TWOMORE,EP_OTHERRACE,EP_MINRTY,EP_WHITE
0,35484/19,494073,184 MT. EDEN PARKWAY,UNIT 1D,2020-01-02,BRONX,10457,Not an Ejectment,Possession,40.842943,...,54.0,30.9,65.3,0.4,0.1,0.0,1.1,0.4,98.3,1.7
1,251388/19,117473,160 EAST 117 STREET,3-B,2020-01-02,MANHATTAN,10035,Not an Ejectment,Possession,40.799094,...,45.1,36.7,43.0,3.4,0.1,0.0,3.4,0.3,87.0,13.0
2,66822/19,25733,66-07 ALDERTON ST,unknown,2020-01-02,QUEENS,11374,Not an Ejectment,Possession,40.719316,...,31.7,3.5,22.1,30.2,0.1,0.1,5.6,1.1,62.6,37.4
3,68501/18,91505,1245 STRATFORD AVE,D12,2020-01-02,BRONX,10472,Not an Ejectment,Possession,40.830623,...,50.3,23.1,61.7,8.2,0.1,0.1,1.8,3.0,97.9,2.1
4,68498/19,26147,28-16 47TH STREET,1-L,2020-01-02,QUEENS,11103,Not an Ejectment,Possession,40.76239,...,33.4,2.3,26.6,13.2,0.0,0.0,4.1,0.3,46.5,53.5


In [25]:
merged_df.shape
# lovely, all necessary features, but not too large

(6338, 65)

### It turns out, we do need a **pivot table**, but need to groupby first to make the merge process more seamless

In [26]:
merged_df.shape

(6338, 65)

In [27]:
# cast the bbl join key to pandas' nullable integer dtype in both frames
for frame in (merged_df, covid_311_df):
    frame['bbl'] = frame['bbl'].astype('Int64')

In [28]:
covid_311_df.columns

Index(['unique_key', 'created_date', 'closed_date', 'complaint_type',
       'incident_zip', 'incident_address', 'bbl', 'borough', 'latitude',
       'longitude'],
      dtype='object')

In [29]:
merged_df.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough', 'zipcodes',
       'ejectment', 'eviction/legal_possession', 'latitude', 'longitude',
       'community_board', 'council_district', 'census_tract', 'bin', 'bbl',
       'nta', 'geometry', 'eviction_count', 'year',
       'average_year_eviction_count', 'yearbuilt', 'bldgclass', 'numfloors',
       'unitsres', 'ownername', 'bldgarea', 'building_type',
       'building_category', 'is_condo', 'floor_category', 'rent_era',
       'architectural_style', 'economic_period', 'residential_units_category',
       'is_llc', 'building_size_category', 'size_quartile', 'decade',
       'E_TOTPOP', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4',
       'RPL_THEMES', 'EP_POV150', 'EP_UNEMP', 'EP_NOHSDP', 'EP_UNINSUR',
       'EP_AGE65', 'EP_AGE17', 'EP_DISABL', 'EP_LIMENG', 'EP_NOVEH',
       'EP_CROWD', 'EP_HBURD', 'EP_AFAM', 'EP_HISP', 'EP_ASIAN', 'EP_AIAN',
      

In [30]:
# collect the distinct (court_index_number, bbl) pairs present in merged_df
court_bbl_map = merged_df[['court_index_number', 'bbl']].drop_duplicates()
court_bbl_map.shape
# the row count matches merged_df (6338), so no (court_index_number, bbl) pair is repeated
# NOTE(review): this alone does not prove court_index_number is unique on its own; confirm with
# merged_df['court_index_number'].is_unique before relying on it as the sole primary key

(6338, 2)

In [33]:
# lift the pandas display limits so that all columns are displayed
for display_option in ('display.max_columns', 'display.max_colwidth', 'display.width'):
    pd.set_option(display_option, None)

In [34]:
# Ordered (keywords, category) rules. The first rule with any keyword found as a
# substring of the normalized complaint text wins, so the order of the rules matters.
_COMPLAINT_RULES = (
    # building systems and utilities
    (('heat', 'hot water'), 'heat_hot_water'),
    (('water leak', 'plumbing', 'sewage'), 'plumbing_issues'),
    (('electric',), 'electrical_issues'),
    (('elevator',), 'elevator_issues'),
    # building structure and maintenance
    (('door', 'window'), 'doors_windows'),
    (('paint', 'plaster', 'mold'), 'walls_ceilings'),
    (('floor', 'stair'), 'floors_stairs'),
    (('outside building',), 'building_exterior'),
    (('appliance',), 'appliances'),
    # health and environmental impact
    (('unsanitary', 'condition'), 'sanitation_issues'),
    (('rodent', 'mosquito', 'bee', 'wasp', 'pigeon'), 'pest_issues'),
    (('air', 'asbestos', 'smoking'), 'air_quality'),
    # noise (all noise complaints together)
    (('noise',), 'noise_complaints'),
    # public space influences and nuances
    (('homeless', 'encampment'), 'homeless_issues'),
    (('graffiti', 'advertisement'), 'graffiti_posting'),
    (('disorderly', 'panhandling', 'drinking', 'urinating', 'fireworks'), 'public_nuisance'),
    # living safety and services
    (('safety',), 'safety_concerns'),
    (('animal', 'abuse'), 'animal_issues'),
    (('police',), 'police_matters'),
    # miscellaneous
    (('general',), 'general_complaints'),
)


def categorize_complaint(complaint_type):
    """Map a raw complaint type string to a coarse complaint category label.

    The text is lower-cased and stripped, then checked against _COMPLAINT_RULES
    in order; the category of the first rule with a matching keyword is returned.

    Parameters
    ----------
    complaint_type : str or missing value
        Raw complaint type text. Non-string input (for example the NaN that
        pandas uses for a missing value, or None) is accepted.

    Returns
    -------
    str
        The matched category label, or 'other_issues' when no keyword matches
        or when the input is not a string.

    Notes
    -----
    Matching is plain substring containment, so a short keyword also matches
    inside longer words (for example 'air' is contained in 'repair').
    """
    # a missing value has no .lower(); treat it like any other unmatched complaint
    if not isinstance(complaint_type, str):
        return 'other_issues'

    complaint = complaint_type.lower().strip()
    for keywords, category in _COMPLAINT_RULES:
        if any(keyword in complaint for keyword in keywords):
            return category
    return 'other_issues'

In [35]:
# work with the coarse categories rather than the raw complaint types:
# regrouping first keeps the table smaller, which makes the later merge easier
raw_complaint_types = covid_311_df['complaint_type']
covid_311_df['complaint_category'] = raw_complaint_types.map(categorize_complaint)

In [36]:
covid_311_df

Unnamed: 0,unique_key,created_date,closed_date,complaint_type,incident_zip,incident_address,bbl,borough,latitude,longitude,complaint_category
0,48538697,2020-12-31 23:59:55,2021-01-01 01:07:04,Noise - Vehicle,10460.0,1569 HOE AVENUE,2029820027,BRONX,40.835820,-73.887516,noise_complaints
1,48536596,2020-12-31 23:59:28,2021-01-01 01:33:12,Noise - Residential,10028.0,235 EAST 83 STREET,1015290018,MANHATTAN,40.776503,-73.954525,noise_complaints
2,48536500,2020-12-31 23:58:55,2021-01-01 00:24:54,Noise - Residential,10468.0,2380 GRAND AVENUE,2031990003,BRONX,40.861553,-73.904168,noise_complaints
3,48542024,2020-12-31 23:58:45,2021-01-14 16:49:17,Noise - Helicopter,10003.0,195 1 AVENUE,1004530034,MANHATTAN,40.729916,-73.983616,noise_complaints
4,48543542,2020-12-31 23:58:39,2021-01-01 00:13:47,Noise - Residential,10034.0,571 ACADEMY STREET,1022217501,MANHATTAN,40.863565,-73.923221,noise_complaints
...,...,...,...,...,...,...,...,...,...,...,...
1433290,52934948,2022-01-01 00:01:25,2022-01-01 00:26:49,Noise - Residential,11236.0,1625 ROCKAWAY PARKWAY,3082240022,BROOKLYN,40.641367,-73.898103,noise_complaints
1433291,52934960,2022-01-01 00:01:20,2022-01-01 00:07:25,Noise - Residential,10026.0,95 LENOX AVENUE,1018240016,MANHATTAN,40.801511,-73.950068,noise_complaints
1433292,52931678,2022-01-01 00:01:17,2022-01-01 02:55:16,Noise - Residential,10456.0,974 SHERIDAN AVENUE,2024550014,BRONX,40.829808,-73.919235,noise_complaints
1433293,52938624,2022-01-01 00:01:04,2022-01-01 00:18:49,Noise - Street/Sidewalk,10025.0,998 AMSTERDAM AVENUE,1018810032,MANHATTAN,40.802562,-73.964176,noise_complaints


In [38]:
# tally how many complaints of each category were filed against each bbl
complaints_by_bbl_category = covid_311_df.groupby(['bbl', 'complaint_category'])
bbl_category_counts = complaints_by_bbl_category.size().reset_index(name='count')

In [39]:
bbl_category_counts

Unnamed: 0,bbl,complaint_category,count
0,0,animal_issues,1
1,0,appliances,7
2,0,doors_windows,8
3,0,electrical_issues,2
4,0,elevator_issues,16
...,...,...,...
639730,5270000501,sanitation_issues,1
639731,5270000504,sanitation_issues,1
639732,5270000506,noise_complaints,1
639733,5270000508,noise_complaints,1


### A pivot table transformation is necessary here, because we want this table to have a "wide" format so that:

- each row represents a single building (bbl)
- each complaint category becomes its own column
- the values show the count for each category

In [40]:
# reshape long -> wide: one row per bbl, one column per complaint category
category_counts_by_bbl = bbl_category_counts.pivot(
    index='bbl', columns='complaint_category', values='count'
)
# a (bbl, category) pair with no complaints comes out of the pivot as NaN; record it as 0
bbl_complaints_wide = category_counts_by_bbl.fillna(0).reset_index()

In [41]:
bbl_complaints_wide

complaint_category,bbl,air_quality,animal_issues,appliances,building_exterior,doors_windows,electrical_issues,elevator_issues,floors_stairs,general_complaints,graffiti_posting,heat_hot_water,homeless_issues,noise_complaints,other_issues,pest_issues,plumbing_issues,police_matters,public_nuisance,safety_concerns,sanitation_issues,walls_ceilings
0,0,0.0,1.0,7.0,0.0,8.0,2.0,16.0,6.0,9.0,1.0,43.0,1.0,244.0,22.0,19.0,47.0,3.0,0.0,0.0,17.0,13.0
1,1000010010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
2,1000010101,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000010201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000020001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282093,5270000501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
282094,5270000504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
282095,5270000506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
282096,5270000508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# every category label that categorize_complaint can return, including the
# 'other_issues' fallback; used to select the complaint-count columns
all_categories = [
    'heat_hot_water', 'plumbing_issues', 'electrical_issues', 'elevator_issues',
    'doors_windows', 'walls_ceilings', 'floors_stairs', 'building_exterior',
    'appliances', 'sanitation_issues', 'pest_issues', 'air_quality',
    'noise_complaints', 'homeless_issues', 'graffiti_posting', 'public_nuisance',
    'safety_concerns', 'animal_issues', 'police_matters', 'general_complaints',
    'other_issues'
]

In [43]:
# Make sure every predefined complaint category exists as a column. The pivot only
# creates a column for a category that occurs somewhere in the data, so a category with
# no complaints at all (e.g. no building had any "elevator_issues") would be missing;
# any such category is added here as an all-zero column.
absent_categories = [name for name in all_categories if name not in bbl_complaints_wide.columns]
for name in absent_categories:
    bbl_complaints_wide[name] = 0

In [44]:
# total column: row-wise sum over all the category count columns
category_counts = bbl_complaints_wide[all_categories]
bbl_complaints_wide['total_complaints'] = category_counts.sum(axis='columns')

In [45]:
bbl_complaints_wide
# so far, we do have the 311 complaint part figure out

complaint_category,bbl,air_quality,animal_issues,appliances,building_exterior,doors_windows,electrical_issues,elevator_issues,floors_stairs,general_complaints,graffiti_posting,heat_hot_water,homeless_issues,noise_complaints,other_issues,pest_issues,plumbing_issues,police_matters,public_nuisance,safety_concerns,sanitation_issues,walls_ceilings,total_complaints
0,0,0.0,1.0,7.0,0.0,8.0,2.0,16.0,6.0,9.0,1.0,43.0,1.0,244.0,22.0,19.0,47.0,3.0,0.0,0.0,17.0,13.0,459.0
1,1000010010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,12.0
2,1000010101,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,1000010201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1000020001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282093,5270000501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,8.0
282094,5270000504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
282095,5270000506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
282096,5270000508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [46]:
# the final merge: attach the per-bbl 311 complaint counts to the eviction + SVI records;
# the left join on 'bbl' keeps every row of merged_df
merged_with_complaints = pd.merge(merged_df, bbl_complaints_wide, how='left', on='bbl')

In [47]:
merged_with_complaints

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,zipcodes,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,geometry,eviction_count,year,average_year_eviction_count,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade,E_TOTPOP,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4,RPL_THEMES,EP_POV150,EP_UNEMP,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,EP_LIMENG,EP_NOVEH,EP_CROWD,EP_HBURD,EP_AFAM,EP_HISP,EP_ASIAN,EP_AIAN,EP_NHPI,EP_TWOMORE,EP_OTHERRACE,EP_MINRTY,EP_WHITE,air_quality,animal_issues,appliances,building_exterior,doors_windows,electrical_issues,elevator_issues,floors_stairs,general_complaints,graffiti_posting,heat_hot_water,homeless_issues,noise_complaints,other_issues,pest_issues,plumbing_issues,police_matters,public_nuisance,safety_concerns,sanitation_issues,walls_ceilings,total_complaints
0,35484/19,494073,184 MT. EDEN PARKWAY,UNIT 1D,2020-01-02,BRONX,10457,Not an Ejectment,Possession,40.842943,-73.910173,4.0,16.0,225.0,2007828.0,2028200035,East Concourse-Concourse Village,POINT (-73.910173 40.842943),1,2020,1.0,1928.0,C1,5.0,26.0,184 MT. EDEN LLC,23185.0,pre-war,walk-up,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,True,large,Q4 (largest 25%),1920-1929,79817.0,0.9977,0.9801,0.9961,0.9989,0.9989,48.6,13.2,30.5,9.7,9.3,29.5,16.3,18.6,72.1,18.7,54.0,30.9,65.3,0.4,0.1,0.0,1.1,0.4,98.3,1.7,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,11.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,3.0,21.0
1,251388/19,117473,160 EAST 117 STREET,3-B,2020-01-02,MANHATTAN,10035,Not an Ejectment,Possession,40.799094,-73.940658,11.0,8.0,182.0,1052311.0,1016447502,East Harlem North,POINT (-73.940658 40.799094),1,2020,1.0,,,,,,,,,,,,,,,,,,,38595.0,0.9903,0.9692,0.9656,0.9709,0.9914,46.2,10.7,24.9,7.4,14.3,21.3,18.9,8.7,83.9,8.4,45.1,36.7,43.0,3.4,0.1,0.0,3.4,0.3,87.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
2,66822/19,25733,66-07 ALDERTON ST,unknown,2020-01-02,QUEENS,11374,Not an Ejectment,Possession,40.719316,-73.859091,6.0,29.0,703.0,4461930.0,4031500075,Rego Park,POINT (-73.859091 40.719316),1,2020,1.0,1995.0,C0,2.0,3.0,66-07 ALDERTON LLC,2910.0,post-war,walk-up,False,low-rise,"1994–Present, vacancy decontrol","1981–2000, Post-Modernism","1991–2008, modern economic growth",3-5 units,True,small,Q4 (largest 25%),1990-1999,47230.0,0.7873,0.7151,0.9087,0.7926,0.8578,19.3,4.8,9.5,6.2,19.4,18.8,12.5,13.7,50.8,10.1,31.7,3.5,22.1,30.2,0.1,0.1,5.6,1.1,62.6,37.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,68501/18,91505,1245 STRATFORD AVE,D12,2020-01-02,BRONX,10472,Not an Ejectment,Possession,40.830623,-73.875843,9.0,18.0,54.0,2025350.0,2037760056,West Farms-Bronx River,POINT (-73.875843 40.830623),5,2020,1.0,1928.0,D1,6.0,73.0,1245 STRATFORD LLC,66000.0,pre-war,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,True,very large,Q4 (largest 25%),1920-1929,65283.0,0.9966,0.9419,0.9921,0.9835,0.9937,47.0,10.0,31.9,12.3,12.7,26.0,12.3,19.4,61.6,14.8,50.3,23.1,61.7,8.2,0.1,0.1,1.8,3.0,97.9,2.1,0.0,0.0,23.0,0.0,20.0,9.0,14.0,19.0,28.0,0.0,95.0,0.0,35.0,0.0,1.0,43.0,2.0,1.0,4.0,26.0,26.0,346.0
4,68498/19,26147,28-16 47TH STREET,1-L,2020-01-02,QUEENS,11103,Not an Ejectment,Possession,40.762390,-73.908502,1.0,22.0,147.0,4012954.0,4007200032,Astoria,POINT (-73.908502 40.76239),1,2020,1.0,1928.0,C2,3.0,6.0,28-16 47 STREET LLC,5250.0,pre-war,walk-up,False,low-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",6-20 units,True,medium,Q4 (largest 25%),1920-1929,34752.0,0.8888,0.1379,0.8535,0.8695,0.8159,17.5,7.8,10.9,10.3,11.4,12.4,7.7,7.1,61.9,6.3,33.4,2.3,26.6,13.2,0.0,0.0,4.1,0.3,46.5,53.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6333,313130/22,22053,2813 OCEAN AVENUE,4-B,2022-12-23,BROOKLYN,11235,Not an Ejectment,Possession,40.592437,-73.950081,15.0,48.0,59401.0,3203179.0,3074220101,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,POINT (-73.950081 40.592437),1,2022,1.0,1929.0,D1,6.0,60.0,JAC OCEAN LLC,55920.0,pre-war,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","1930-1945, great depression and WWII",21-100 units,True,very large,Q4 (largest 25%),1920-1929,83069.0,0.9094,0.9179,0.7623,0.9732,0.9524,26.8,5.7,10.4,8.1,25.3,17.1,15.0,25.6,47.6,15.4,41.4,2.4,9.1,14.8,0.0,0.0,5.2,0.5,32.1,67.9,0.0,0.0,3.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,17.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,2.0,33.0
6334,304061/22,21846,25 TENNIS COURT,2B,2022-12-23,BROOKLYN,11226,Not an Ejectment,Possession,40.648539,-73.961758,14.0,40.0,51002.0,3117101.0,3050990021,Flatbush,POINT (-73.961758 40.648539),3,2022,1.0,1929.0,D1,6.0,61.0,25-35 TENNIS COURT LLC,75756.0,pre-war,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","1930-1945, great depression and WWII",21-100 units,True,very large,Q4 (largest 25%),1920-1929,101053.0,0.9300,0.4536,0.9639,0.9692,0.9220,23.7,5.9,13.9,9.1,13.1,18.7,6.7,5.6,66.1,10.0,39.2,63.2,14.9,3.2,0.3,0.0,4.1,0.7,86.3,13.7,0.0,0.0,6.0,0.0,6.0,2.0,5.0,2.0,1.0,0.0,15.0,0.0,106.0,0.0,0.0,8.0,16.0,21.0,0.0,6.0,4.0,198.0
6335,308756/22,22391,205 SEABREEZE AVE,4-I,2022-12-23,BROOKLYN,11224,Not an Ejectment,Possession,40.575841,-73.971162,13.0,48.0,35602.0,3196594.0,3072800188,West Brighton,POINT (-73.971162 40.575841),1,2022,1.0,1930.0,D1,6.0,58.0,SEABREEZE REALTY CO,48000.0,pre-war,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","1930-1945, great depression and WWII",21-100 units,False,very large,Q4 (largest 25%),1930-1939,47893.0,0.9713,0.9983,0.8755,0.9202,0.9920,42.9,10.6,17.7,5.0,28.5,19.6,20.0,25.8,55.8,9.4,42.8,20.1,19.5,9.5,0.4,0.0,2.9,0.0,52.3,47.7,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,1.0,0.0,20.0,0.0,8.0,0.0,0.0,7.0,1.0,0.0,0.0,4.0,1.0,46.0
6336,305234/22,21968,3030 EMMONS AVENUE,3-A,2022-12-23,BROOKLYN,11235,Not an Ejectment,Possession,40.583919,-73.937886,15.0,48.0,622.0,3248031.0,3088150042,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,POINT (-73.937886 40.583919),10,2022,1.0,1972.0,D3,7.0,134.0,"GRECIAN GARDENS SHEEPSHEAD BAY, LLC",144750.0,post-war,elevator,False,high-rise,"1970–1993, deregularization","1951–1980, the International Style, Alternative Modernism","1946–1975, pst war economic boom",100+ units,True,mega,Q4 (largest 25%),1970-1979,83069.0,0.9094,0.9179,0.7623,0.9732,0.9524,26.8,5.7,10.4,8.1,25.3,17.1,15.0,25.6,47.6,15.4,41.4,2.4,9.1,14.8,0.0,0.0,5.2,0.5,32.1,67.9,0.0,0.0,0.0,0.0,4.0,0.0,10.0,1.0,0.0,0.0,25.0,0.0,41.0,0.0,0.0,7.0,0.0,1.0,0.0,3.0,1.0,93.0


In [48]:
# number of merged rows whose bbl equals 0
zero_bbl_count = merged_with_complaints['bbl'].eq(0).sum()
zero_bbl_count

np.int64(0)

In [49]:
# inspect the rows with bbl == 0 to see if there's a pattern
zero_bbl_rows = merged_with_complaints[merged_with_complaints['bbl'] == 0]
display(zero_bbl_rows.head())
# the merged eviction table has no rows with bbl == 0 for the covid period
# (bbl == 0 does occur on the 311 complaint side, but the left join on bbl never brings it in)

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,zipcodes,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,geometry,eviction_count,year,average_year_eviction_count,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade,E_TOTPOP,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4,RPL_THEMES,EP_POV150,EP_UNEMP,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,EP_LIMENG,EP_NOVEH,EP_CROWD,EP_HBURD,EP_AFAM,EP_HISP,EP_ASIAN,EP_AIAN,EP_NHPI,EP_TWOMORE,EP_OTHERRACE,EP_MINRTY,EP_WHITE,air_quality,animal_issues,appliances,building_exterior,doors_windows,electrical_issues,elevator_issues,floors_stairs,general_complaints,graffiti_posting,heat_hot_water,homeless_issues,noise_complaints,other_issues,pest_issues,plumbing_issues,police_matters,public_nuisance,safety_concerns,sanitation_issues,walls_ceilings,total_complaints


In [50]:
# NOTE: the trailing comma wraps the list in a 1-element tuple, i.e. all_columns == ([...],)
all_columns = list(merged_with_complaints.columns),
# len(all_columns)
# all_columns
type(all_columns) # reports tuple because of the trailing comma above; a tuple has no remove()

tuple

In [51]:
# bbl sits somewhere in the middle of the frame, so collect every column except the two
# key columns; they get placed at the front when the new column order is built.
# The names are read straight from merged_with_complaints.columns rather than from
# all_columns, which is a 1-tuple wrapping the column list and cannot be filtered directly.
key_columns = ['court_index_number', 'bbl']
print(len(merged_with_complaints.columns))  # column count before the key columns are set aside
remaining_columns = [col for col in merged_with_complaints.columns if col not in key_columns]


87


In [52]:
len(remaining_columns)
# good

85

In [53]:
# key columns first, then every other column in its existing order
new_column_order = ['court_index_number', 'bbl', *remaining_columns]

In [54]:
# put the new column order in place
merged_with_complaints = merged_with_complaints.loc[:, new_column_order]

In [66]:
merged_with_complaints.columns

Index(['court_index_number', 'bbl', 'docket_number', 'eviction_address',
       'eviction_apartment_number', 'executed_date', 'borough', 'zipcodes', 'ejectment',
       'eviction/legal_possession', 'latitude', 'longitude', 'community_board', 'council_district',
       'census_tract', 'bin', 'nta', 'geometry', 'eviction_count', 'year',
       'average_year_eviction_count', 'yearbuilt', 'bldgclass', 'numfloors', 'unitsres',
       'ownername', 'bldgarea', 'building_type', 'building_category', 'is_condo', 'floor_category',
       'rent_era', 'architectural_style', 'economic_period', 'residential_units_category',
       'is_llc', 'building_size_category', 'size_quartile', 'decade', 'E_TOTPOP', 'RPL_THEME1',
       'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4', 'RPL_THEMES', 'EP_POV150', 'EP_UNEMP',
       'EP_NOHSDP', 'EP_UNINSUR', 'EP_AGE65', 'EP_AGE17', 'EP_DISABL', 'EP_LIMENG', 'EP_NOVEH',
       'EP_CROWD', 'EP_HBURD', 'EP_AFAM', 'EP_HISP', 'EP_ASIAN', 'EP_AIAN', 'EP_NHPI',
       'EP_TWOMO

In [55]:
display(merged_with_complaints.head())
# amazing

Unnamed: 0,court_index_number,bbl,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,zipcodes,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,nta,geometry,eviction_count,year,average_year_eviction_count,yearbuilt,bldgclass,numfloors,unitsres,ownername,bldgarea,building_type,building_category,is_condo,floor_category,rent_era,architectural_style,economic_period,residential_units_category,is_llc,building_size_category,size_quartile,decade,E_TOTPOP,RPL_THEME1,RPL_THEME2,RPL_THEME3,RPL_THEME4,RPL_THEMES,EP_POV150,EP_UNEMP,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,EP_LIMENG,EP_NOVEH,EP_CROWD,EP_HBURD,EP_AFAM,EP_HISP,EP_ASIAN,EP_AIAN,EP_NHPI,EP_TWOMORE,EP_OTHERRACE,EP_MINRTY,EP_WHITE,air_quality,animal_issues,appliances,building_exterior,doors_windows,electrical_issues,elevator_issues,floors_stairs,general_complaints,graffiti_posting,heat_hot_water,homeless_issues,noise_complaints,other_issues,pest_issues,plumbing_issues,police_matters,public_nuisance,safety_concerns,sanitation_issues,walls_ceilings,total_complaints
0,35484/19,2028200035,494073,184 MT. EDEN PARKWAY,UNIT 1D,2020-01-02,BRONX,10457,Not an Ejectment,Possession,40.842943,-73.910173,4.0,16.0,225.0,2007828.0,East Concourse-Concourse Village,POINT (-73.910173 40.842943),1,2020,1.0,1928.0,C1,5.0,26.0,184 MT. EDEN LLC,23185.0,pre-war,walk-up,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,True,large,Q4 (largest 25%),1920-1929,79817.0,0.9977,0.9801,0.9961,0.9989,0.9989,48.6,13.2,30.5,9.7,9.3,29.5,16.3,18.6,72.1,18.7,54.0,30.9,65.3,0.4,0.1,0.0,1.1,0.4,98.3,1.7,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,11.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,3.0,21.0
1,251388/19,1016447502,117473,160 EAST 117 STREET,3-B,2020-01-02,MANHATTAN,10035,Not an Ejectment,Possession,40.799094,-73.940658,11.0,8.0,182.0,1052311.0,East Harlem North,POINT (-73.940658 40.799094),1,2020,1.0,,,,,,,,,,,,,,,,,,,38595.0,0.9903,0.9692,0.9656,0.9709,0.9914,46.2,10.7,24.9,7.4,14.3,21.3,18.9,8.7,83.9,8.4,45.1,36.7,43.0,3.4,0.1,0.0,3.4,0.3,87.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
2,66822/19,4031500075,25733,66-07 ALDERTON ST,unknown,2020-01-02,QUEENS,11374,Not an Ejectment,Possession,40.719316,-73.859091,6.0,29.0,703.0,4461930.0,Rego Park,POINT (-73.859091 40.719316),1,2020,1.0,1995.0,C0,2.0,3.0,66-07 ALDERTON LLC,2910.0,post-war,walk-up,False,low-rise,"1994–Present, vacancy decontrol","1981–2000, Post-Modernism","1991–2008, modern economic growth",3-5 units,True,small,Q4 (largest 25%),1990-1999,47230.0,0.7873,0.7151,0.9087,0.7926,0.8578,19.3,4.8,9.5,6.2,19.4,18.8,12.5,13.7,50.8,10.1,31.7,3.5,22.1,30.2,0.1,0.1,5.6,1.1,62.6,37.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,68501/18,2037760056,91505,1245 STRATFORD AVE,D12,2020-01-02,BRONX,10472,Not an Ejectment,Possession,40.830623,-73.875843,9.0,18.0,54.0,2025350.0,West Farms-Bronx River,POINT (-73.875843 40.830623),5,2020,1.0,1928.0,D1,6.0,73.0,1245 STRATFORD LLC,66000.0,pre-war,elevator,False,mid-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",21-100 units,True,very large,Q4 (largest 25%),1920-1929,65283.0,0.9966,0.9419,0.9921,0.9835,0.9937,47.0,10.0,31.9,12.3,12.7,26.0,12.3,19.4,61.6,14.8,50.3,23.1,61.7,8.2,0.1,0.1,1.8,3.0,97.9,2.1,0.0,0.0,23.0,0.0,20.0,9.0,14.0,19.0,28.0,0.0,95.0,0.0,35.0,0.0,1.0,43.0,2.0,1.0,4.0,26.0,26.0,346.0
4,68498/19,4007200032,26147,28-16 47TH STREET,1-L,2020-01-02,QUEENS,11103,Not an Ejectment,Possession,40.76239,-73.908502,1.0,22.0,147.0,4012954.0,Astoria,POINT (-73.908502 40.76239),1,2020,1.0,1928.0,C2,3.0,6.0,28-16 47 STREET LLC,5250.0,pre-war,walk-up,False,low-rise,"Pre-1947, pre-rent-control","1921–1930, Art Deco Skyscrapers","Pre-1929, pre-great depression",6-20 units,True,medium,Q4 (largest 25%),1920-1929,34752.0,0.8888,0.1379,0.8535,0.8695,0.8159,17.5,7.8,10.9,10.3,11.4,12.4,7.7,7.1,61.9,6.3,33.4,2.3,26.6,13.2,0.0,0.0,4.1,0.3,46.5,53.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [56]:
# Dimensions (rows, columns) of the merged frame before any cleaning in the cells below.
merged_with_complaints.shape

(6338, 87)

In [57]:
# Drop rows whose BBL equals 0.
# `.copy()` makes the result an independent frame, so the column assignments made
# to `merged_with_complaints_clean` in later cells modify this frame directly
# rather than a slice of `merged_with_complaints` (the chained-assignment pattern
# that triggers SettingWithCopyWarning).
merged_with_complaints_clean = merged_with_complaints[merged_with_complaints['bbl'] != 0].copy()
len(merged_with_complaints_clean)  # rows remaining after the filter; equals the original row count when no BBL is 0

6338

In [58]:
# Replace missing values with 0 in every complaint-count column that is present
# in the frame; categories without a matching column are skipped.
for category in [*all_categories, 'total_complaints']:
    if category not in merged_with_complaints_clean.columns:
        continue
    merged_with_complaints_clean[category] = merged_with_complaints_clean[category].fillna(0)

In [59]:
# Cast each complaint-count column that is present in the frame to an integer
# dtype; names without a matching column are skipped.
for col in [*all_categories, 'total_complaints']:
    if col not in merged_with_complaints_clean.columns:
        continue
    merged_with_complaints_clean[col] = merged_with_complaints_clean[col].astype(int)

In [60]:
# Print the per-column summary, then show the shape as the cell's result.
# `DataFrame.info()` prints its report and returns None, so it is called as its
# own statement instead of being bundled into a tuple (which would display a
# stray `None` next to the shape).
merged_with_complaints_clean.info()
merged_with_complaints_clean.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6338 entries, 0 to 6337
Data columns (total 87 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   court_index_number           6338 non-null   object 
 1   bbl                          6338 non-null   Int64  
 2   docket_number                6338 non-null   int64  
 3   eviction_address             6338 non-null   object 
 4   eviction_apartment_number    6338 non-null   object 
 5   executed_date                6338 non-null   object 
 6   borough                      6338 non-null   object 
 7   zipcodes                     6338 non-null   int64  
 8   ejectment                    6338 non-null   object 
 9   eviction/legal_possession    6338 non-null   object 
 10  latitude                     6338 non-null   float64
 11  longitude                    6338 non-null   float64
 12  community_board              6338 non-null   float64
 13  council_district  

(None, (6338, 87))

In [61]:
# Candidate columns for the 311-complaint view: the BBL key, every complaint
# category, and the overall total.
complaint_cols = ['bbl', *all_categories, 'total_complaints']

# Keep only the candidates that actually exist in the cleaned frame,
# preserving their order.
existing_cols = []
for col in complaint_cols:
    if col in merged_with_complaints_clean.columns:
        existing_cols.append(col)
existing_cols

['bbl',
 'heat_hot_water',
 'plumbing_issues',
 'electrical_issues',
 'elevator_issues',
 'doors_windows',
 'walls_ceilings',
 'floors_stairs',
 'building_exterior',
 'appliances',
 'sanitation_issues',
 'pest_issues',
 'air_quality',
 'noise_complaints',
 'homeless_issues',
 'graffiti_posting',
 'public_nuisance',
 'safety_concerns',
 'animal_issues',
 'police_matters',
 'general_complaints',
 'other_issues',
 'total_complaints']

In [62]:
# Preview only the columns related to the 311 complaints, next to the court index number.
display(
    merged_with_complaints_clean
    .loc[:, ['court_index_number', *existing_cols]]
    .head()
)

Unnamed: 0,court_index_number,bbl,heat_hot_water,plumbing_issues,electrical_issues,elevator_issues,doors_windows,walls_ceilings,floors_stairs,building_exterior,appliances,sanitation_issues,pest_issues,air_quality,noise_complaints,homeless_issues,graffiti_posting,public_nuisance,safety_concerns,animal_issues,police_matters,general_complaints,other_issues,total_complaints
0,35484/19,2028200035,1,2,1,0,1,3,0,0,0,0,1,0,11,0,0,0,1,0,0,0,0,21
1,251388/19,1016447502,1,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,7
2,66822/19,4031500075,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,2
3,68501/18,2037760056,95,43,9,14,20,26,19,0,23,26,1,0,35,0,0,1,4,0,2,28,0,346
4,68498/19,4007200032,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [63]:
# For each complaint column, count how many rows of the cleaned frame have at
# least one complaint (value > 0). `existing_cols[0]` is the 'bbl' key, so it is
# skipped; 'total_complaints' is included, so its entry is the number of rows
# with any complaint at all.
# The cleaned frame is used here (not `merged_with_complaints`) so the counts
# describe the same data that the surrounding cells inspect and export.
# NOTE(review): this counts rows; if one building (BBL) can appear on several
# rows it is counted once per row — confirm rows are unique per building.
buildings_with_complaints_clean = {
    col: (merged_with_complaints_clean[col] > 0).sum()
    for col in existing_cols[1:]
}

# Two-column table: one row per complaint category with its row count.
complaint_counts_df = pd.DataFrame(
    list(buildings_with_complaints_clean.items()),
    columns=['complaint_category', 'building_count'],
)

In [64]:
# Order the categories from the highest to the lowest count and renumber the
# index from 0 so the displayed rank matches the row label.
complaint_counts_df = (
    complaint_counts_df
    .sort_values(by='building_count', ascending=False)
    .reset_index(drop=True)
)
complaint_counts_df

Unnamed: 0,complaint_category,building_count
0,total_complaints,5602
1,noise_complaints,4883
2,plumbing_issues,4176
3,heat_hot_water,4131
4,sanitation_issues,3943
5,doors_windows,3265
6,walls_ceilings,3228
7,electrical_issues,2731
8,general_complaints,2679
9,pest_issues,2531


In [78]:
# Write the cleaned, merged frame to Google Drive without the row index.
output_csv_path = '/content/drive/My Drive/X999/bbl_evictions_311_svi_with_categories_covid.csv'
merged_with_complaints_clean.to_csv(output_csv_path, index=False)
# The file is not too big and keeps all the necessary information,
# so it can be used purely for retrieval.
# About 6.7% of rows still contain NaN values; decide later whether to fill them
# in the earlier cleaning files, drop them here, or fill them here.

### There were some NaN value issues.

In [69]:
# Per-column missing-value summary: the column name, its dtype, and how many
# NaN values it holds.
nan_analysis = pd.DataFrame(dict(
    column=merged_with_complaints_clean.columns,
    dtype=merged_with_complaints_clean.dtypes,
    nan_count=merged_with_complaints_clean.isna().sum(),
))

In [70]:
# NaN count of each column as a percentage of all rows, rounded to 2 decimals.
nan_analysis['nan_percentage'] = (
    nan_analysis['nan_count']
    .div(len(merged_with_complaints_clean))
    .mul(100)
    .round(2)
)

In [71]:
# Keep only the columns that contain at least one NaN, most-missing first.
nan_columns = (
    nan_analysis
    .loc[nan_analysis['nan_count'].gt(0)]
    .sort_values(by='nan_count', ascending=False)
)

In [72]:
# Show the summary rows for the columns that contain NaN values.
display(nan_columns)

Unnamed: 0,column,dtype,nan_count,nan_percentage
yearbuilt,yearbuilt,float64,314,4.95
bldgclass,bldgclass,object,314,4.95
numfloors,numfloors,float64,314,4.95
unitsres,unitsres,float64,314,4.95
ownername,ownername,object,314,4.95
bldgarea,bldgarea,float64,314,4.95
building_type,building_type,object,314,4.95
building_category,building_category,object,314,4.95
is_condo,is_condo,object,314,4.95
floor_category,floor_category,object,314,4.95


In [73]:
# How many of the NaN-containing columns there are for each dtype.
display(
    nan_columns
    .groupby('dtype')['column']
    .count()
    .reset_index(name='count')
)


Unnamed: 0,dtype,count
0,float64,30
1,object,14


In [75]:
# Row-level missingness: flag rows holding at least one NaN, count them, and
# express that count as a percentage of all rows.
total_rows = merged_with_complaints_clean.shape[0]
rows_with_nan = merged_with_complaints_clean.isna().any(axis='columns')
nan_row_count = rows_with_nan.sum()
nan_row_percentage = nan_row_count / total_rows * 100

In [77]:
# Rows with at least one NaN, total number of rows, and the percentage of affected rows.
nan_row_count, total_rows, nan_row_percentage
# These rows could be dropped, or the NaN values could instead be filled using a reasonable strategy.

(np.int64(423), 6338, np.float64(6.674029662354054))