# In this notebook, we only focus on cleaning the eviction dataset from:

https://data.cityofnewyork.us/City-Government/Evictions/6z8x-wfk4/data_preview

The explanation of the data can be found here: https://data.cityofnewyork.us/City-Government/Evictions/6z8x-wfk4/about_data

- We went from 96894 records and 20 columns to 76484 records and 17 columns.
- We treat "court_index_number" as the primary key: when several records share the same court index number, we keep only the one with the latest eviction executed date and drop the earlier ones.
- We only focus on residential records and drop rows that have NaN in geo data columns

Finally, we save the cleaned dataset to Google Drive. The visual and data exploration, and eventually the testing of our hypotheses regarding complaints and evictions in NYC, will be done in separate notebooks later.

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import os
import io
import folium
import geopandas as gpd
import seaborn as sns

# suppress warning
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Load the eviction data from Google Drive (the CSV was uploaded there manually).
# Alternative for a one-off session, uploading through the browser instead:
#   from google.colab import files
#   uploaded = files.upload()

from google.colab import drive
drive.mount('/content/drive')
# Location of the raw evictions CSV inside the mounted Drive.
file_path = '/content/drive/My Drive/X999/evictions.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the raw CSV from the Drive path defined above.
# (Browser-upload alternative: pd.read_csv(io.BytesIO(uploaded['Evictions_20240918.csv'])))
evictions_df = pd.read_csv(file_path)

In [None]:
# Peek at the first record to see the raw column layout.
evictions_df.head(1)

Unnamed: 0,Court Index Number,Docket Number,Eviction Address,Eviction Apartment Number,Executed Date,Marshal First Name,Marshal Last Name,Residential/Commercial,BOROUGH,Eviction Postcode,Ejectment,Eviction/Legal Possession,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,315863/23,16461,626 RIVERSIDE DRIVE,18O,02/08/2024,David,Smith,Residential,MANHATTAN,10031,Not an Ejectment,Possession,40.823647,-73.954431,9.0,7.0,225.0,1062307.0,1020870000.0,Hamilton Heights


In [None]:
# (rows, columns) of the raw dataset, before any cleaning.
evictions_df.shape

(96894, 20)

In [None]:
# Dtypes and non-null counts per column of the raw data.
evictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96894 entries, 0 to 96893
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Court Index Number         96894 non-null  object 
 1   Docket Number              96894 non-null  int64  
 2   Eviction Address           96894 non-null  object 
 3   Eviction Apartment Number  81988 non-null  object 
 4   Executed Date              96894 non-null  object 
 5   Marshal First Name         96894 non-null  object 
 6   Marshal Last Name          96894 non-null  object 
 7   Residential/Commercial     96894 non-null  object 
 8   BOROUGH                    96894 non-null  object 
 9   Eviction Postcode          96894 non-null  int64  
 10  Ejectment                  96894 non-null  object 
 11  Eviction/Legal Possession  96894 non-null  object 
 12  Latitude                   88077 non-null  float64
 13  Longitude                  88077 non-null  flo

In [None]:
# Raw column labels exactly as loaded; note the stray trailing space in 'Docket Number '.
evictions_df.columns

Index(['Court Index Number', 'Docket Number ', 'Eviction Address',
       'Eviction Apartment Number', 'Executed Date', 'Marshal First Name',
       'Marshal Last Name', 'Residential/Commercial', 'BOROUGH',
       'Eviction Postcode', 'Ejectment', 'Eviction/Legal Possession',
       'Latitude', 'Longitude', 'Community Board', 'Council District',
       'Census Tract', 'BIN', 'BBL', 'NTA'],
      dtype='object')

In [None]:
# Sample of the marshal last-name column, which is dropped further down as irrelevant.
evictions_df["Marshal Last Name"].head()

Unnamed: 0,Marshal Last Name
0,Smith
1,Daley
2,Barone
3,Grossman
4,Barone


In [None]:
# Distinct values of the residential/commercial flag, used below to filter out commercial units.
evictions_df["Residential/Commercial"].unique()

array(['Residential', 'Commercial'], dtype=object)

In [None]:
# cleaning functions
def drop_columns(df, columns):
    """Remove the given columns from ``df`` in place.

    Args:
        df: DataFrame to modify; it is mutated directly.
        columns: Column label, or list of column labels, to remove.

    Returns:
        The same ``df`` object (now without ``columns``), so the helper can be
        used the same way as ``clean_column_names``.

    Raises:
        KeyError: If any label in ``columns`` is not a column of ``df``.
    """
    df.drop(columns=columns, inplace=True)
    return df

def clean_column_names(df):
    """Normalize the column labels of ``df`` and return the same frame.

    Every label is lower-cased, stripped of leading/trailing whitespace, and
    has each remaining space replaced by "_". Other characters (such as "/")
    are left unchanged. The new labels are assigned onto ``df`` itself.
    """
    normalized = df.columns.str.lower().str.strip().str.replace(' ', '_')
    df.columns = normalized
    return df

In [None]:
# Shape right before rows/columns are dropped, for comparison with the result below.
evictions_df.shape

(96894, 20)

In [None]:
# Column labels right before the marshal name columns are dropped.
evictions_df.columns

Index(['Court Index Number', 'Docket Number ', 'Eviction Address',
       'Eviction Apartment Number', 'Executed Date', 'Marshal First Name',
       'Marshal Last Name', 'Residential/Commercial', 'BOROUGH',
       'Eviction Postcode', 'Ejectment', 'Eviction/Legal Possession',
       'Latitude', 'Longitude', 'Community Board', 'Council District',
       'Census Tract', 'BIN', 'BBL', 'NTA'],
      dtype='object')

In [None]:
# Keep only residential evictions (drop rows that represent commercial units) and
# remove columns that are irrelevant to the analysis: "Marshal Last Name" and
# "Marshal First Name".
# A new frame is built instead of dropping in place, and errors="ignore" is passed,
# so re-running this cell does not raise a KeyError once the columns are gone.
evictions_df = (
    evictions_df[evictions_df['Residential/Commercial'] != 'Commercial']
    .drop(columns=["Marshal Last Name", "Marshal First Name"], errors="ignore")
)

In [None]:
# Sanity check: only residential rows may remain. A bare expression that is not the
# last line of a cell is never displayed, so the check is an assertion instead.
assert (evictions_df["Residential/Commercial"] == "Residential").all()
evictions_df.shape

(87915, 18)

In [None]:
# Missing values per column after the residential filter.
print(evictions_df.isna().sum())

Court Index Number               0
Docket Number                    0
Eviction Address                 0
Eviction Apartment Number    10369
Executed Date                    0
Residential/Commercial           0
BOROUGH                          0
Eviction Postcode                0
Ejectment                        0
Eviction/Legal Possession        0
Latitude                      5886
Longitude                     5886
Community Board               5886
Council District              5886
Census Tract                  5886
BIN                           6027
BBL                           6027
NTA                           5886
dtype: int64


In [None]:
# Python type of the first apartment-number value. .iloc[0] selects by position;
# plain [0] is a label lookup and raises KeyError if row label 0 was filtered out.
type(evictions_df["Eviction Apartment Number"].iloc[0])

str

In [None]:
# The "Eviction Apartment Number" column has NaN values, but those rows are kept
# (filled with "unknown") because they may be interesting for case studies later.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which is deprecated and leaves the frame
# unchanged under pandas Copy-on-Write.
evictions_df['Eviction Apartment Number'] = evictions_df['Eviction Apartment Number'].fillna("unknown")

In [None]:
# Missing values per column after filling the apartment numbers.
print(evictions_df.isna().sum())

Court Index Number              0
Docket Number                   0
Eviction Address                0
Eviction Apartment Number       0
Executed Date                   0
Residential/Commercial          0
BOROUGH                         0
Eviction Postcode               0
Ejectment                       0
Eviction/Legal Possession       0
Latitude                     5886
Longitude                    5886
Community Board              5886
Council District             5886
Census Tract                 5886
BIN                          6027
BBL                          6027
NTA                          5886
dtype: int64


In [None]:
# Drop every row that still has a missing value in any column.
evictions_df = evictions_df.dropna()

In [None]:
# Sanity check after dropping NaN rows: shape plus the info() summary.
# info() prints its report and returns None, hence the None in the displayed tuple.
(evictions_df.shape, evictions_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 81888 entries, 0 to 96893
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Court Index Number         81888 non-null  object 
 1   Docket Number              81888 non-null  int64  
 2   Eviction Address           81888 non-null  object 
 3   Eviction Apartment Number  81888 non-null  object 
 4   Executed Date              81888 non-null  object 
 5   Residential/Commercial     81888 non-null  object 
 6   BOROUGH                    81888 non-null  object 
 7   Eviction Postcode          81888 non-null  int64  
 8   Ejectment                  81888 non-null  object 
 9   Eviction/Legal Possession  81888 non-null  object 
 10  Latitude                   81888 non-null  float64
 11  Longitude                  81888 non-null  float64
 12  Community Board            81888 non-null  float64
 13  Council District           81888 non-null  float64


((81888, 18), None)

In [None]:
# Normalize the column labels (lower case, whitespace stripped, spaces -> "_").
evictions_df = clean_column_names(evictions_df)

In [None]:
# Confirm the renamed columns; the shape must be unchanged by the renaming.
(evictions_df.shape, evictions_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 81888 entries, 0 to 96893
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   court_index_number         81888 non-null  object 
 1   docket_number              81888 non-null  int64  
 2   eviction_address           81888 non-null  object 
 3   eviction_apartment_number  81888 non-null  object 
 4   executed_date              81888 non-null  object 
 5   residential/commercial     81888 non-null  object 
 6   borough                    81888 non-null  object 
 7   eviction_postcode          81888 non-null  int64  
 8   ejectment                  81888 non-null  object 
 9   eviction/legal_possession  81888 non-null  object 
 10  latitude                   81888 non-null  float64
 11  longitude                  81888 non-null  float64
 12  community_board            81888 non-null  float64
 13  council_district           81888 non-null  float64


((81888, 18), None)

In [None]:
# Number of distinct court index numbers and distinct docket numbers.
(
    evictions_df["court_index_number"].nunique(dropna=False),
    evictions_df["docket_number"].nunique(dropna=False),
)

(76484, 70207)

In [None]:
# 81888 rows - 76484 unique court index numbers = 5404 rows that repeat an existing court index number

In [None]:
# Remove rows that are exact duplicates across all columns (first occurrence is kept).
evictions_df = evictions_df.drop_duplicates()

In [None]:
# Every remaining row is residential, so the residential/commercial flag
# carries no information any more and is removed.
evictions_df = evictions_df.drop(columns=['residential/commercial'])

In [None]:
# Shape and column summary after removing exact duplicates and the flag column.
(evictions_df.shape, evictions_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 81750 entries, 0 to 96893
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   court_index_number         81750 non-null  object 
 1   docket_number              81750 non-null  int64  
 2   eviction_address           81750 non-null  object 
 3   eviction_apartment_number  81750 non-null  object 
 4   executed_date              81750 non-null  object 
 5   borough                    81750 non-null  object 
 6   eviction_postcode          81750 non-null  int64  
 7   ejectment                  81750 non-null  object 
 8   eviction/legal_possession  81750 non-null  object 
 9   latitude                   81750 non-null  float64
 10  longitude                  81750 non-null  float64
 11  community_board            81750 non-null  float64
 12  council_district           81750 non-null  float64
 13  census_tract               81750 non-null  float64


((81750, 17), None)

In [None]:
# Find rows that share their court index number with at least one other row.
# duplicated(keep=False) flags every member of a repeated key in one vectorized
# pass and keeps the original row order, replacing the per-group Python lambda
# of groupby(...).filter(...).
duplicate_groups = evictions_df[evictions_df.duplicated(subset='court_index_number', keep=False)]
duplicate_groups.head(2)

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,residential/commercial,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
11,307019/20,108457,505 WEST 54TH STREET,1111,08/16/2022,Residential,MANHATTAN,10019,Not an Ejectment,Possession,40.767339,-73.9903,4.0,6.0,135.0,1087122.0,1010830000.0,Clinton
18,317581/23,19740,46 SULLIVAN STREET APT. 5D,5D,06/04/2024,Residential,BROOKLYN,11231,Not an Ejectment,Possession,40.676995,-74.011014,6.0,38.0,53.0,3008556.0,3005560000.0,Carroll Gardens-Columbia Street-Red Hook


In [None]:
# Inspect one repeated case: all rows filed under court index number 307019/20.
is_example_case = evictions_df["court_index_number"].eq('307019/20')
filtered_rows = evictions_df.loc[is_example_case]

# Show the matching rows
filtered_rows

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,residential/commercial,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
11,307019/20,108457,505 WEST 54TH STREET,1111,08/16/2022,Residential,MANHATTAN,10019,Not an Ejectment,Possession,40.767339,-73.9903,4.0,6.0,135.0,1087122.0,1010830000.0,Clinton
33131,307019/20,108457,505 WEST 54TH STREET,1111,09/28/2022,Residential,MANHATTAN,10019,Not an Ejectment,Possession,40.767339,-73.9903,4.0,6.0,135.0,1087122.0,1010830000.0,Clinton


In [None]:
# Parse the execution date so rows can be ordered chronologically.
# The raw strings are month/day/year (e.g. 02/08/2024, 08/16/2022); giving the
# format explicitly avoids day/month guessing and fails loudly on malformed values.
evictions_df['executed_date'] = pd.to_datetime(evictions_df['executed_date'], format='%m/%d/%Y')

In [None]:
# Order rows by case and then by date, so the latest execution of each case comes last.
evictions_df = evictions_df.sort_values(by=['court_index_number', 'executed_date'])

In [None]:
# Keep one row per court index number: the last one in the current row order.
# NOTE(review): some court_index_number values carry a leading "*" (e.g. "*313639/23");
# they are treated as distinct keys here - confirm whether they should be normalized first.
is_superseded = evictions_df.duplicated(subset=['court_index_number'], keep='last')
evictions_df = evictions_df[~is_superseded]

In [None]:
# Final check of the cleaned evictions frame: shape and column summary.
(evictions_df.shape, evictions_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 76484 entries, 79693 to 6269
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   court_index_number         76484 non-null  object        
 1   docket_number              76484 non-null  int64         
 2   eviction_address           76484 non-null  object        
 3   eviction_apartment_number  76484 non-null  object        
 4   executed_date              76484 non-null  datetime64[ns]
 5   borough                    76484 non-null  object        
 6   eviction_postcode          76484 non-null  int64         
 7   ejectment                  76484 non-null  object        
 8   eviction/legal_possession  76484 non-null  object        
 9   latitude                   76484 non-null  float64       
 10  longitude                  76484 non-null  float64       
 11  community_board            76484 non-null  float64       
 12  counci

((76484, 17), None)

In [None]:
# First rows of the cleaned frame (now ordered by court index number, then date).
evictions_df.head()

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,borough,eviction_postcode,ejectment,eviction/legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
79693,*313639/23,5202,710 61ST STREET,2ND FLOOR,2024-03-04,BROOKLYN,11220,Not an Ejectment,Possession,40.635941,-74.011883,7.0,38.0,118.0,3143881.0,3057940000.0,Sunset Park East
2448,*324973/22,5308,462 60TH STREET,FOURTH FLOOR APT AKA,2024-08-13,BROOKLYN,11220,Not an Ejectment,Possession,40.640008,-74.017068,7.0,38.0,122.0,3143435.0,3057820000.0,Sunset Park West
27159,*53336/16,170279,3400 PAUL AVENUE,15D,2018-10-17,BRONX,10468,Not an Ejectment,Possession,40.87719,-73.889569,7.0,11.0,409.0,2015444.0,2032510000.0,Van Cortlandt Village
77871,*5990/17,2703,480 CONCORD AVENUE,4E,2019-08-30,BRONX,10455,Not an Ejectment,Possession,40.811197,-73.90881,1.0,8.0,35.0,2003900.0,2025770000.0,Mott Haven-Port Morris
4237,000098/17,69483,65 EAST 193RD ST,1B,2017-05-04,BRONX,10468,Not an Ejectment,Possession,40.866075,-73.896515,7.0,14.0,401.0,2013945.0,2031770000.0,Bedford Park-Fordham North


In [None]:
# Save the cleaned evictions table to Google Drive for the follow-up notebooks;
# index=False leaves the DataFrame index out of the CSV.
evictions_df.to_csv('/content/drive/My Drive/X999/evictions_df_cleaned.csv', index=False)