In [1]:
#imports
import os

import pandas as pd

In [2]:
# Location of the raw CSV. Set the NYC_BUDGET_CSV environment variable to load the
# file from somewhere else; when it is unset, the original local path is used.
DATA_PATH = os.environ.get(
    "NYC_BUDGET_CSV",
    "C:/Users/Ralph Arren/Desktop/Uni/Uni Year 3/ML_Project/City_Council_Capital_Budget.csv",
)
nyc_data = pd.read_csv(DATA_PATH, encoding="ISO-8859-1")
# nyc_data.head()

In [3]:
# Count the missing values in each column.
nyc_data.isna().sum()
# To display the rows that contain at least one missing value instead:
# nyc_data[nyc_data.isna().any(axis=1)]

Reported              0
Fiscal_Year          12
Borough               2
Award                12
Council_District     33
Sponsor              33
Title                 7
Description         108
ID                    2
Budget_Line           3
dtype: int64

_____________________________________________________________________________________________________________________________________________________
Dealing with "Reported" column

In [4]:
# Show the rows whose 'Reported' value cannot be parsed as a number.
# errors='coerce' maps every unparseable entry to NaN, so the NaN positions
# of the parsed column are exactly the rows we want to look at.
reported_numeric = pd.to_numeric(nyc_data['Reported'], errors='coerce')
nyc_data.loc[reported_numeric.isna()]

Unnamed: 0,Reported,Fiscal_Year,Borough,Award,Council_District,Sponsor,Title,Description,ID,Budget_Line
2440,BATHROOMS AT,,,,,,,,,
2441,PARK WEST,,,,,,,,,
2442,"HIGH SCHOOL (M535/M542)""",,FY20,,,,,,--RENOVATE A MALE AND FEMALE BATHROOM FOR NYC ...,170000.0
2458,"GYMNASIUM UPGRADE""",,FY20,,,,,,ERROR: #NAME?,300000.0
5450,"B:X546 S:X546""",,FY22,,,,,,ERROR: #NAME?,55000.0
6403,"B:M028 S:M028""",,FY23,,,,,,ERROR: #NAME?,150000.0
11254,"EDUCATION CENTER""",,2000000,,,,,,FY26,


We will drop all 7 rows with invalid `Reported` values because they are corrupted. They make up far less than 5% of our data (about 0.06%), so the effect of removing them is negligible, whereas the risk of keeping them is high.

(2440, 2441, 2442, 2458, 5450, 6403, 11254)

In [5]:
# Drop the rows whose 'Reported' value is not numeric.
# errors='coerce' turns unparseable entries into NaN; only rows that parse are kept.
reported_is_numeric = pd.to_numeric(nyc_data['Reported'], errors='coerce').notna()

# .copy() makes the filtered result an independent DataFrame, so the column
# assignments made on nyc_data in later cells do not raise SettingWithCopyWarning.
nyc_data = nyc_data[reported_is_numeric].copy()

In [6]:
# Check: no row with a non-numeric 'Reported' value should remain.
reported_numeric = pd.to_numeric(nyc_data['Reported'], errors='coerce')
nyc_data.loc[reported_numeric.isna()]

Unnamed: 0,Reported,Fiscal_Year,Borough,Award,Council_District,Sponsor,Title,Description,ID,Budget_Line


______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________
Here we handle missing or invalid Fiscal_Years

In [7]:
# Select the rows whose Fiscal_Year is missing (NaN) and display them.
missing_fiscal = nyc_data.loc[nyc_data['Fiscal_Year'].isna()]
missing_fiscal


Unnamed: 0,Reported,Fiscal_Year,Borough,Award,Council_District,Sponsor,Title,Description,ID,Budget_Line
2439,2020,,M,,3,Johnson,MORE ACCESSIBLE,,E CN817,E D001
2457,2020,,K,,36,Cornegy,P.S. 3 BEDFORD VILLAGE (K003) (K003),,E CN832,E D001
5449,2022,,M,,10,Rodriguez,BRONX THEATRE HIGH SCHOOL,,E CN673,E D001
6402,2023,,M,,7,Abreu,P.S. 28 - AUDITORIUM LIGHTING UPGRADE,,E CN136,E D001
11253,2026,,Q,,23,"Speaker,Lee",QUEENS COUNTY FARM EDUCATIONAL CENTER,CONSTRUCTION OF A VISITOR AND,P CN1606,P D019


In [8]:
# A valid Fiscal_Year is 'FY' followed by exactly two digits (e.g. 'FY20').
valid_format = r'^FY\d{2}$'

# Flag entries that are present but do not match the pattern; missing values
# are excluded here because they are handled separately.
fiscal_year = nyc_data['Fiscal_Year']
is_invalid = fiscal_year.notna() & ~fiscal_year.str.match(valid_format, na=False)
nyc_data[is_invalid]

Unnamed: 0,Reported,Fiscal_Year,Borough,Award,Council_District,Sponsor,Title,Description,ID,Budget_Line


No invalid Fiscal_Year entries

In [9]:
# Fill the missing Fiscal_Year values.
# Prefer the row's own 'Reported' year (e.g. 2020 -> 'FY20') over the previous row's
# value, so the result does not depend on row order.
# NOTE(review): this assumes Fiscal_Year is 'FY' + the last two digits of Reported,
# which holds for every row displayed in this notebook - confirm on the full data.
fiscal_year = nyc_data['Fiscal_Year']
if 'Reported' in nyc_data.columns:  # guard: 'Reported' is dropped later in the notebook
    reported_year = pd.to_numeric(nyc_data['Reported'], errors='coerce').dropna()
    fiscal_year = fiscal_year.fillna(
        reported_year.astype(int).mod(100).map('FY{:02d}'.format)
    )

# Forward-fill is kept as a fallback for any row that still has no value.
nyc_data = nyc_data.assign(Fiscal_Year=fiscal_year.ffill())

In [10]:
# Check: no row should have a missing Fiscal_Year any more.
nyc_data.loc[nyc_data['Fiscal_Year'].isna()]

Unnamed: 0,Reported,Fiscal_Year,Borough,Award,Council_District,Sponsor,Title,Description,ID,Budget_Line


_____________________________________________________________________________________________________________________________________________________
Dealing with Boroughs
Valid Borough Values: 'M', 'X', 'K', 'Q', 'R'
('S' and 'SI' are alternative codes for Staten Island; neither is present in this data set, where Staten Island is coded 'R')

M - Manhattan,
X - The Bronx,
K - Brooklyn (Kings),
Q - Queens,
R - Staten Island (Richmond),

In [11]:
# Count how many rows carry each Borough code, to spot unexpected codes.
nyc_data['Borough'].value_counts()

Borough
Q    3106
K    2955
M    2756
X    1852
R     781
A      45
C       1
Name: count, dtype: int64

In [12]:
# Show rows whose Borough code is present but is not one of the accepted codes.
# 'A' most likely stands for Citywide (we will drop it later on).
# 'S' and 'SI' are Staten Island codes that do not occur in this data set.
valid_boroughs = ['M', 'X', 'K', 'Q', 'R', 'A']

borough = nyc_data['Borough']
nyc_data[borough.notna() & ~borough.isin(valid_boroughs)]

Unnamed: 0,Reported,Fiscal_Year,Borough,Award,Council_District,Sponsor,Title,Description,ID,Budget_Line
2732,2020,FY20,C,3000000.0,15,"Speaker, Torres","CONCOURSE VILLAGE, INC",FUNDS WILL SUPPORT NEEDED REPAIRS AT THE DEVEL...,HD NC926,HD D003


In [13]:
# Change the stray "C" Borough value to "X", since CONCOURSE VILLAGE, INC is located
# in the South Bronx.
# The row is selected by its content rather than by the index label 2732: assigning
# through .loc with a label that is not in the index appends a new row instead of
# raising, which would silently corrupt the frame if the index ever changes.
is_concourse_c = (
    nyc_data['Borough'].eq('C')
    & nyc_data['Title'].str.contains('CONCOURSE VILLAGE', regex=False, na=False)
)
nyc_data.loc[is_concourse_c, 'Borough'] = 'X'

In [14]:
# Check: look up the row with index label 2732 and display its Borough value.
nyc_data.loc[2732, 'Borough']

'X'

_____________________________________________________________________________________________________________________________________________________
Dealing with 'Council_District' values

In [15]:
# Show rows whose Council_District cannot be parsed as a number
# (this includes rows where the value is missing).
district_parsed = pd.to_numeric(nyc_data['Council_District'], errors='coerce')
nyc_data.loc[district_parsed.isna()]



Unnamed: 0,Reported,Fiscal_Year,Borough,Award,Council_District,Sponsor,Title,Description,ID,Budget_Line
3300,2021,FY21,A,504000.0,,,JEWISH CHILD CARE ASSOCIATION OF NEW YORK - IN...,REALLOCATION OF PRIOR FUNDING IN THREE PROJECT...,CS TA001,CS DN207
4188,2021,FY21,A,277000.0,,,MAKE THE ROAD NEW YORK CONSTRUCTION,REALLOCATION OF PRIOR FUNDING IN 850 ROADNYC2 ...,ED TA001,ED DN631
4311,2021,FY21,A,2800000.0,,,VILLA MARIA REHAB WORK,REALLOCATION OF FUNDING FROM 806 RLAFMOR LAFAY...,HD TA001,HD D024
4345,2021,FY21,A,514000.0,,,NEWYORK-PRESBYTERIAN QUEENS ULTRASOUND MACHINES,REALLOCATION OF PRIOR FUNDING FROM 850 HLDNHOS...,HL TA001,HL DN305
4346,2021,FY21,A,377000.0,,,FLUSHING HOSPITAL - MOMMOGRAPHY AND ULTRASOUND...,REALLOCATION OF PRIOR FUNDING IN 850 HLDNFLUSH...,HL TA002,HL DN565
4384,2021,FY21,A,490000.0,,,CUNY CITY TECH PEARL BUILDING 3RD FLOOR CONSTR...,REALLOCATION OF FUNDING FROM ACADEMIC LEARNING...,HN TA001,HN D300
4416,2021,FY21,A,2973000.0,,,GREENBURGER CENTER FOR SOCIAL AND CRIMINAL JUS...,REALLOCATION OF PRIOR FUNDING FROM EXISTING PR...,HR TA001,HR DN01N
4481,2021,FY21,A,500000.0,,,ST. ALBANS RENOVATION (850 LQSARENOV),REALLOCATION OF FUNDING FROM SOUTH HOLLIS LIBR...,LQ TA001,LQ D122
4580,2021,FY21,A,1915000.0,,,DETECTIVE KEITH WILLIAMS PARK,REALLOCATION OF FUNDING FROM 846 P-412RWP5.,P TA001,P D019
4706,2021,FY21,A,150000.0,,,ABC NO RIO - CONSTRUCTION OF NEW FACILITY,REALLOCATION OF FUNDING FROM 801 GOVISSPWK.,PV TA001,PV D467


In [16]:
# Parse Council_District into a numeric column; unparseable entries become NaN.
nyc_data['Council_District_num'] = pd.to_numeric(nyc_data['Council_District'], errors='coerce')

# Show rows whose parsed district is present but falls outside 1-51,
# the range this notebook treats as valid.
# NOTE(review): no later cell visible here changes these out-of-range values,
# so they appear to stay in Council_District_num - confirm this is intended.
district_num = nyc_data['Council_District_num']
out_of_range = district_num.notna() & ~district_num.between(1, 51)
nyc_data.loc[out_of_range]

Unnamed: 0,Reported,Fiscal_Year,Borough,Award,Council_District,Sponsor,Title,Description,ID,Budget_Line,Council_District_num
875,2019,FY19,M,1000000.0,1000,"Rodriguez, Speaker",GEORGE WASHINGTON HIGH SCHOOL,SCIENCE LAB FOR ALL 4 SCHOOLS,E CN881,E D001,1000.0
876,2019,FY19,Q,750000.0,2800,"Adams, Speaker","HSLEPS, Q690 AUDITORUM UPGRADE","AUDITORIUM UPGRADE AT HSLEPS, Q690",E CN882,E D001,2800.0
878,2019,FY19,R,1500000.0,955000,"Staten Island Delegation, Matteo, Speaker",I.S. 51 - PLAYGROUND UPGRADE,PLAYGROUND IS 51 (PRINCIPAL NICHOLAS MELE),E CN885,E D001,955000.0
879,2019,FY19,K,3500000.0,4700,"Treyger, Speaker",JOHN DEWEY HIGH SCHOOL (21K540) - AUDITORIUM U...,AUDITORIUM UPGRADE,E CN888,E D001,4700.0
880,2019,FY19,M,2000000.0,1000,"Rodriguez, Speaker",GREGORIO LUPERON H.S. ROBOTICS PROGRAM,TECHNOLOGY UPGRADE,E CN892,E D001,1000.0
...,...,...,...,...,...,...,...,...,...,...,...
11497,2026,FY26,Q,299000.0,99,Technical Adjustments,ST. MARY'S HOSPITAL INITIAL OUTFITTING,IO FOR NEW 18 BED UNIT,HL NC1549,HL DN367,99.0
11498,2026,FY26,R,933000.0,99,Technical Adjustments,STATEN ISLAND UNIVERSITY HOSPITAL,ELECTROPHYSIOLOGY LAB 2,HL NC1550,HL DN404,99.0
11499,2026,FY26,Q,318000.0,99,Technical Adjustments,"KOREAN-AMERICAN FAMILY SERVICE CENTER, INC. (8...",TECHNICAL ADJUSTMENT TO RESTORE FUNDS TO PROJE...,HR NC1552,HR DN918,99.0
11500,2026,FY26,M,189000.0,99,Technical Adjustments,"CEC STUYVESANT COVE, INC. DBA SOLAR ONE (850 E...",TECHNICAL ADJUSTMENT TO RESTORE FUNDS TO 850 E...,ED NC1553,ED DN690,99.0


Let's drop Council_District, since we don't need it anymore now that Council_District_num holds the numeric values


In [20]:
# Drop the raw Council_District column; Council_District_num holds its numeric form.
# The drop has to actually run for a fresh "Restart & Run All" to produce the frame
# shown below. errors='ignore' keeps the cell safe to re-run once the column is gone.
nyc_data = nyc_data.drop(columns=['Council_District'], errors='ignore')
#check
nyc_data.head()

Unnamed: 0,Reported,Fiscal_Year,Borough,Award,Sponsor,Title,Description,ID,Budget_Line,Council_District_num
0,2019,FY19,X,250000.0,Gibson,VOLUNTEERS OF AMERICA,CLARKE PLACE SENIOR RESIDENCE PROJECT,AG CN002,HD D024,16.0
1,2019,FY19,K,3640000.0,"Speaker, Espinal",CYPRESS HILLS CHILD CARE CENTER,CONSTRUCTION OF A CHILD CARE CENTER,CS NC001,CS DN956,37.0
2,2019,FY19,M,425000.0,Kallos,[SCA] [02M077/02M198] LOWER LAB/ISADOR E. IDA ...,HVAC SYSTEM FOR CAFETERIA,E CN001,E D001,5.0
3,2019,FY19,M,100000.0,Kallos,[SCA] [02M077/02M198] LOWER LAB/ISADOR E. IDA ...,PLAYGROUND RENOVATION,E CN002,E D001,5.0
4,2019,FY19,M,35000.0,Kallos,[SCA] [02M114] EAST SIDE MIDDLE SCHOOL,TECHNOLOGY UPGRADES,E CN003,E D001,5.0


________________________________________________________________________________________________________________________________________________
Dropping more columns we generally do not need!

- Reported [DROPPED] : redundant with Fiscal_Year, and could reduce accuracy. Fiscal_Year is more important because it tells us when the money was actually allocated to a project.

- ID [DROPPED] : unique keys have no predictive power.

- Sponsor [DROPPED] : text version of Council_District, redundant

In [23]:
# Drop the columns we do not need: 'Reported', 'ID' and 'Sponsor'.
# All three drops have to run for a fresh "Restart & Run All" to produce the frame
# shown below. errors='ignore' keeps the cell safe to re-run once they are gone.
nyc_data = nyc_data.drop(columns=['Reported', 'ID', 'Sponsor'], errors='ignore')
#check
nyc_data.head()

Unnamed: 0,Fiscal_Year,Borough,Award,Title,Description,Budget_Line,Council_District_num
0,FY19,X,250000.0,VOLUNTEERS OF AMERICA,CLARKE PLACE SENIOR RESIDENCE PROJECT,HD D024,16.0
1,FY19,K,3640000.0,CYPRESS HILLS CHILD CARE CENTER,CONSTRUCTION OF A CHILD CARE CENTER,CS DN956,37.0
2,FY19,M,425000.0,[SCA] [02M077/02M198] LOWER LAB/ISADOR E. IDA ...,HVAC SYSTEM FOR CAFETERIA,E D001,5.0
3,FY19,M,100000.0,[SCA] [02M077/02M198] LOWER LAB/ISADOR E. IDA ...,PLAYGROUND RENOVATION,E D001,5.0
4,FY19,M,35000.0,[SCA] [02M114] EAST SIDE MIDDLE SCHOOL,TECHNOLOGY UPGRADES,E D001,5.0


Data Cleaning Part 2____________________________________________________________________________________________________________________________
checking for any null and invalid entries per column

In [24]:
# Count the missing values that remain in each column after the first cleaning pass.
nyc_data.isnull().sum()

Fiscal_Year               0
Borough                   0
Award                     5
Title                     0
Description             101
Budget_Line               0
Council_District_num     26
dtype: int64

In [27]:
# Drop the rows where Award is null (5 rows according to the null counts above).
# Reassign instead of using inplace=True: nyc_data comes from an earlier boolean
# filter, and mutating such a frame in place can raise SettingWithCopyWarning.
nyc_data = nyc_data.dropna(subset=['Award'])

#check
nyc_data.isnull().sum()


Fiscal_Year              0
Borough                  0
Award                    0
Title                    0
Description             97
Budget_Line              0
Council_District_num    26
dtype: int64

In [None]:
# Dealing with Awards: show rows whose Award cannot be parsed as a number.
award_numeric = pd.to_numeric(nyc_data['Award'], errors='coerce')
nyc_data.loc[award_numeric.isna()]

Unnamed: 0,Fiscal_Year,Borough,Award,Title,Description,Budget_Line,Council_District_num


In [31]:
# Show rows whose Award, rendered as text, contains a literal dollar sign.
award_text = nyc_data['Award'].astype(str)
nyc_data.loc[award_text.str.contains('$', regex=False)]

Unnamed: 0,Fiscal_Year,Borough,Award,Title,Description,Budget_Line,Council_District_num


Everything looks consistent so far...let's export


In [None]:
# The export is currently disabled; uncomment the line below to write the cleaned
# frame to a CSV file (index=False leaves the row index out of the file).
# nyc_data.to_csv('cleaned_nyc_capital_projects.csv', index=False)

In [33]:
# Final size of the cleaned frame as (rows, columns).
nyc_data.shape

(11491, 7)