In [1]:
# check for required file

import os
from pathlib import Path

# Location of the dataset, given relative to the notebook's working directory.
FILE_PATH = '../data/NYC_Local_Mental_Health_Programs.csv'
file_path = Path(FILE_PATH)

# Report whether the dataset is present; this cell only prints a message
# and does not stop execution when the file is missing.
status_message = (
    f"File found: {file_path}"
    if file_path.exists()
    else f"File not found: {file_path}. Please ensure the file is downloaded correctly."
)
print(status_message)


File found: ../data/NYC_Local_Mental_Health_Programs.csv


In [2]:
# Load the dataset
import pandas as pd
import numpy as np

# Read the CSV located at FILE_PATH into the working DataFrame.
initial_df = pd.read_csv(filepath_or_buffer=FILE_PATH)

# Print the column names, non-null counts and dtypes of the loaded frame.
initial_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4952 entries, 0 to 4951
Data columns (total 28 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Row Created Date Time             4952 non-null   object 
 1    Sponsor Name                     4952 non-null   object 
 2    Sponsor Code                     4952 non-null   int64  
 3    Agency Name                      4952 non-null   object 
 4    Agency Code                      4952 non-null   int64  
 5   Facility Name                     4952 non-null   object 
 6    Facility Code                    4952 non-null   int64  
 7    Program Name                     4952 non-null   object 
 8    Program Code                     4952 non-null   int64  
 9    Populations Served               4728 non-null   object 
 10   Agency Phone                     4948 non-null   object 
 11   Program Phone                    4919 non-null   object 
 12    Prog

In [3]:
# Preview the first five rows of the freshly loaded frame.
initial_df.head(n=5)

Unnamed: 0,Row Created Date Time,Sponsor Name,Sponsor Code,Agency Name,Agency Code,Facility Name,Facility Code,Program Name,Program Code,Populations Served,...,Program Tier,Operating Certificate Duration,Program County,Program Region,Program Type Description,Program Category Description,Program Subcategory Description,Location,Counties,New York Zip Codes
0,2018-04-27T09:52:56.000,Albany County Department of Mental Health,8099,Albany County Department of Mental Health,70520,Albany County Department of Mental Health,6901,Albany County ACT Program,471,Adults,...,,36.0,Albany,Hudson River,Assertive Community Treatment (ACT),Outpatient,Assertive Community Treatment,"{'human_address': '{""address"": ""175 Green Stre...",2030.0,39.0
1,2018-04-27T09:52:56.000,Albany County Department of Mental Health,8099,Albany County Department of Mental Health,70520,Albany County Department of Mental Health,6901,Albany County DMH - HHCM,18,Adults,...,,,Albany,Hudson River,Health Home Care Management,Support,Care Coordination,"{'human_address': '{""address"": ""175 Green Stre...",2030.0,39.0
2,2018-04-27T09:52:56.000,Albany County Department of Mental Health,8099,Albany County Department of Mental Health,70520,Albany County Department of Mental Health,6901,Albany County DMH - HH NonMed CM,17,Adults,...,,,Albany,Hudson River,Health Home Non-Medicaid Care Management,Support,Care Coordination,"{'human_address': '{""address"": ""175 Green Stre...",2030.0,39.0
3,2018-04-27T09:52:56.000,Albany County Department of Mental Health,8099,Albany County Department of Mental Health,70520,Albany County Department of Mental Health,6901,Albany County Mental Health Clinic,100,Children Adolescents Adults,...,,36.0,Albany,Hudson River,Clinic Treatment,Outpatient,Clinic Treatment,"{'human_address': '{""address"": ""260 South Pear...",2030.0,39.0
4,2018-04-27T09:52:56.000,Albany County Department of Mental Health,8099,Albany County Department of Mental Health,70520,Albany County Department of Mental Health,6901,Mobile Crisis Team,19,Adults,...,,,Albany,Hudson River,Crisis Intervention,Emergency,Crisis,"{'human_address': '{""address"": ""75 New Scotlan...",2030.0,45.0


In [4]:
# Columns to keep for the analysis. Most names carry the leading space that
# is present in the CSV header, so they must be spelled exactly as shown.
columns_to_keep = [' Program Subcategory Description', ' Program Category Description', ' Program Type Description',
                   ' Populations Served', ' Program Name', ' Program County', 'New York Zip Codes']

# Keep only those columns. The explicit .copy() makes the result an
# independent frame, so later column assignments on initial_df do not
# trigger SettingWithCopyWarning or depend on view-vs-copy semantics.
initial_df = initial_df[columns_to_keep].copy()
initial_df.head()

Unnamed: 0,Program Subcategory Description,Program Category Description,Program Type Description,Populations Served,Program Name,Program County,New York Zip Codes
0,Assertive Community Treatment,Outpatient,Assertive Community Treatment (ACT),Adults,Albany County ACT Program,Albany,39.0
1,Care Coordination,Support,Health Home Care Management,Adults,Albany County DMH - HHCM,Albany,39.0
2,Care Coordination,Support,Health Home Non-Medicaid Care Management,Adults,Albany County DMH - HH NonMed CM,Albany,39.0
3,Clinic Treatment,Outpatient,Clinic Treatment,Children Adolescents Adults,Albany County Mental Health Clinic,Albany,39.0
4,Crisis,Emergency,Crisis Intervention,Adults,Mobile Crisis Team,Albany,45.0


In [5]:
# Tally the missing values in each column and print the resulting Series.
nan_counts = initial_df.isnull().sum()
print(nan_counts)

 Program Subcategory Description       0
 Program Category Description          0
 Program Type Description              0
 Populations Served                  224
 Program Name                          0
 Program County                        0
New York Zip Codes                  2007
dtype: int64


In [6]:
# Replace missing ' Populations Served' entries with 'Adults'.
# NOTE(review): 'Adults' as the fill value is an imputation choice - confirm it is appropriate.
populations_col = ' Populations Served'
initial_df[populations_col] = initial_df[populations_col].fillna('Adults')


In [7]:
# Re-tally the missing values per column after the fill above and print them.
nan_counts = initial_df.isnull().sum()
print(nan_counts)

 Program Subcategory Description       0
 Program Category Description          0
 Program Type Description              0
 Populations Served                    0
 Program Name                          0
 Program County                        0
New York Zip Codes                  2007
dtype: int64


In [8]:
# List the distinct values found in the ' Program County' column.
initial_df.loc[:, ' Program County'].unique()

array(['Albany', 'Allegany', 'Bronx', 'Broome', 'Cattaraugus', 'Cayuga',
       'Chautauqua', 'Chemung', 'Chenango', 'Clinton', 'Columbia',
       'Cortland', 'Delaware', 'Dutchess', 'Erie', 'Essex', 'Franklin',
       'Fulton', 'Genesee', 'Greene', 'Hamilton', 'Herkimer', 'Jefferson',
       'Kings', 'Lewis', 'Livingston', 'Madison', 'Monroe', 'Montgomery',
       'Nassau', 'New York', 'Niagara', 'Oneida', 'Onondaga', 'Ontario',
       'Orange', 'Orleans', 'Oswego', 'Otsego', 'Putnam', 'Queens',
       'Rensselaer', 'Richmond', 'Rockland', 'Saratoga', 'Schenectady',
       'Schoharie', 'Schuyler', 'Seneca', 'Steuben', 'St. Lawrence',
       'Suffolk', 'Sullivan', 'Tioga', 'Tompkins', 'Ulster',
       'USA Not NYS', 'Warren', 'Washington', 'Wayne', 'Westchester',
       'Wyoming', 'Yates'], dtype=object)

In [10]:
# Count the rows whose ' Program County' value is exactly "New York".
is_new_york = initial_df[' Program County'].eq("New York")
count = is_new_york.sum()
print(f"Number of rows where Program County is 'New York': {count}")

Number of rows where Program County is 'New York': 471
