In [5]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt

# Location of the raw Service Finder export, relative to this notebook.
FILE_PATH = '../data/NYC_Mental_Health_Service_Finder_Data.csv'

file_path = Path(FILE_PATH)

# Fail fast with an actionable message. Previously the missing-file case only
# printed a notice and then crashed inside pd.read_csv on the very next line.
if not file_path.exists():
    raise FileNotFoundError(
        f"File not found: {file_path}. Please ensure the file is downloaded correctly."
    )
print(f"File found: {file_path}")

# Load the dataset
initial_df = pd.read_csv(file_path)

File found: ../data/NYC_Mental_Health_Service_Finder_Data.csv


In [6]:
# Print each column's name, non-null count and dtype for the raw frame.
initial_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name_1                  492 non-null    object 
 1   name_2                  414 non-null    object 
 2   street_1                492 non-null    object 
 3   street_2                167 non-null    object 
 4   city                    492 non-null    object 
 5   zip                     492 non-null    int64  
 6   phone                   488 non-null    object 
 7   website                 369 non-null    object 
 8   latitude                492 non-null    float64
 9   longitude               490 non-null    float64
 10  flag_saf                211 non-null    float64
 11  flag_mhf                305 non-null    float64
 12  flag_mc                 301 non-null    float64
 13  flag_md                 408 non-null    float64
 14  flag_np_ss              278 non-null    fl

In [7]:
# Preview the first five rows of the raw frame.
initial_df.head()

Unnamed: 0,name_1,name_2,street_1,street_2,city,zip,phone,website,latitude,longitude,...,flag_hv,flag_dv,flag_chld,flag_yad,flag_adlt,flag_snr,flag_si,filter_military,filter_inpatient_svc,filter_residential_pgm
0,Allied Service Center NYC,Queens Opioid Treatment Clinic Otp 3,2036 Amsterdam Avenue,,NEW YORK,10032,212-645-0875,,40.835269,-73.940293,...,1.0,,,1.0,,,,,,
1,Beth Israel Medical Center,Ny Center Addiction Treatment Op 1,25 12th Street,,BROOKLYN,11215,718-965-7900,,40.671835,-73.997069,...,,,,1.0,,,,,,
2,New York Center Addiction Treatment,Bleuler Cd Recovery Services Op,37-20 74Th Street,,QUEENS,11372,212-966-9537,http://nycats.net/,40.748278,-73.891721,...,,,,1.0,,,,,,
3,"Bleuler Psychotherapy Center, Inc.",Bleuler Cd Recovery Services Op,104-70 Queens Boulevard,,QUEENS,11375,718-275-6010,,40.72337,-73.848295,...,,,1.0,1.0,1.0,1.0,,,,
4,"Safe Space NYC, Inc.",Family Life Clinic,133-25 Guy R. Brewer Boulevard,,QUEENS,11434,718-206-3440,,40.673769,-73.775951,...,,,1.0,1.0,1.0,,,,,


In [8]:

# Columns retained for the reduced dataset: the flag_/filter_ indicator
# columns of interest plus the fields used to label and locate each record.
columns_to_keep = [
    'flag_hv',
    'flag_dv',
    'flag_chld',
    'flag_yad',
    'flag_adlt',
    'flag_snr',
    'flag_si',
    'filter_military',
    'filter_inpatient_svc',
    'filter_residential_pgm',
    'name_2', 'city', 'latitude', 'longitude'
]

# .copy() makes the reduced frame independent of the raw one, so the column
# assignments performed in later cells do not write into a selection of the
# original frame (which can raise SettingWithCopyWarning).
initial_df = initial_df[columns_to_keep].copy()
initial_df.head()

Unnamed: 0,flag_hv,flag_dv,flag_chld,flag_yad,flag_adlt,flag_snr,flag_si,filter_military,filter_inpatient_svc,filter_residential_pgm,name_2,city,latitude,longitude
0,1.0,,,1.0,,,,,,,Queens Opioid Treatment Clinic Otp 3,NEW YORK,40.835269,-73.940293
1,,,,1.0,,,,,,,Ny Center Addiction Treatment Op 1,BROOKLYN,40.671835,-73.997069
2,,,,1.0,,,,,,,Bleuler Cd Recovery Services Op,QUEENS,40.748278,-73.891721
3,,,1.0,1.0,1.0,1.0,,,,,Bleuler Cd Recovery Services Op,QUEENS,40.72337,-73.848295
4,,,1.0,1.0,1.0,,,,,,Family Life Clinic,QUEENS,40.673769,-73.775951


In [9]:
import pandas as pd
import numpy as np

# Indicator columns hold 1.0 where set and NaN otherwise; they are recoded
# to 0/1 integers below.
flag_columns = [col for col in initial_df.columns if col.startswith('flag_')]
filter_columns = [col for col in initial_df.columns if col.startswith('filter_')]

# The rename at the end of this cell removes the flag_/filter_ prefixes, so a
# second run would find no flag_ columns and service_count would be recomputed
# from an empty column selection. Stop explicitly instead of letting the score
# be silently overwritten.
if not flag_columns:
    raise RuntimeError(
        "No 'flag_' columns found in initial_df; this cell appears to have been run already. "
        "Re-run the notebook from the data-loading cell."
    )

# Map flag_ columns from NaN, 1.0 => 0, 1
initial_df[flag_columns] = initial_df[flag_columns].fillna(0).astype(int)

# Map filter_ columns from NaN, 1.0 => 0, 1
initial_df[filter_columns] = initial_df[filter_columns].fillna(0).astype(int)

# service_count = number of flag_ indicators set on each row
# (the filter_ columns are not included in the sum).
initial_df['service_count'] = initial_df[flag_columns].sum(axis=1)

# Human-readable labels for the coded column names. Keys that are absent from
# initial_df (flag_saf, flag_mhf, flag_mc, flag_vet and flag_pw are not among
# the columns kept earlier) are ignored by rename().
# NOTE(review): these labels are the notebook's interpretation of the codes,
# not taken from an official source - verify them against the dataset's data
# dictionary before relying on them.
column_mapping = {
    'flag_saf': 'Safe Access Facility',
    'flag_mhf': 'Mental Health Facility',
    'flag_mc': 'Mobile Crisis Unit',
    'flag_vet': 'Veteran Services',
    'flag_adlt': 'Adult Services',
    'flag_chld': 'Child Services',
    'flag_pw': 'Disability Services',
    'flag_snr': 'Senior Services',
    'filter_inpatient_svc': 'Inpatient Services',
    'filter_residential_pgm': 'Residential Program',
    'flag_hv': 'HIV Services',
    'flag_dv': 'Domestic Violence Services',
    'flag_yad': 'Youth and Adolescent Services',
    'flag_si': 'Substance Abuse Services',
    'filter_military': 'Military Services'
}

# Apply the labels; rename() returns a new frame, which replaces initial_df.
initial_df = initial_df.rename(columns=column_mapping)

# Verify changes
initial_df.head()

Unnamed: 0,HIV Services,Domestic Violence Services,Child Services,Youth and Adolescent Services,Adult Services,Senior Services,Substance Abuse Services,Military Services,Inpatient Services,Residential Program,name_2,city,latitude,longitude,service_count
0,1,0,0,1,0,0,0,0,0,0,Queens Opioid Treatment Clinic Otp 3,NEW YORK,40.835269,-73.940293,2
1,0,0,0,1,0,0,0,0,0,0,Ny Center Addiction Treatment Op 1,BROOKLYN,40.671835,-73.997069,1
2,0,0,0,1,0,0,0,0,0,0,Bleuler Cd Recovery Services Op,QUEENS,40.748278,-73.891721,1
3,0,0,1,1,1,1,0,0,0,0,Bleuler Cd Recovery Services Op,QUEENS,40.72337,-73.848295,4
4,0,0,1,1,1,0,0,0,0,0,Family Life Clinic,QUEENS,40.673769,-73.775951,3


In [10]:
# Treat the placeholder strings '(null)' and 'UNKNOWN' as missing values, then
# drop rows where latitude or longitude is NaN. Written as a single chained
# expression that returns a new frame instead of mutating with inplace=True.
initial_df = (
    initial_df
    .replace(['(null)', 'UNKNOWN'], np.nan)
    .dropna(subset=['latitude', 'longitude'])
)

# Verify changes
print(f"Remaining rows after dropping NaN values in latitude/longitude: {initial_df.shape[0]}")

initial_df.head()

Remaining rows after dropping NaN values in latitude/longitude: 490


Unnamed: 0,HIV Services,Domestic Violence Services,Child Services,Youth and Adolescent Services,Adult Services,Senior Services,Substance Abuse Services,Military Services,Inpatient Services,Residential Program,name_2,city,latitude,longitude,service_count
0,1,0,0,1,0,0,0,0,0,0,Queens Opioid Treatment Clinic Otp 3,NEW YORK,40.835269,-73.940293,2
1,0,0,0,1,0,0,0,0,0,0,Ny Center Addiction Treatment Op 1,BROOKLYN,40.671835,-73.997069,1
2,0,0,0,1,0,0,0,0,0,0,Bleuler Cd Recovery Services Op,QUEENS,40.748278,-73.891721,1
3,0,0,1,1,1,1,0,0,0,0,Bleuler Cd Recovery Services Op,QUEENS,40.72337,-73.848295,4
4,0,0,1,1,1,0,0,0,0,0,Family Life Clinic,QUEENS,40.673769,-73.775951,3


In [12]:
# Write the cleaned, reduced frame to disk; the row index is not written.
cleaned_csv_path = '../data/NYC_Mental_Health_Service_Finder_Data_Cleaned_Reduced.csv'
initial_df.to_csv(cleaned_csv_path, index=False)

print("DataFrame saved to NYC_Mental_Health_Service_Finder_Data_Cleaned_Reduced.csv")

DataFrame saved to NYC_Mental_Health_Service_Finder_Data_Cleaned_Reduced.csv


In [13]:
# Import the MongoDBGeoHandler class
from mongo_handler import MongoDBGeoHandler

# Build the handler for the target database/collection; 'service_count' is
# passed as the name of the score column.
handler_settings = {
    'db_name': 'nyc_data',
    'collection_name': 'mental_health_services',
    'score_column_name': 'service_count',
}
mongo_handler = MongoDBGeoHandler(**handler_settings)

# Insert the cleaned data into MongoDB.
# NOTE(review): if insert_data appends rather than replaces, re-running this
# cell would duplicate documents - confirm against the handler implementation.
mongo_handler.insert_data(initial_df)

# Query point and search radius (5000 meters) for the count / average-score lookup.
center_lat, center_lon = 40.730610, -73.935242
radius_meters = 5000
result = mongo_handler.count_records_within_radius(center_lat, center_lon, radius_meters)

print(f"Number of records within the radius: {result['count']}")
print(f"Average service_count within the radius: {result['average_score']}")

Data has been successfully inserted into MongoDB in the 'nyc_data' database, 'mental_health_services' collection.
Total documents inserted: 490
Number of records within the radius: 85
Average service_count within the radius: 3.541176470588235
