In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt

# Location of the raw Mental Health Service Finder export, relative to this notebook.
FILE_PATH = '../data/Mental_Health_Service_Finder_Data_20240816.csv'

file_path = Path(FILE_PATH)

# Fail fast if the export is missing. Previously the "not found" branch only
# printed a message and execution continued into pd.read_csv, which then raised
# its own, less helpful error. is_file() also rejects a directory at this path.
if not file_path.is_file():
    raise FileNotFoundError(
        f"File not found: {file_path}. Please ensure the file is downloaded correctly."
    )
print(f"File found: {file_path}")

# Load the dataset (only reached when the file is present).
initial_df = pd.read_csv(file_path)

File found: ../data/Mental_Health_Service_Finder_Data_20240816.csv


In [2]:
# Glossary of indicator columns.
# NOTE(review): these meanings are the notebook author's working interpretation,
# not taken from an official data dictionary -- confirm before relying on them.
#   flag_saf  - Safe Access Flag (SAF): locations typically vetted to ensure safe
#               access for the populations they serve.
#   flag_mhf  - Mental Health Facility (MHF): facilities specifically offering
#               mental health services.
#   flag_mc   - likely "Mobile Crisis": services that include or focus on mobile
#               crisis response units.
#   flag_vet  - specialized care for veterans.
#   flag_adlt - services specifically designed for adults.
#   flag_chld - services intended for children.
#   flag_pw   - services accessible to or designed for people with disabilities
#               or specific physical needs.
#   flag_snr  - services geared towards senior citizens.

# Preview the first five rows of the freshly loaded data.
initial_df.head(n=5)

Unnamed: 0,name_1,name_2,street_1,street_2,city,zip,phone,website,latitude,longitude,...,flag_hv,flag_dv,flag_chld,flag_yad,flag_adlt,flag_snr,flag_si,filter_military,filter_inpatient_svc,filter_residential_pgm
0,Allied Service Center NYC,Queens Opioid Treatment Clinic Otp 3,2036 Amsterdam Avenue,,NEW YORK,10032,212-645-0875,,40.835269,-73.940293,...,1.0,,,1.0,,,,,,
1,Beth Israel Medical Center,Ny Center Addiction Treatment Op 1,25 12th Street,,BROOKLYN,11215,718-965-7900,,40.671835,-73.997069,...,,,,1.0,,,,,,
2,New York Center Addiction Treatment,Bleuler Cd Recovery Services Op,37-20 74Th Street,,QUEENS,11372,212-966-9537,http://nycats.net/,40.748278,-73.891721,...,,,,1.0,,,,,,
3,"Bleuler Psychotherapy Center, Inc.",Bleuler Cd Recovery Services Op,104-70 Queens Boulevard,,QUEENS,11375,718-275-6010,,40.72337,-73.848295,...,,,1.0,1.0,1.0,1.0,,,,
4,"Safe Space NYC, Inc.",Family Life Clinic,133-25 Guy R. Brewer Boulevard,,QUEENS,11434,718-206-3440,,40.673769,-73.775951,...,,,1.0,1.0,1.0,,,,,


In [3]:

# Indicator columns retained for the analysis: population/service flags
# (flag_*) and search filters (filter_*).
flag_and_filter_columns = [
    'flag_hv',
    'flag_dv',
    'flag_chld',
    'flag_yad',
    'flag_adlt',
    'flag_snr',
    'flag_si',
    'filter_military',
    'filter_inpatient_svc',
    'filter_residential_pgm'
]

# Identifying and location columns, followed by the indicator columns above.
columns_to_keep = ['name_2', 'city', 'latitude', 'longitude'] + flag_and_filter_columns

# Surface a schema mismatch with a readable message instead of a bare lookup error.
missing_columns = [col for col in columns_to_keep if col not in initial_df.columns]
if missing_columns:
    raise KeyError(f"Columns missing from the dataset: {missing_columns}")

# .copy() yields an independent frame, so the column assignments and in-place
# edits performed in later cells operate on a real DataFrame rather than on a
# selection of the original (which can trigger SettingWithCopyWarning).
initial_df = initial_df[columns_to_keep].copy()

In [4]:
import pandas as pd
import numpy as np

# Collect the two families of indicator columns by their name prefix.
flag_columns = [name for name in initial_df.columns if name.startswith('flag_')]
filter_columns = [name for name in initial_df.columns if name.startswith('filter_')]

# Both families are recoded the same way: NaN, 1.0 => 0, 1 (integer dtype).
for indicator_group in (flag_columns, filter_columns):
    recoded = initial_df[indicator_group].fillna(0)
    initial_df[indicator_group] = recoded.astype(int)

# Row-wise total of the flag_ indicators only (filter_ columns are excluded).
initial_df['specialized_care'] = initial_df.loc[:, flag_columns].sum(axis=1)

# Verify changes
initial_df.head()

Unnamed: 0,name_2,city,latitude,longitude,flag_hv,flag_dv,flag_chld,flag_yad,flag_adlt,flag_snr,flag_si,filter_military,filter_inpatient_svc,filter_residential_pgm,specialized_care
0,Queens Opioid Treatment Clinic Otp 3,NEW YORK,40.835269,-73.940293,1,0,0,1,0,0,0,0,0,0,2
1,Ny Center Addiction Treatment Op 1,BROOKLYN,40.671835,-73.997069,0,0,0,1,0,0,0,0,0,0,1
2,Bleuler Cd Recovery Services Op,QUEENS,40.748278,-73.891721,0,0,0,1,0,0,0,0,0,0,1
3,Bleuler Cd Recovery Services Op,QUEENS,40.72337,-73.848295,0,0,1,1,1,1,0,0,0,0,4
4,Family Life Clinic,QUEENS,40.673769,-73.775951,0,0,1,1,1,0,0,0,0,0,3


In [5]:
# Placeholder strings that stand in for missing values in this dataset.
MISSING_PLACEHOLDERS = ['(null)', 'UNKNOWN']

# Replace every placeholder with NaN in a single pass. Reassigning (instead of
# inplace=True) avoids mutating a frame other cells may have already displayed
# and keeps this cell safe to re-run.
initial_df = initial_df.replace(MISSING_PLACEHOLDERS, np.nan)

# A coordinate column that contained a placeholder string would have been read
# as text; coerce both columns to numeric so any remaining non-numeric value
# becomes NaN and is dropped below rather than reaching the geospatial step.
coordinate_columns = ['latitude', 'longitude']
initial_df[coordinate_columns] = initial_df[coordinate_columns].apply(
    pd.to_numeric, errors='coerce'
)

# Count the number of NaN values per column
nan_counts = initial_df.isna().sum()

# Print the NaN counts
print(nan_counts)

# Drop rows where latitude or longitude are NaN
initial_df = initial_df.dropna(subset=coordinate_columns)

# Verify changes
print(f"Remaining rows after dropping NaN values in latitude/longitude: {initial_df.shape[0]}")

name_2                    78
city                       0
latitude                   0
longitude                  2
flag_hv                    0
flag_dv                    0
flag_chld                  0
flag_yad                   0
flag_adlt                  0
flag_snr                   0
flag_si                    0
filter_military            0
filter_inpatient_svc       0
filter_residential_pgm     0
specialized_care           0
dtype: int64
Remaining rows after dropping NaN values in latitude/longitude: 490


In [6]:
# Persist the cleaned, column-reduced frame next to the raw export.
# index=False keeps the pandas row index out of the written file.
cleaned_csv_path = '../data/Mental_Health_Service_Finder_Data_Cleaned_Reduced.csv'
initial_df.to_csv(path_or_buf=cleaned_csv_path, index=False)

print("DataFrame saved to Mental_Health_Service_Finder_Data_Cleaned_Reduced.csv")

DataFrame saved to Mental_Health_Service_Finder_Data_Cleaned_Reduced.csv


In [7]:
# Project-local helper that wraps the MongoDB access used below.
from mongo_handler import MongoDBGeoHandler

# Target database / collection, named once so both calls stay in sync.
database_name, collection_name = 'mental_health_services', 'services'

# Query point and search radius.
# NOTE(review): the original note describes this as a 5000-meter radius; the
# unit and the (latitude, longitude) argument order are assumed from that note
# and the dataset's column order -- confirm against MongoDBGeoHandler.
center_latitude, center_longitude, search_radius = 40.730610, -73.935242, 5000

mongo_handler = MongoDBGeoHandler()

# Load the cleaned frame into the collection.
# NOTE(review): re-running this cell may insert the rows again unless
# insert_data replaces existing documents -- verify in mongo_handler.
mongo_handler.insert_data(initial_df, database_name, collection_name)

# Count the stored records that fall within the radius of the query point.
count = mongo_handler.count_records_within_radius(
    database_name,
    collection_name,
    center_latitude,
    center_longitude,
    search_radius,
)
print(f"Number of records within the radius: {count}")


Data has been successfully inserted into MongoDB in the 'mental_health_services' database, 'services' collection.
Total documents inserted: 490
Number of records within the radius: 85
