# Ingestion for NY Data

In [1]:
# Import system packages.
import sys, os
import urllib.parse
import datetime
import json

# Import data processing tools
import pandas as pd

# Import MongoDB related packages.
from pymongo import MongoClient
from dotenv import dotenv_values
from bson.objectid import ObjectId

In [2]:
# Load the environment variables.
# Later sources win on duplicate keys: .env.config < .env.secrets < process environment.
ENV_PATH = "./../.env"
config = {}
config.update(dotenv_values(ENV_PATH + ".config"))
config.update(dotenv_values(ENV_PATH + ".secrets"))
config.update(os.environ)

# LOOKUP_SHEETS is stored as one comma-separated string; keep it as a list.
config['LOOKUP_SHEETS'] = config['LOOKUP_SHEETS'].split(',')

In [3]:
# Make the sibling "connection" directory importable so the `database`
# import that follows can be resolved.
module_path = os.path.abspath(os.path.join('..'))
# Build the path with os.path.join so the separator is right on every OS, and
# test for the exact entry that gets appended so re-running the cell does not
# add duplicates to sys.path.
connection_path = os.path.join(module_path, "connection")
if connection_path not in sys.path:
    sys.path.append(connection_path)
from database import Database

In [4]:
# Initialize and connect to the MongoDB client and show client details.
# The call is the last expression of the cell, so its return value is rendered as the cell output.
# NOTE(review): the saved output includes the MongoDB host name; consider
# clearing cell outputs before sharing this notebook.
Database.initialize(config)

OK


MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin')

In [5]:
# Use specific database and show details.
# The call is the last expression of the cell, so its return value is rendered as the cell output.
Database.use("providers")

OK


Database(MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'providers')

## NYS Provider Ingestion

The `providers` database has the following collections:
- `ny_providers`

In [6]:
# Preview pandas.DataFrame information.
def preview(df):
    """Print the first rows, the last rows and the ``info()`` summary of ``df``.

    Args:
        df: A pandas DataFrame, or None.

    Returns:
        None. Nothing is printed when ``df`` is None.
    """
    if df is None:
        return
    print(df.head())
    print(df.tail())
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line.
    df.info()

In [7]:
# Load in dataset for the lookup table.
def load_sheet(filename, sheet, **settings):
    """Read one worksheet of an Excel workbook into a DataFrame.

    Args:
        filename: Path of the workbook; it is opened in binary mode.
        sheet: Passed to ``pandas.read_excel`` as ``sheet_name``.
        **settings: Extra keyword arguments forwarded to ``pandas.read_excel``.

    Returns:
        Whatever ``pandas.read_excel`` returns for the requested sheet.
    """
    with open(filename, "rb") as workbook:
        return pd.read_excel(workbook, sheet_name=sheet, **settings)

In [8]:
# Prepare the providers URI.
# NY_PROVIDERS has the form "<workbook file>#<sheet>,<sheet>,...".
uri = config['NY_PROVIDERS'].split('#')
filename = "./../data/" + uri[0]   # workbook path under the data directory
worksheets = uri[1].split(',')     # worksheet names, in configured order
print(filename, worksheets, sep="\n")

./../data/gov.nys.serviceproviders.xlsx
['ddso_service_providers', 'ddso_discharge_facilities', 'ofa_service_providers', 'cil_service_providers']


## Service Providers

Generally, a service provider record has the following important fields:

```yaml
facility: <str>
info:
  phone: [<XXX-XXX-XXXX>]
  fax: [<XXX-XXX-XXXX>]
  website: <subdomain.domain.com>
  person: <entity>
keywords: [<str>,...]
address:
  street:
    line_1: <str>
    line_2: <str>
  city: <city>
  county: <county>
  state: <state>
  zipcode: <zipcode>
category: 
  disability: [<str>,...]
  service: [<str>,...]
misc: 
  content: [<str>,...]
  <key>: <value>
```

In [9]:
# Wrapper class for one service provider record.
class Provider:
    """Holds one provider record as a dictionary.

    The record is a shallow copy of ``data``. Its 'misc' entry is rebuilt from
    the existing 'misc' mapping (if any) overlaid with the extra keyword
    arguments given to the constructor.
    """

    def __init__(self, data, **kwargs):
        record = {**data}
        # Keyword arguments take precedence over keys already present in 'misc'.
        existing_misc = record.get('misc', {})
        record['misc'] = {**existing_misc, **kwargs}
        self.data = record

    def to_dict(self):
        """Return a shallow copy of the stored record."""
        return dict(self.data)

In [10]:
def to_records(df, columns=None):
    """Convert ``df`` into a list of one dictionary per row.

    Args:
        df: The pandas DataFrame to convert. It is never modified.
        columns: Optional replacement column labels, one per column of ``df``.
            None or an empty sequence keeps the existing labels.

    Returns:
        The result of ``DataFrame.to_dict("records")``.

    Raises:
        ValueError: If ``columns`` has a different length than ``df``'s columns.
    """
    # An explicit None/length test also works for array-like labels, whose
    # truth value is ambiguous.
    if columns is not None and len(columns) > 0:
        # set_axis returns a relabelled frame; assigning to df.columns would
        # rename the caller's DataFrame as a side effect.
        df_view = df.set_axis(columns, axis=1)
    else:
        df_view = df

    return df_view.to_dict("records")

In [11]:
# Prepare the NY Providers table.
# Read the first configured worksheet, keeping only its first 22 columns.
ddso_df = load_sheet(
    filename,
    worksheets[0],
    index_col=None,
    usecols=list(range(22)),
)
ddso_df

Unnamed: 0,Developmental Disability Services Office,Service Provider Agency,Street Address,Street Address Line 2,City,State,Zip Code,Phone,County,Website Url,...,Family Care,Self-Direction Services,Individual Support Services (ISSs),Day Habilitation,Prevocational,Supported Employment Enrollments,Community Habilitation,Family Support Services,Developmental Centers And Special Population Services,Location 1
0,METRO NEW YORK DDSO,UNIQUE LIFE INC.,2197 NEW ENGLAND THRUWAY,,BRONX,NY,10475,347-449-7890,BRONX,http://www.uniquelifeinc.org,...,N,Y,Y,Y,N,N,Y,Y,N,
1,BROOME DDSO,BROOME COUNTY URBAN LEAGUE,43-45 CARROLL STREET,,BINGHAMTON,NY,13901,607-723-7303,BROOME,http://www.bcul.org,...,N,N,N,N,N,N,N,Y,N,"43-45 CARROLL STREET\n BINGHAMTON, NY 13901\n ..."
2,HUDSON VALLEY DDSO,OPENGATE INC.,357 MAIN STREET,,ARMONK,NY,10504,914-277-5350,WESTCHESTER,http://www.opengateinc.org,...,N,N,N,Y,N,Y,Y,Y,N,"357 MAIN STREET\n ARMONK, NY 10504\n (41.12418..."
3,METRO NEW YORK DDSO,COMMUNITY OPTIONS NEW YORK INC.,350 5TH AVENUE,SUITE 5230,NEW YORK,NY,10118,212-227-9110,NEW YORK,http://www.comop.org,...,N,N,Y,Y,Y,Y,Y,N,N,"350 5TH AVENUE\n NEW YORK, NY 10118\n (40.7480..."
4,LONG ISLAND DDSO,CENTER FOR RAPID RECOVERY,312 GREENWICH STREET,,HEMPSTEAD,NY,11550,516-292-6449,NASSAU,http://www.rapidrecovery.org,...,N,N,N,Y,N,N,Y,Y,N,"312 GREENWICH STREET\n HEMPSTEAD, NY 11550\n (..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528,BROOKLYN DDSO,UNIQUE CARE COMMUNITY SERVICES INC.,9317 AVENUE L,,BROOKLYN,NY,11236,347-502-2956,KINGS,,...,N,N,N,Y,N,N,Y,Y,N,"9317 AVENUE L\n BROOKLYN, NY 11236\n (40.63681..."
529,LONG ISLAND DDSO,SUFFOLK CO. NYSARC INC.,2900 VETERANS MEMORIAL HIGHWAY,,BOHEMIA,NY,11716,631-585-0100,SUFFOLK,http://www.ahrcsuffolk.org,...,N,N,N,Y,Y,Y,Y,Y,N,"2900 VETERANS MEMORIAL HIGHWAY\n BOHEMIA, NY 1..."
530,FINGER LAKES DDSO,ONTARIO CO. NYSARC INC.,3071 COUNTY COMPLEX DRIVE,,CANANDAIGUA,NY,14424,585-394-7500,ONTARIO,http://www.ontarioarc.org,...,Y,N,N,Y,Y,Y,Y,Y,N,"3071 COUNTY COMPLEX DRIVE\n CANANDAIGUA, NY 14..."
531,CENTRAL NEW YORK DDSO,HERITAGE FARM INC.,3599 STATE ROUTE 46,R.D.#1 P.O. BOX 143,BOUCKVILLE,NY,13310,315-893-1889,MADISON,http://heritagefarminc.org,...,N,N,N,Y,N,Y,Y,Y,N,"3599 STATE ROUTE 46\n BOUCKVILLE, NY 13310\n (..."


In [12]:
# Show only the first rows of the DDSO sheet as a quick sanity check.
ddso_df.head()

Unnamed: 0,Developmental Disability Services Office,Service Provider Agency,Street Address,Street Address Line 2,City,State,Zip Code,Phone,County,Website Url,...,Family Care,Self-Direction Services,Individual Support Services (ISSs),Day Habilitation,Prevocational,Supported Employment Enrollments,Community Habilitation,Family Support Services,Developmental Centers And Special Population Services,Location 1
0,METRO NEW YORK DDSO,UNIQUE LIFE INC.,2197 NEW ENGLAND THRUWAY,,BRONX,NY,10475,347-449-7890,BRONX,http://www.uniquelifeinc.org,...,N,Y,Y,Y,N,N,Y,Y,N,
1,BROOME DDSO,BROOME COUNTY URBAN LEAGUE,43-45 CARROLL STREET,,BINGHAMTON,NY,13901,607-723-7303,BROOME,http://www.bcul.org,...,N,N,N,N,N,N,N,Y,N,"43-45 CARROLL STREET\n BINGHAMTON, NY 13901\n ..."
2,HUDSON VALLEY DDSO,OPENGATE INC.,357 MAIN STREET,,ARMONK,NY,10504,914-277-5350,WESTCHESTER,http://www.opengateinc.org,...,N,N,N,Y,N,Y,Y,Y,N,"357 MAIN STREET\n ARMONK, NY 10504\n (41.12418..."
3,METRO NEW YORK DDSO,COMMUNITY OPTIONS NEW YORK INC.,350 5TH AVENUE,SUITE 5230,NEW YORK,NY,10118,212-227-9110,NEW YORK,http://www.comop.org,...,N,N,Y,Y,Y,Y,Y,N,N,"350 5TH AVENUE\n NEW YORK, NY 10118\n (40.7480..."
4,LONG ISLAND DDSO,CENTER FOR RAPID RECOVERY,312 GREENWICH STREET,,HEMPSTEAD,NY,11550,516-292-6449,NASSAU,http://www.rapidrecovery.org,...,N,N,N,Y,N,N,Y,Y,N,"312 GREENWICH STREET\n HEMPSTEAD, NY 11550\n (..."


In [31]:
def get_ddso_facility(df):
    """Build one facility label per row from the first two columns of ``df``.

    The two values of each row are joined with ' - ', so both must be strings.

    Returns:
        A Series with one label per row, indexed like ``df``.
    """
    separator = ' - '
    leading_pair = df.iloc[:, [0, 1]]
    return leading_pair.agg(separator.join, axis=1)
# One "<first column> - <second column>" label per row of the DDSO sheet.
facilities = get_ddso_facility(ddso_df)

In [30]:
def get_ddso_info(df):
    """Pick the contact columns out of ``df``.

    Returns:
        A dict with keys 'phone' and 'website'. Each value is a DataFrame
        holding the column labelled 'Phone' / 'Website Url'; it has no columns
        when that label is absent from ``df``.
    """
    contact_columns = {'phone': 'Phone', 'website': 'Website Url'}
    return {
        key: df.loc[:, df.columns.isin([label])]
        for key, label in contact_columns.items()
    }
# Dict of single-column frames ('phone', 'website') taken from the DDSO sheet.
infos = get_ddso_info(ddso_df)

In [40]:
def get_ddso_address(df):
    """Pick the address columns out of ``df``.

    Returns:
        A dict with keys 'street', 'city', 'county', 'state' and 'zipcode'.
        'street' is itself a dict with keys 'line1', 'line2' and 'line3'.
        Every leaf value is a DataFrame holding the matching column(s) of
        ``df``; it has no columns when no matching label is present.
    """
    def pick(*labels):
        # Boolean column mask: labels missing from df give an empty frame, not a KeyError.
        return df.loc[:, df.columns.isin(list(labels))]

    street = {
        # The sheet labels the first street line 'Street Address'; the old
        # 'Street Address Line 1' label is still accepted. Matching only the
        # latter left line1 empty for this sheet.
        'line1': pick('Street Address', 'Street Address Line 1'),
        'line2': pick('Street Address Line 2'),
        'line3': pick('Location 1'),
    }
    return {
        'street': street,
        'city': pick('City'),
        'county': pick('County'),
        'state': pick('State'),
        'zipcode': pick('Zip Code'),
    }
# Nested dict of column frames (street lines, city, county, state, zipcode) for the DDSO sheet.
addresses = get_ddso_address(ddso_df)

## Categories

`disability_category`:
- Ambulatory
- Hearing
- Cognitive
- Vision
- Independent Living
- Self-care
- Other

`service_category`:
- Communication Services
- Advocacy Services
- Job Development and Placement
- Information and Retrieval
- Counseling
- Independent Living Skill Instruction
- Community Education

In [28]:
def get_ddso_keywords(df):
    """Build one lower-cased keyword string per row from the Y/N service columns.

    For each service column, 'Y' contributes that service's phrase, while 'N'
    or any other value contributes an empty string. The eleven per-row
    entries are joined with single spaces, so the result can contain leading,
    trailing and repeated spaces where services are absent.

    Returns:
        A Series of strings indexed like ``df``.

    Raises:
        KeyError: If one of the service columns is missing from ``df``.
    """
    # (short column alias, source column in the sheet, phrase emitted for 'Y')
    service_flags = [
        ('ICF', 'Intermediate Care Facilities (ICFs)', "Intermediate Care Facilities ICF"),
        ('IRA', 'Individual Residential Alternative (IRA)', "Individual Residential Alternative IRA"),
        ('FC', 'Family Care', "Family Care"),
        ('SDS', 'Self-Direction Services', "Self Direction Services"),
        ('ISS', 'Individual Support Services (ISSs)', "Individual Support Services ISS"),
        ('DH', 'Day Habilitation', "Day Habilitation"),
        ('PV', 'Prevocational', "Prevocational"),
        ('SEE', 'Supported Employment Enrollments', "Supported Employment Enrollments"),
        ('CH', 'Community Habilitation', "Community Habilitation"),
        ('FSS', 'Family Support Services', "Family Support Services"),
        ('DCSPS', 'Developmental Centers And Special Population Services', "Developmental Centers and Special Population Services"),
    ]

    def join_row(row):
        # Normalise every entry of the row, then glue them with single spaces.
        return ' '.join(row.str.lower().str.strip())

    phrase_columns = [
        df[source].map({'Y': phrase, 'N': None})
        for _, source, phrase in service_flags
    ]
    keyword_df = pd.concat(phrase_columns, axis=1)
    keyword_df.columns = [alias for alias, _, _ in service_flags]

    return keyword_df.fillna('').astype(str).apply(join_row, axis=1)
# One space-separated, lower-cased keyword string per row of the DDSO sheet.
keywords = get_ddso_keywords(ddso_df)

In [17]:
# Lower-case search terms for the two category groups listed in the
# "Categories" section: index 0 holds disability terms, index 1 service terms.
categories = [
    ['ambulatory', 'hearing', 'deaf', 'hoh', 'hard of hearing', 'cognitive', 'vision', 'independent living', 'self-care', 'other'],
    # 'retrieval' was misspelled 'retireval', which could never match the service name.
    ['communication', 'advocacy', 'job', 'development', 'placement', 'information', 'retrieval', 'counseling', 'independent', 'living', 'skill', 'instruction', 'community','education']
]

In [66]:
def get_at(df, i):
    """Return the item at integer position ``i`` of a pandas object."""
    return df.iloc[i]


def _text_at(frame, i):
    """Return the first-column cell of row ``i`` as text, or None when it is missing.

    Uses purely positional ``iloc[i, 0]`` access: integer ``[0]`` lookups on a
    label-indexed row Series rely on a deprecated positional fallback. Missing
    cells become None instead of the literal text 'nan'.
    """
    value = frame.iloc[i, 0]
    return None if pd.isna(value) else str(value)


# One document per row of the DDSO sheet.
payload = [{
    'facility': get_at(facilities, i),
    'info': {
        'phone': _text_at(infos.get('phone'), i),
        'website': _text_at(infos.get('website'), i),
    },
    'address': {
        # Street lines are not part of the payload yet.
        # 'street': {
             # 'line1': addresses.get('street').get('line1').iloc[i].astype(str)[0],
             # 'line2': addresses.get('street').get('line2').iloc[i].astype(str)[0],
             # 'line3': addresses.get('street').get('line3').iloc[i].astype(str)[0],
        # },
        'city': addresses.get('city').iloc[i, 0],
        'county': addresses.get('county').iloc[i, 0],
        'state': addresses.get('state').iloc[i, 0],
        'zipcode': int(addresses.get('zipcode').iloc[i, 0])
    },
    # Every whitespace-separated word of the keyword string becomes one comma-separated token.
    'keywords': ",".join(get_at(keywords, i).split()),
    # 'category': {},
    # 'misc': {}
 } for i in range(ddso_df.shape[0])]

# Show a small sample rather than dumping every record into the cell output.
payload[:3]

[{'facility': 'METRO NEW YORK DDSO - UNIQUE LIFE INC.',
  'info': {'phone': '347-449-7890', 'website': 'http://www.uniquelifeinc.org'},
  'address': {'city': 'BRONX',
   'county': 'BRONX',
   'state': 'NY',
   'zipcode': 10475},
  'keywords': 'self,direction,services,individual,support,services,iss,day,habilitation,community,habilitation,family,support,services'},
 {'facility': 'BROOME DDSO - BROOME COUNTY URBAN LEAGUE',
  'info': {'phone': '607-723-7303', 'website': 'http://www.bcul.org'},
  'address': {'city': 'BINGHAMTON',
   'county': 'BROOME',
   'state': 'NY',
   'zipcode': 13901},
  'keywords': 'family,support,services'},
 {'facility': 'HUDSON VALLEY DDSO - OPENGATE INC.',
  'info': {'phone': '914-277-5350', 'website': 'http://www.opengateinc.org'},
  'address': {'city': 'ARMONK',
   'county': 'WESTCHESTER',
   'state': 'NY',
   'zipcode': 10504},
  'keywords': 'intermediate,care,facilities,icf,individual,residential,alternative,ira,day,habilitation,supported,employment,enrollme

In [62]:
# Use specific database and show details.
Database.use("providers")

OK


Database(MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'providers')

In [68]:
# Insert many.
# Database.insert_many("services", payload)