# Pipeline: DDSO Service Providers

This notebook accomplishes the following:

1. Load the data from the `gov.nys.serviceproviders.xlsx#ddso_service_providers` worksheet into memory using `pandas`.

2. Tidy the fields to prepare them for the schema described in the [`settings.toml`](./../../settings.toml) file.

3. Ingest the data to the `providers` database, underneath the `services` collection.


In [1]:
# Import system path.
import sys, os
import pandas as pd
import numpy as np

# Add module path to the kernel PATH if it isn't currently added.
def add_module_path(module_path):
    """Append ``module_path`` to ``sys.path`` unless it is already listed there."""
    already_listed = module_path in sys.path
    if already_listed:
        return
    sys.path.append(module_path)
                
# Add directories to MODULE_PATH.
# The paths are built with os.path so the notebook is not tied to Windows
# separators: a literal '..\\..\\' is a single (non-existent) file name on POSIX.
root = os.path.abspath(os.path.join(os.pardir, os.pardir))
add_module_path(root)
for package_dir in ("data", "utils", "models", "connection"):
    add_module_path(os.path.join(root, package_dir))

In [2]:
# Import utilities.
from bunch import Bunch, bunchify, unbunchify
from box import Box
from utils.finder import *
from pandas import DataFrame, Series
from matplotlib import pyplot as plt
from IPython.display import display
from typing import *

# Import configuration settings.
from config import settings

In [3]:
# Helper methods for extracting named child arrays from Box object parents:

def extract(obj, key, keyMapper = lambda x: x[0].name, valueMapper = lambda x: x[0]):
    """Turn the sequence stored at ``obj[key]`` into a Box keyed by ``keyMapper``.

    Both mappers receive an ``(item, index)`` tuple. By default the key is the
    item's ``name`` attribute and the value is the item itself.
    """
    mapped = {}
    for position, entry in enumerate(obj[key]):
        pair = (entry, position)
        mapped[keyMapper(pair)] = valueMapper(pair)
    return Box(mapped)

def get_databases(root):
    """Map every database listed under ``root['databases']`` to its collections."""
    def collections_of(pair):
        database, _ = pair
        return get_collection(database)

    return extract(root, 'databases', valueMapper=collections_of)

def get_collection(db):
    """Map every collection of ``db`` to a Box holding its ``schema`` and ``sources``."""
    def describe(pair):
        collection = pair[0]
        return Box(schema=get_schema(collection), sources=get_sources(collection))

    return extract(db, 'collections', valueMapper=describe)

def get_sources(collection):
    """Describe the sources of ``collection``, keyed by their position in the list.

    Each description carries ``path`` (the source's ``name``), ``type`` and,
    for Excel sources, the ``worksheets`` entry.
    """
    def by_position(pair):
        return pair[1]

    def describe(pair):
        src = pair[0]
        return { 'path': src.name, 'type': src.type, **get_worksheets(src) }

    return extract(collection, 'sources', keyMapper=by_position, valueMapper=describe)

def get_worksheets(source):
    """Return the worksheet listing of an Excel source, or an empty dict otherwise."""
    is_excel = source['type'] == "excel"
    return Box(worksheets=source.worksheets) if is_excel else {}

def get_schema(collection):
    """Index the schema entries of ``collection`` by their ``field`` name."""
    def field_name(pair):
        return pair[0]['field']

    return extract(collection, 'schema', keyMapper=field_name)

def get_fields(schema):
    """Return the field names of ``schema``.

    ``schema`` may be an iterable of entries exposing a ``field`` attribute, or
    a mapping keyed by field name; in the latter case the keys are returned.
    """
    try:
        return [entry.field for entry in schema]
    except (AttributeError, TypeError):
        # Entries without a ``field`` attribute (e.g. the string keys of a
        # mapping) or a non-iterable input: fall back to the mapping keys.
        # Only these errors are caught so interrupts are not swallowed.
        return [*schema.keys()]

In [4]:
def format_source(source, idx = None):
    """Render one source as text: an optional ``[idx]:`` prefix, the quoted path
    and, when the source has them, its worksheets on a second line.
    """
    idx = '' if idx is None else f'[{str(idx)}]:'
    try:
        return f"{idx}\"{source.path}\"\n{source.worksheets}\n"
    except (AttributeError, KeyError):
        # Source without worksheets. Only the missing-attribute case is handled
        # here; interrupts and unrelated failures are no longer swallowed.
        return f"{idx}\"{source.path}\"\n"
    
def format_sources(sources):
    """Render every source of the mapping, prefixed by its key, joined with commas."""
    rendered = []
    for key, source in sources.items():
        rendered.append(format_source(source, key))
    return ','.join(rendered)

## Read in the Data

Load the data from the `gov.nys.serviceproviders.xlsx#ddso_service_providers` worksheet into memory using `pandas`.

1. Grab the `database`, `collection`, and `source` information from the `settings.toml` file. This is already loaded in via the settings object from the `from config import settings` line.

2. Load the data from the file into a `pandas.DataFrame` object.


In [5]:
# Read in the metadata.
# Pick the `providers` database out of the databases declared in the settings.
db = get_databases(settings.db).providers
# Each collection entry exposes its `schema` and its `sources`.
services = db.services
sources = services.sources
# Field names declared in the collection schema.
fields = get_fields(services.schema)
# Last expression of the cell: display both for inspection.
fields, sources

(['_id',
  'facility',
  'keywords',
  'category.disability',
  'category.service',
  'info.phone',
  'info.fax',
  'info.website.url',
  'info.website.subdomain',
  'info.website.hostname',
  'info.website.domain',
  'info.addressee',
  'address.location',
  'address.street.line1',
  'address.street.line2',
  'address.coordinates.latitude',
  'address.coordinates.longitude',
  'address.city',
  'address.county',
  'address.state',
  'address.zipcode'],
 <Box: {0: {'path': 'gov.ca.serviceproviders.xlsx', 'type': 'excel', 'worksheets': [{'name': 'dss_service_providers', 'fields': 'A:N'}]}, 1: {'path': 'gov.ca.serviceproviders.xlsx', 'type': 'excel', 'worksheets': [{'name': 'dss_service_providers', 'fields': 'A:N'}]}, 2: {'path': 'gov.nys.serviceproviders.xlsx', 'type': 'excel', 'worksheets': [{'name': 'ddso_service_providers', 'fields': 'A:V'}, {'name': 'ddso_discharge_facilities', 'fields': 'A:F'}, {'name': 'ofa_service_providers', 'fields': 'A:J'}]}, 3: {'path': 'org.handson.ohio.serv

In [6]:
# Get the source for the ddso_service_providers.
# Print every configured source with its positional key so the right one can be picked by index.
print(format_sources(sources))

[0]:"gov.ca.serviceproviders.xlsx"
[{'name': 'dss_service_providers', 'fields': 'A:N'}]
,[1]:"gov.ca.serviceproviders.xlsx"
[{'name': 'dss_service_providers', 'fields': 'A:N'}]
,[2]:"gov.nys.serviceproviders.xlsx"
[{'name': 'ddso_service_providers', 'fields': 'A:V'}, {'name': 'ddso_discharge_facilities', 'fields': 'A:F'}, {'name': 'ofa_service_providers', 'fields': 'A:J'}]
,[3]:"org.handson.ohio.serviceproviders.json"
,[4]:"irlu-CA-results.html"
,[5]:"irlu-NY-results.html"
,[6]:"irlu-OH-results.html"



In [7]:
# Load excel file based on input source information, into a dataframe
def load_excel(source, worksheet, dirpath='', names=None, **kwargs):
    """Read one worksheet of an Excel source into a DataFrame.

    Parameters
    ----------
    source : object whose ``path`` is the workbook file name.
    worksheet : object whose ``name`` is the sheet name and ``fields`` the
        column range passed to ``usecols``.
    dirpath : directory containing the workbook (with or without a trailing
        separator); empty means the current directory.
    names : optional replacement column labels.
    **kwargs : forwarded to ``pandas.read_excel``.

    Returns
    -------
    (DataFrame, list) -- the loaded frame and the original column names.
    """
    # os.path.join tolerates a dirpath with or without a trailing separator;
    # plain concatenation produced a wrong file name without one.
    filepath = os.path.join(dirpath, source.path)
    df = pd.read_excel(filepath, sheet_name=worksheet.name, usecols=worksheet.fields, **kwargs)
    # Capture the header before it is (optionally) replaced.
    oldnames = list(df.columns.values)
    if names:
        df.columns = names
    return df, oldnames

In [8]:
# Source 2 is the NYS workbook in the listing printed above; its first
# worksheet is `ddso_service_providers`.
source = sources[2]
worksheet = source.worksheets[0]
source.path, worksheet

('gov.nys.serviceproviders.xlsx',
 <Box: {'name': 'ddso_service_providers', 'fields': 'A:V'}>)

In [9]:
# Load the worksheet with positional labels 0..21 in place of the header;
# `columns` receives the original header names returned by load_excel.
df, columns = load_excel(source, worksheet, dirpath="./../../" + settings.datadir, names=list(range(22)))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,METRO NEW YORK DDSO,UNIQUE LIFE INC.,2197 NEW ENGLAND THRUWAY,,BRONX,NY,10475,347-449-7890,BRONX,http://www.uniquelifeinc.org,...,N,Y,Y,Y,N,N,Y,Y,N,
1,BROOME DDSO,BROOME COUNTY URBAN LEAGUE,43-45 CARROLL STREET,,BINGHAMTON,NY,13901,607-723-7303,BROOME,http://www.bcul.org,...,N,N,N,N,N,N,N,Y,N,"43-45 CARROLL STREET\n BINGHAMTON, NY 13901\n ..."
2,HUDSON VALLEY DDSO,OPENGATE INC.,357 MAIN STREET,,ARMONK,NY,10504,914-277-5350,WESTCHESTER,http://www.opengateinc.org,...,N,N,N,Y,N,Y,Y,Y,N,"357 MAIN STREET\n ARMONK, NY 10504\n (41.12418..."
3,METRO NEW YORK DDSO,COMMUNITY OPTIONS NEW YORK INC.,350 5TH AVENUE,SUITE 5230,NEW YORK,NY,10118,212-227-9110,NEW YORK,http://www.comop.org,...,N,N,Y,Y,Y,Y,Y,N,N,"350 5TH AVENUE\n NEW YORK, NY 10118\n (40.7480..."
4,LONG ISLAND DDSO,CENTER FOR RAPID RECOVERY,312 GREENWICH STREET,,HEMPSTEAD,NY,11550,516-292-6449,NASSAU,http://www.rapidrecovery.org,...,N,N,N,Y,N,N,Y,Y,N,"312 GREENWICH STREET\n HEMPSTEAD, NY 11550\n (..."


In [10]:
# Pair each original header with its positional id (the label it has on `df`).
# The guard keeps the cell idempotent: this cell rebinds `columns` from itself,
# so re-running it would otherwise wrap the already-built entries a second time.
if columns and not isinstance(columns[0], dict):
    columns = [{'id': position, 'name': header} for position, header in enumerate(columns)]
columns

[{'id': 0, 'name': 'Developmental Disability Services Office'},
 {'id': 1, 'name': 'Service Provider Agency'},
 {'id': 2, 'name': 'Street Address'},
 {'id': 3, 'name': 'Street Address Line 2'},
 {'id': 4, 'name': 'City'},
 {'id': 5, 'name': 'State'},
 {'id': 6, 'name': 'Zip Code'},
 {'id': 7, 'name': 'Phone'},
 {'id': 8, 'name': 'County'},
 {'id': 9, 'name': 'Website Url'},
 {'id': 10, 'name': 'Intermediate Care Facilities'},
 {'id': 11, 'name': 'Individual Residential Alternative'},
 {'id': 12, 'name': 'Family Care'},
 {'id': 13, 'name': 'Self-Direction Services'},
 {'id': 14, 'name': 'Individual Support Services'},
 {'id': 15, 'name': 'Day Habilitation'},
 {'id': 16, 'name': 'Prevocational'},
 {'id': 17, 'name': 'Supported Employment Enrollments'},
 {'id': 18, 'name': 'Community Habilitation'},
 {'id': 19, 'name': 'Family Support Services'},
 {'id': 20, 'name': 'Developmental Centers And Special Population Services'},
 {'id': 21, 'name': 'Location 1'}]

## Tidy the Data

This is where we wrangle the data. We need to:

1. Impute missing data.
2. Extract additional features.
3. Reduce into final schema format.

In [11]:
# Inspect dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 533 entries, 0 to 532
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       533 non-null    object
 1   1       533 non-null    object
 2   2       533 non-null    object
 3   3       153 non-null    object
 4   4       533 non-null    object
 5   5       533 non-null    object
 6   6       533 non-null    int64 
 7   7       526 non-null    object
 8   8       533 non-null    object
 9   9       442 non-null    object
 10  10      533 non-null    object
 11  11      533 non-null    object
 12  12      533 non-null    object
 13  13      533 non-null    object
 14  14      533 non-null    object
 15  15      533 non-null    object
 16  16      533 non-null    object
 17  17      533 non-null    object
 18  18      533 non-null    object
 19  19      533 non-null    object
 20  20      533 non-null    object
 21  21      497 non-null    object
dtypes: int64(1), object(21)
me

### Imputation

Examination of the nulls suggests:

- Column 3  `Street Address Line 2`, impute with empty `str`.
- Column 7  `Phone`, impute with `"Unknown"`.
- Column 9  `Website Url`, impute with empty `str`.
- Column 21 `Location 1`, impute with empty `str`.

In [12]:
# NOTE: this self-assignment is a no-op. No copy is made, so `df` is still the
# frame loaded earlier and any column assignment on it modifies that frame.
df = df

def fill_empty(column, value):
    """Replace missing values in the column at position ``column`` of the
    notebook-level ``df`` with ``value``; returns ``df``.
    """
    label = df.columns[column]
    df[label] = df[label].fillna(value)
    return df

# Impute missing values.
fill_empty(3, '')           # Street Address Line 2
fill_empty(7, "Unknown")    # Phone
fill_empty(9, '')           # Website Url
# Location 1; fill_empty returns df, so this last call also displays the frame.
fill_empty(21, '')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,METRO NEW YORK DDSO,UNIQUE LIFE INC.,2197 NEW ENGLAND THRUWAY,,BRONX,NY,10475,347-449-7890,BRONX,http://www.uniquelifeinc.org,...,N,Y,Y,Y,N,N,Y,Y,N,
1,BROOME DDSO,BROOME COUNTY URBAN LEAGUE,43-45 CARROLL STREET,,BINGHAMTON,NY,13901,607-723-7303,BROOME,http://www.bcul.org,...,N,N,N,N,N,N,N,Y,N,"43-45 CARROLL STREET\n BINGHAMTON, NY 13901\n ..."
2,HUDSON VALLEY DDSO,OPENGATE INC.,357 MAIN STREET,,ARMONK,NY,10504,914-277-5350,WESTCHESTER,http://www.opengateinc.org,...,N,N,N,Y,N,Y,Y,Y,N,"357 MAIN STREET\n ARMONK, NY 10504\n (41.12418..."
3,METRO NEW YORK DDSO,COMMUNITY OPTIONS NEW YORK INC.,350 5TH AVENUE,SUITE 5230,NEW YORK,NY,10118,212-227-9110,NEW YORK,http://www.comop.org,...,N,N,Y,Y,Y,Y,Y,N,N,"350 5TH AVENUE\n NEW YORK, NY 10118\n (40.7480..."
4,LONG ISLAND DDSO,CENTER FOR RAPID RECOVERY,312 GREENWICH STREET,,HEMPSTEAD,NY,11550,516-292-6449,NASSAU,http://www.rapidrecovery.org,...,N,N,N,Y,N,N,Y,Y,N,"312 GREENWICH STREET\n HEMPSTEAD, NY 11550\n (..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528,BROOKLYN DDSO,UNIQUE CARE COMMUNITY SERVICES INC.,9317 AVENUE L,,BROOKLYN,NY,11236,347-502-2956,KINGS,,...,N,N,N,Y,N,N,Y,Y,N,"9317 AVENUE L\n BROOKLYN, NY 11236\n (40.63681..."
529,LONG ISLAND DDSO,SUFFOLK CO. NYSARC INC.,2900 VETERANS MEMORIAL HIGHWAY,,BOHEMIA,NY,11716,631-585-0100,SUFFOLK,http://www.ahrcsuffolk.org,...,N,N,N,Y,Y,Y,Y,Y,N,"2900 VETERANS MEMORIAL HIGHWAY\n BOHEMIA, NY 1..."
530,FINGER LAKES DDSO,ONTARIO CO. NYSARC INC.,3071 COUNTY COMPLEX DRIVE,,CANANDAIGUA,NY,14424,585-394-7500,ONTARIO,http://www.ontarioarc.org,...,Y,N,N,Y,Y,Y,Y,Y,N,"3071 COUNTY COMPLEX DRIVE\n CANANDAIGUA, NY 14..."
531,CENTRAL NEW YORK DDSO,HERITAGE FARM INC.,3599 STATE ROUTE 46,R.D.#1 P.O. BOX 143,BOUCKVILLE,NY,13310,315-893-1889,MADISON,http://heritagefarminc.org,...,N,N,N,Y,N,Y,Y,Y,N,"3599 STATE ROUTE 46\n BOUCKVILLE, NY 13310\n (..."


### Feature Extraction

Extracting features is the next step.

- Extract subdomain from the website field.
- Extract hostname from the website field.
- Extract the top level domain from the website field.
- Map columns `[10:21]` to an appropriate keyword based on the column.
- Combine columns `[0,1]`, and remove duplicate words.
- Extract the coordinates from the location field (if present).
- Split coordinates into 'longitude' and 'latitude' features.


In [13]:
# Map function.
def map(df, selector, mapper, default=None, exhaustive=False):
    """Map the values of ``df[selector]`` through ``mapper`` and fill the gaps.

    NOTE: this name shadows the built-in ``map`` for the rest of the notebook.

    Parameters
    ----------
    df : DataFrame holding the values to translate.
    selector : column label used to select the values from ``df``.
    mapper : dict or callable handed to ``Series.map``.
    default : replacement for values ``mapper`` does not cover. Any value other
        than ``None`` is honoured, including falsy ones such as ``''`` or ``0``.
    exhaustive : when true and no ``default`` is given, uncovered values are
        left missing; when false they keep their original value.

    Returns
    -------
    The mapped Series.
    """
    mapped = df[selector].map(mapper)
    if default is not None:
        return mapped.fillna(default)
    if exhaustive:
        # ``fillna(None)`` raises, so uncovered values are returned as missing.
        return mapped
    return mapped.fillna(df[selector])

In [14]:
# Import tldextract (from: https://github.com/john-kurkowski/tldextract)
from tldextract import extract as ex

def urlparse(url):
    """Split ``url`` into its name components with tldextract.

    Returns a Box with:
      url       -- the non-empty parts joined with '.'
      subdomain -- tldextract's ``subdomain`` part
      hostname  -- tldextract's ``domain`` part
      domain    -- tldextract's ``suffix`` part
    """
    results = ex(url)
    # Read the parts by attribute instead of iterating/indexing the result:
    # that does not depend on the result being a three-element tuple.
    # NOTE(review): newer tldextract releases reportedly stopped exposing the
    # result as a tuple -- confirm against the installed version.
    parts = (results.subdomain, results.domain, results.suffix)
    return Box(url='.'.join(part for part in parts if part), subdomain=parts[0], hostname=parts[1], domain=parts[2])

# Get the url.
urls = df[df.columns[9]]
# Parse every URL once and reuse the result for all four derived columns
# (the previous version re-parsed each URL for every column).
parsed_urls = [urlparse(url) for url in urls]
df['info.website.url'] = [parsed.url for parsed in parsed_urls]
df['info.website.subdomain'] = [parsed.subdomain for parsed in parsed_urls]
df['info.website.hostname'] = [parsed.hostname for parsed in parsed_urls]
df['info.website.domain'] = [parsed.domain for parsed in parsed_urls]
df[df.columns[22:]]

Unnamed: 0,info.website.url,info.website.subdomain,info.website.hostname,info.website.domain
0,www.uniquelifeinc.org,www,uniquelifeinc,org
1,www.bcul.org,www,bcul,org
2,www.opengateinc.org,www,opengateinc,org
3,www.comop.org,www,comop,org
4,www.rapidrecovery.org,www,rapidrecovery,org
...,...,...,...,...
528,,,,
529,www.ahrcsuffolk.org,www,ahrcsuffolk,org
530,www.ontarioarc.org,www,ontarioarc,org
531,heritagefarminc.org,,heritagefarminc,org


In [15]:
# Map keyword columns into a single keyword column.
import re    # For regex substitution.

def normalize(x):
    """Upper-case a non-empty string and strip its brackets.

    A trailing plural marker written as ``s)`` loses the ``s`` first, then the
    text is upper-cased and every ``(``, ``)``, ``{`` and ``}`` is removed.
    Anything that is empty or not a string is returned unchanged.
    """
    if not x or not isinstance(x, (str,)):
        return x
    singular = re.sub('s[)]', ')', x)
    return re.sub('[({})]', '', singular.upper())

def get_keyword(keywords, index):
    """Look up the entry whose ``id`` equals ``index`` and return its normalized
    ``name``; a falsy lookup result is returned as-is.
    """
    match = find_one(keywords, lambda entry: entry['id'] == index)
    return normalize(match['name']) if match else match

    
def map_keywords(keywords):
    """Translate the Y/N flag columns of the notebook-level ``df`` into keywords.

    Covers every column position from the first to the last ``id`` in
    ``keywords``. Returns a list of ``(position, mapped Series)`` pairs where
    'Y' became the column's keyword and 'N' an empty string.
    """
    first_id = keywords[0]['id']
    last_id = keywords[len(keywords) - 1]['id']
    return [
        (position, map(df, df.columns[position], mapper={'Y': get_keyword(keywords, position), 'N': ''}))
        for position in range(first_id, last_id + 1)
    ]

# Select the keyword columns we're processing.
keywords = columns[10:21]
display(keywords)

# Map the keys.
# Each entry is (column position, mapped Series); write the Series back in place.
for entry in map_keywords(keywords):
    df[df.columns[entry[0]]] = entry[1]
# NOTE: not the last expression of the cell, so this frame is not displayed.
df[df.columns[10:21]]

# Combine keywords into one 'keywords' field
def reduce_keywords(row):
    """Join the distinct, non-empty values of ``row`` in sorted order with commas."""
    distinct = {word for word in row.values if word > ''}
    return ','.join(sorted(distinct))

# Replace the keywords.
# Collapse the eleven mapped flag columns (positions 10..20) of each row into
# one comma-separated `keywords` column.
df['keywords'] = df[df.columns[10:21]].astype(str).apply(lambda row: reduce_keywords(row), axis=1)
df[df.columns[22:]].head()

[{'id': 10, 'name': 'Intermediate Care Facilities'},
 {'id': 11, 'name': 'Individual Residential Alternative'},
 {'id': 12, 'name': 'Family Care'},
 {'id': 13, 'name': 'Self-Direction Services'},
 {'id': 14, 'name': 'Individual Support Services'},
 {'id': 15, 'name': 'Day Habilitation'},
 {'id': 16, 'name': 'Prevocational'},
 {'id': 17, 'name': 'Supported Employment Enrollments'},
 {'id': 18, 'name': 'Community Habilitation'},
 {'id': 19, 'name': 'Family Support Services'},
 {'id': 20, 'name': 'Developmental Centers And Special Population Services'}]

Unnamed: 0,info.website.url,info.website.subdomain,info.website.hostname,info.website.domain,keywords
0,www.uniquelifeinc.org,www,uniquelifeinc,org,"COMMUNITY HABILITATION,DAY HABILITATION,FAMILY..."
1,www.bcul.org,www,bcul,org,FAMILY SUPPORT SERVICES
2,www.opengateinc.org,www,opengateinc,org,"COMMUNITY HABILITATION,DAY HABILITATION,FAMILY..."
3,www.comop.org,www,comop,org,"COMMUNITY HABILITATION,DAY HABILITATION,INDIVI..."
4,www.rapidrecovery.org,www,rapidrecovery,org,"COMMUNITY HABILITATION,DAY HABILITATION,FAMILY..."


In [16]:
# Keep order while removing duplicates.
def sequentialset(li):
    """Return the items of ``li`` with duplicates dropped, keeping first-seen order."""
    seen = set()
    ordered = []
    for item in li:
        if item in seen:
            continue
        seen.add(item)
        ordered.append(item)
    return ordered

# Combine facility name components into one facility field
def make_facility(row):
    """Join the values of ``row`` with ' - ' and drop repeated words.

    Words are compared across the whole joined text, so a word from the second
    value that already appears in the first is removed.
    """
    joined = ' - '.join(list(row.values))
    return ' '.join(sequentialset(joined.split()))

# Combine columns 0 and 1.
# Column 0 is the DDSO office and column 1 the provider agency (see the header list above).
orgs = df[df.columns[0:2]]
orgs = orgs.astype(str).apply(lambda row: make_facility(row), axis=1)
df['facility'] = orgs
# Positional slice: shows the columns from position 26 onward.
df[df.columns[26:]].head()

Unnamed: 0,keywords,facility
0,"COMMUNITY HABILITATION,DAY HABILITATION,FAMILY...",METRO NEW YORK DDSO - UNIQUE LIFE INC.
1,FAMILY SUPPORT SERVICES,BROOME DDSO - COUNTY URBAN LEAGUE
2,"COMMUNITY HABILITATION,DAY HABILITATION,FAMILY...",HUDSON VALLEY DDSO - OPENGATE INC.
3,"COMMUNITY HABILITATION,DAY HABILITATION,INDIVI...",METRO NEW YORK DDSO - COMMUNITY OPTIONS INC.
4,"COMMUNITY HABILITATION,DAY HABILITATION,FAMILY...",LONG ISLAND DDSO - CENTER FOR RAPID RECOVERY


In [17]:
# Extract the location newlines.
def extract_components(row):
    """Split ``row`` into its non-empty lines, each stripped of surrounding whitespace."""
    return [line.strip() for line in row.splitlines() if line]

# Extract the location information.
def extract_location(loc):
    """Split a three-line location text into its address parts.

    Returns ``(street, city, state, zipcode, coordinates)``, or ``None`` when
    ``loc`` has no non-empty lines. The text is expected to hold exactly three
    lines: street, "CITY, STATE ZIP" and the coordinates.
    """
    components = extract_components(loc)
    if not components:
        return None
    street, city_state_zipcode, coordinates = components
    # The zip code is the last space-separated token; the state follows the last comma.
    city_state, zipcode = city_state_zipcode.rsplit(' ', 1)
    city, state = city_state.strip().rsplit(',', 1)
    return (street, city, state.strip(), zipcode, coordinates)

# Extract the latitude.
def extract_latitude(coords):
    """Parse the latitude out of a ``"lat, lon"`` string.

    Returns a Box with a single ``latitude`` key holding the parsed float, or
    NaN when ``coords`` cannot be split into exactly two comma-separated parts
    or the first part is not a number.
    """
    try:
        lat, _ = coords.split(',')
        lat = float(lat.strip())
    except (AttributeError, TypeError, ValueError):
        # Previously NaN was assigned to an unrelated name here, which left
        # ``lat`` as None (or as the unparsed text) whenever parsing failed.
        lat = float("nan")
    return Box(latitude=lat)

# Extract the longitude.
def extract_longitude(coords):
    """Parse the longitude out of a ``"lat, lon"`` string.

    Returns a Box with a single ``longitude`` key holding the parsed float, or
    NaN when ``coords`` cannot be split into exactly two comma-separated parts
    or the second part is not a number.
    """
    try:
        _, lon = coords.split(',')
        lon = float(lon.strip())
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures map to NaN; a bare ``except`` would also hide
        # interrupts and unrelated errors.
        lon = float("nan")
    return Box(longitude=lon)

# Extract the coordinates.
def extract_coordinates(loc):
    """Return a Box with ``latitude`` and ``longitude`` parsed from a location text.

    An empty Box is returned when the location has no coordinates component
    (for example when ``extract_location`` yields ``None``).
    """
    components = extract_location(loc)
    try:
        # Drop the brackets around "(lat, lon)" before parsing each half.
        bare = re.sub('[({})]', '', components[4])
        return Box(**extract_latitude(bare), **extract_longitude(bare))
    except:
        return Box()

# Column 21 is the free-text `Location 1` field (see the header list above).
location = df[df.columns[21]]

# Map latitudes and longitudes.
# Rows without parsable coordinates yield NaN through the `.get` defaults.
latitudes = location.apply(lambda row: extract_coordinates(row).get('latitude', float('nan')))
longitudes = location.apply(lambda row: extract_coordinates(row).get('longitude', float("nan")))
# Preview only: the concatenated text is displayed, not stored.
latitudes.astype(str) + ', ' + longitudes.astype(str)

0                   nan, nan
1      42.096971, -75.906545
2      41.124182, -73.714492
3      40.748082, -73.984819
4      40.699263, -73.613421
               ...          
528    40.636811, -73.898189
529    40.788805, -73.145202
530    42.884515, -77.213507
531    42.899615, -75.580535
532    40.727853, -73.707291
Name: 21, Length: 533, dtype: object

In [18]:
# Import the geocoder service wrapper.  (https://geopy.readthedocs.io/en/stable/#accessing-geocoders)
from tqdm.notebook import trange, tqdm
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.location import Location
from geopy.point import Point
from functools import partial

# Get the application name.
def application_name():
    """Build the ``<app>@<version>`` identifier from the settings.

    NOTE(review): the first half formats the whole ``settings.app`` object, not
    a name attribute -- confirm that is the intended user-agent text.
    """
    return "{}@{}".format(settings.app, settings.app.version)

# Initialize the geocoder instance.
def get_geocoder_instance():
    """Return the geocoder stored at ``settings.geocoder.instance``.

    When no usable instance is stored (the attribute is missing or falsy), a
    new ``Nominatim`` client is created, stored there and returned.
    """
    try:
        cached = settings.geocoder.instance
        if not cached:
            raise ValueError()
        return cached
    except:
        # Any failure above means "nothing cached yet": create and remember one.
        settings.geocoder.instance = Nominatim(user_agent=application_name())
        return settings.geocoder.instance
        
def get_reverse():
    """Return the geocoder's ``reverse`` lookup wrapped in a RateLimiter (1 second minimum delay)."""
    reverse = partial(get_geocoder_instance().reverse)
    return RateLimiter(reverse, min_delay_seconds=1)
        
def get_geocode():
    """Return the geocoder's ``geocode`` lookup (with ``addressdetails=True``)
    wrapped in a RateLimiter using the delay configured in the settings.
    """
    geocode = partial(get_geocoder_instance().geocode, addressdetails=True)
    return RateLimiter(geocode, min_delay_seconds=settings.geocoder.min_delay_seconds)

# Encode the query key.
def encode_query(query):
    """Encode ``query`` as a cache key.

    Uses the object's ``to_json()`` when it has one and it succeeds; otherwise
    falls back to ``str(query)``.
    """
    try:
        return query.to_json()
    except Exception:
        # Best-effort fallback for objects without (or with a failing) to_json.
        # ``Exception`` rather than a bare ``except`` so interrupts still propagate.
        return str(query)


In [19]:
# Get key.
def get_key(query):
    """Return the cache key for ``query``.

    A query carrying a truthy ``point`` is keyed by that point alone; any other
    query is keyed by the whole mapping wrapped in a Box.
    """
    if not query.get('point', None):
        return encode_query(Box(**query))
    return encode_query(query['point'])

# Request a specific location.
def request_location(query, geocode, otherKeys=None, tr=None, dryrun=None):
    """Resolve ``query`` to a location, memoizing the answer in ``settings.locations``.

    Parameters
    ----------
    query : mapping of search terms, or ``{'point': ...}``.
    geocode : callable invoked as ``geocode(query)`` when nothing is cached.
    otherKeys : set of cache keys from earlier attempts for the same record.
        The key of ``query`` is added to it in place, and a location found here
        is stored under every key in the set. A fresh set is used when omitted
        (a shared ``set()`` default would leak keys between unrelated calls).
    tr : progress handle; accepted for the caller but not written to.
    dryrun : when truthy the geocoder is never called; cache lookups still happen.

    Returns
    -------
    The cached or freshly geocoded location, or ``None`` when nothing was found.
    """
    keys = set() if otherKeys is None else otherKeys

    # The cache lives under the 'locations' settings key. It must be looked up
    # by name: the bare identifier used before referred to an unrelated
    # notebook variable (or raised NameError on a fresh kernel).
    if not settings.get('locations', None):
        settings.locations = Box({})

    # Inner logger.
    def log(msg, t=None):
        if msg and settings.verbose:
            print(msg)
        if msg and t:
            t.set_description(msg)
            t.update()

    def set_memo(memo_keys, memo):
        """Store ``memo`` under every key; caching is best-effort."""
        for memo_key in memo_keys:
            try:
                settings.locations[memo_key] = memo if memo else None
            except Exception as error:
                log(f"WARNING: Could not cache location for {memo_key}: {error}")
        return memo

    # If no query, return.
    if not query:
        log("No query provided. Cannot search for location.")
        return None

    # Register the key of this query next to the keys of earlier attempts.
    newkey = get_key(query)
    keys.add(newkey)  # Duplicates ignored due to set structure.

    # First come, first serve: any memoized key answers for all of them.
    for key in tuple(keys):
        loc = settings.locations.get(key, None)
        if loc:
            log(f"Cached location found for query... {key} -> {loc}")
            return set_memo(keys, loc)

    if dryrun:
        log(f"DRY RUN: No location found due to dryrun setting. {newkey}")
        return None

    # Nothing cached: ask the geocoder exactly once. The request depends only
    # on ``query``, so repeating it for every key could not change the answer.
    try:
        log(f"Calling geocoder service for new, unique query... {newkey}")
        loc = geocode(query)
    except (RuntimeError, TypeError, NameError) as e:
        log(f"ERROR: No location for query found. {e}")
        return None

    if loc:
        log(f"SUCCESS: Found location for query! {newkey} -> {loc}")
        return set_memo(keys, loc)

    log(f"ERROR: No location for query found. {newkey}")
    return None

# https://geopy.readthedocs.io/en/v1/#usage-with-pandas
def request_locations(queries, geocode, dryrun=None):
    """Resolve every query in ``queries``, widening the search until one attempt succeeds.

    For each query the full query is tried first, then ``city``, ``postalcode``,
    ``county`` and ``state`` on their own, and finally a reverse lookup of the
    default point from the settings. Returns the list of results, or ``None``
    when ``queries`` is empty.

    NOTE(review): the final attempt hands the whole ``{'point': ...}`` mapping
    to the reverse geocoder via request_location -- confirm it accepts that.
    """
    if not queries:
        return None

    fallback_fields = ('city', 'postalcode', 'county', 'state')
    results = []
    with trange(len(queries)) as tr:
        for i in tr:
            # Keys tried so far for this record; shared by all attempts below.
            keys = set()

            # Try full query.
            query = queries[i]
            attempt = request_location(query, geocode, otherKeys=keys, tr=tr, dryrun=dryrun)

            # Then one single-field query at a time, until something is found.
            for field in fallback_fields:
                if attempt:
                    break
                attempt = request_location({ field: query[field] }, geocode, otherKeys=keys, tr=tr, dryrun=dryrun)

            # Attempt 'default' (New York).
            if not attempt:
                point = settings.get('defaultLocation', Point(latitude=43.1561681, longitude=-75.8449946, altitude=0))
                settings['defaultLocation'] = point
                attempt = request_location({ 'point': point }, get_reverse(), otherKeys=keys, tr=tr, dryrun=False)

            results.append(attempt)
    return results
    
# Map full address into its own field.
"""
request_locations([{
    'street': "3147 Tiemann Avenue",
    "city": "Bronx",
    "state": "NY"
}]*10, get_geocode())
"""

# Get the addresses for the rows with missing coordinates.
def make_query(row):
    """Convert ``row`` into a plain dict keyed by the labels of its index."""
    query = {}
    for field in row.index:
        query[field] = row[field]
    return query

In [20]:

# Get address components and create the query.
# `mask` marks the rows where extract_location returned None, i.e. rows whose
# location text is empty and therefore carries no coordinates.
locations = location.apply(lambda row: extract_location(row))
mask = locations.isna()

# Address columns and query term columns.
# Positional labels on df: 2 street, 4 city, 6 zip code, 8 county, 5 state, 21 location.
a_columns = [2,4,6,8,5,21]
q_columns = [2,4,6,8,5]
colnames = {
    # These are IMPORTANT names for the geopy library. Do not change unless you know what you're doing.
    2: 'street',
    4: 'city',
    5: 'state',
    6: 'postalcode',
    8: 'county',
    21: 'location',
}

# Prepare the addresses table.
addresses = df.loc[:,a_columns]
query_df  = addresses.loc[:,q_columns]
query_df = query_df.rename(columns=colnames)

# Build a query dict for each masked row and store it in a new `query` column
# (no geocoder request is made in this cell).
addresses.loc[mask,'query'] = query_df[mask].apply(lambda row: make_query(row), axis=1)
addresses = addresses.rename(columns=colnames)
addresses

Unnamed: 0,street,city,postalcode,county,state,location,query
0,2197 NEW ENGLAND THRUWAY,BRONX,10475,BRONX,NY,,"{'street': '2197 NEW ENGLAND THRUWAY', 'city':..."
1,43-45 CARROLL STREET,BINGHAMTON,13901,BROOME,NY,"43-45 CARROLL STREET\n BINGHAMTON, NY 13901\n ...",
2,357 MAIN STREET,ARMONK,10504,WESTCHESTER,NY,"357 MAIN STREET\n ARMONK, NY 10504\n (41.12418...",
3,350 5TH AVENUE,NEW YORK,10118,NEW YORK,NY,"350 5TH AVENUE\n NEW YORK, NY 10118\n (40.7480...",
4,312 GREENWICH STREET,HEMPSTEAD,11550,NASSAU,NY,"312 GREENWICH STREET\n HEMPSTEAD, NY 11550\n (...",
...,...,...,...,...,...,...,...
528,9317 AVENUE L,BROOKLYN,11236,KINGS,NY,"9317 AVENUE L\n BROOKLYN, NY 11236\n (40.63681...",
529,2900 VETERANS MEMORIAL HIGHWAY,BOHEMIA,11716,SUFFOLK,NY,"2900 VETERANS MEMORIAL HIGHWAY\n BOHEMIA, NY 1...",
530,3071 COUNTY COMPLEX DRIVE,CANANDAIGUA,14424,ONTARIO,NY,"3071 COUNTY COMPLEX DRIVE\n CANANDAIGUA, NY 14...",
531,3599 STATE ROUTE 46,BOUCKVILLE,13310,MADISON,NY,"3599 STATE ROUTE 46\n BOUCKVILLE, NY 13310\n (...",


In [30]:
# Execute query on array of queries.
# Only the masked rows (those without coordinates) are sent to the geocoder.
requests = pd.Series(request_locations(addresses.loc[mask,'query'].tolist(), get_geocode(), dryrun=settings.get('dryrun', False)))
# Re-align the positional results with the original row labels.
requests.index = addresses[mask].index
# NOTE: not the last expression of the cell, so this Series is not displayed.
requests

# Set the results back on the locations.
# After this, `locations` mixes parsed tuples with geocoder result objects.
locations.loc[mask] = requests
locations.head()

  0%|          | 0/36 [00:00<?, ?it/s]

Cached location found for query... {"street": "2197 NEW ENGLAND THRUWAY", "city": "BRONX", "postalcode": 10475, "county": "BRONX", "state": "NY"} -> 2197, New England Thruway, The Bronx, Bronx County, New York, 10475, United States
Cached location found for query... {"street": "FROST MILL ROAD", "city": "MILL NECK", "postalcode": 11765, "county": "NASSAU", "state": "NY"} -> Frost Mill Road, Mill Neck, Oyster Bay, Nassau County, New York, 11765, United States
Cached location found for query... {"street": "71 MARGARET TERRANCE MMRL WAY", "city": "AKWESASNE", "postalcode": 13655, "county": "FRANKLIN", "state": "NY"} -> Franklin County, New York, 13655, United States
Cached location found for query... {"street": "PARKS & RECREATION TOWN HALL", "city": "NEW HYDE PARK", "postalcode": 11040, "county": "NASSAU", "state": "NY"} -> New Hyde Park, North Hempstead, Nassau County, New York, United States
Cached location found for query... {"street": "P.O. BOX 321", "city": "WHITE PLAINS", "postalco

0    (2197, New England Thruway, The Bronx, Bronx C...
1    (43-45 CARROLL STREET, BINGHAMTON, NY, 13901, ...
2    (357 MAIN STREET, ARMONK, NY, 10504, (41.12418...
3    (350 5TH AVENUE, NEW YORK, NY, 10118, (40.7480...
4    (312 GREENWICH STREET, HEMPSTEAD, NY, 11550, (...
Name: 21, dtype: object

In [22]:
def get_coords(loc):
    """Return ``(latitude, longitude)`` for a location.

    ``loc`` is either an object exposing ``latitude``/``longitude`` attributes
    or a sequence whose fifth element is a ``"(lat, lon)"`` string.
    """
    try:
        return loc.latitude, loc.longitude
    except AttributeError:
        # No coordinate attributes: parse the text component instead. Only the
        # missing-attribute case is handled so other errors are not hidden.
        p = loc[4]
        p = re.sub('[()]', '', p)
        lat, lon = p.split(',')
        return float(lat), float(lon)
    
def get_latitude(loc):
    """Return the latitude of ``loc``, or NaN when it is ``None``."""
    lat, _ = get_coords(loc)
    # Compare with None explicitly: a truthiness test also turned a valid 0.0 into NaN.
    return float("nan") if lat is None else lat

def get_longitude(loc):
    """Return the longitude of ``loc``, or NaN when it is ``None``."""
    _, lon = get_coords(loc)
    # Compare with None explicitly: a truthiness test also turned a valid 0.0 into NaN.
    return float("nan") if lon is None else lon

In [23]:
# Fill addresses.
# Each entry of `locations` is read through get_latitude / get_longitude,
# which both go through get_coords.
addresses.loc[:,'latitude'] = locations.apply(lambda row: get_latitude(row))
addresses.loc[:,'longitude'] = locations.apply(lambda row: get_longitude(row))
addresses


Unnamed: 0,street,city,postalcode,county,state,location,query,latitude,longitude
0,2197 NEW ENGLAND THRUWAY,BRONX,10475,BRONX,NY,,"{'street': '2197 NEW ENGLAND THRUWAY', 'city':...",40.882327,-73.829770
1,43-45 CARROLL STREET,BINGHAMTON,13901,BROOME,NY,"43-45 CARROLL STREET\n BINGHAMTON, NY 13901\n ...",,42.096971,-75.906545
2,357 MAIN STREET,ARMONK,10504,WESTCHESTER,NY,"357 MAIN STREET\n ARMONK, NY 10504\n (41.12418...",,41.124182,-73.714492
3,350 5TH AVENUE,NEW YORK,10118,NEW YORK,NY,"350 5TH AVENUE\n NEW YORK, NY 10118\n (40.7480...",,40.748082,-73.984819
4,312 GREENWICH STREET,HEMPSTEAD,11550,NASSAU,NY,"312 GREENWICH STREET\n HEMPSTEAD, NY 11550\n (...",,40.699263,-73.613421
...,...,...,...,...,...,...,...,...,...
528,9317 AVENUE L,BROOKLYN,11236,KINGS,NY,"9317 AVENUE L\n BROOKLYN, NY 11236\n (40.63681...",,40.636811,-73.898189
529,2900 VETERANS MEMORIAL HIGHWAY,BOHEMIA,11716,SUFFOLK,NY,"2900 VETERANS MEMORIAL HIGHWAY\n BOHEMIA, NY 1...",,40.788805,-73.145202
530,3071 COUNTY COMPLEX DRIVE,CANANDAIGUA,14424,ONTARIO,NY,"3071 COUNTY COMPLEX DRIVE\n CANANDAIGUA, NY 14...",,42.884515,-77.213507
531,3599 STATE ROUTE 46,BOUCKVILLE,13310,MADISON,NY,"3599 STATE ROUTE 46\n BOUCKVILLE, NY 13310\n (...",,42.899615,-75.580535


In [24]:
# Add the addresses to the main table.
# NOTE: not the last expression of the cell, so this preview is not displayed.
addresses.head()

def make_full_address(row):
    """Format an address record as a three-line string.

    ``row`` is indexed by the keys 'street', 'city', 'state' and
    'postalcode'. The returned text is laid out as::

        <street>
        <city>,
        <state> <postalcode>
    """
    street, city, state, postalcode = (
        row[key] for key in ('street', 'city', 'state', 'postalcode')
    )
    return f"{street}\n{city},\n{state} {postalcode}"
    
# Addresses.
# Target column on `df` -> values it is filled from. Insertion order of the
# dict is the order in which the columns are written.
address_fields = {
    'address.street.line1': addresses['street'],
    # Column at position 3 of `df` is copied as the second street line.
    'address.street.line2': df[df.columns[3]],
    'address.city': addresses['city'],
    'address.county': addresses['county'],
    'address.state': addresses['state'],
    'address.zipcode': addresses['postalcode'],
    'address.coordinates.latitude': addresses['latitude'],
    'address.coordinates.longitude': addresses['longitude'],
    # Every field is stringified first so the formatted address is pure text.
    'address.location': addresses.astype(str).apply(make_full_address, axis=1),
}
for column_name, column_values in address_fields.items():
    df[column_name] = column_values

# Preview the columns from position 27 onwards.
df.loc[:, df.columns[27:]].head()

Unnamed: 0,facility,address.street.line1,address.street.line2,address.city,address.county,address.state,address.zipcode,address.coordinates.latitude,address.coordinates.longitude,address.location
0,METRO NEW YORK DDSO - UNIQUE LIFE INC.,2197 NEW ENGLAND THRUWAY,,BRONX,BRONX,NY,10475,40.882327,-73.82977,"2197 NEW ENGLAND THRUWAY\nBRONX,\nNY 10475"
1,BROOME DDSO - COUNTY URBAN LEAGUE,43-45 CARROLL STREET,,BINGHAMTON,BROOME,NY,13901,42.096971,-75.906545,"43-45 CARROLL STREET\nBINGHAMTON,\nNY 13901"
2,HUDSON VALLEY DDSO - OPENGATE INC.,357 MAIN STREET,,ARMONK,WESTCHESTER,NY,10504,41.124182,-73.714492,"357 MAIN STREET\nARMONK,\nNY 10504"
3,METRO NEW YORK DDSO - COMMUNITY OPTIONS INC.,350 5TH AVENUE,SUITE 5230,NEW YORK,NEW YORK,NY,10118,40.748082,-73.984819,"350 5TH AVENUE\nNEW YORK,\nNY 10118"
4,LONG ISLAND DDSO - CENTER FOR RAPID RECOVERY,312 GREENWICH STREET,,HEMPSTEAD,NASSAU,NY,11550,40.699263,-73.613421,"312 GREENWICH STREET\nHEMPSTEAD,\nNY 11550"


## Ingestion

Ingest the data into the `providers` database, under the `services` collection.

First, we need to convert each row of our table into a properly structured dictionary object.


In [25]:
# Labels of the row fields that are copied into each ingestion record.
# Not populated yet (kept out of the list): 'category.disability',
# 'category.service', 'info.fax', 'info.addressee'.
record_cols = [
    # Identity.
    'facility',
    'keywords',

    # Contact info. The phone number is still stored under the integer
    # label 7; `boxed` renames it to 'info.phone'.
    7,
    'info.website.url',
    'info.website.subdomain',
    'info.website.hostname',
    'info.website.domain',

    # Address.
    'address.street.line1',
    'address.street.line2',
    'address.coordinates.longitude',
    'address.coordinates.latitude',
    'address.city',
    'address.county',
    'address.state',
    'address.zipcode',
]

# Convert to a dictionary object.
def boxed(row):
    """Return the record fields of ``row`` wrapped in a ``Box``.

    Only the labels listed in ``record_cols`` are kept, and the integer
    label ``7`` is renamed to ``'info.phone'``. The ``Box`` is created with
    ``box_dots=True``.
    """
    def relabel(label):
        # Every label passes through unchanged except the phone column.
        return 'info.phone' if label == 7 else label

    fields = row[record_cols].rename(relabel)
    return Box(fields.to_dict(), box_dots=True)

def to_record(row):
    """Convert one table row into a nested plain-``dict`` record.

    The flat, dot-named fields produced by ``boxed`` are regrouped under
    four top-level keys, in this order: 'facility', 'keywords', 'info'
    (phone and website parts) and 'address' (street lines, coordinates,
    city, county, state, zipcode).
    """
    fields = boxed(row)

    # Contact information.
    website = {
        'url': fields['info.website.url'],
        'subdomain': fields['info.website.subdomain'],
        'hostname': fields['info.website.hostname'],
        'domain': fields['info.website.domain'],
    }
    info = {
        'phone': fields['info.phone'],
        'website': website,
    }

    # Postal address and geographic coordinates.
    street = {
        'line1': fields['address.street.line1'],
        'line2': fields['address.street.line2'],
    }
    coordinates = {
        'latitude': fields['address.coordinates.latitude'],
        'longitude': fields['address.coordinates.longitude'],
    }
    address = {
        'street': street,
        'coordinates': coordinates,
        'city': fields['address.city'],
        'county': fields['address.county'],
        'state': fields['address.state'],
        'zipcode': fields['address.zipcode'],
    }

    # Not populated yet: 'category.disability', 'category.service'.
    record = Box(box_dots=True)
    record.facility = fields.facility
    record.keywords = fields.keywords
    record['info'] = info
    record['address'] = address

    return record.to_dict()

# Convert to format for ingestion.
# `to_record` is applied row-wise (axis=1), giving one nested dict per row.
df_out = df[df.columns[:]].apply(to_record, axis=1)
df_out

0      {'facility': 'METRO NEW YORK DDSO - UNIQUE LIF...
1      {'facility': 'BROOME DDSO - COUNTY URBAN LEAGU...
2      {'facility': 'HUDSON VALLEY DDSO - OPENGATE IN...
3      {'facility': 'METRO NEW YORK DDSO - COMMUNITY ...
4      {'facility': 'LONG ISLAND DDSO - CENTER FOR RA...
                             ...                        
528    {'facility': 'BROOKLYN DDSO - UNIQUE CARE COMM...
529    {'facility': 'LONG ISLAND DDSO - SUFFOLK CO. N...
530    {'facility': 'FINGER LAKES DDSO - ONTARIO CO. ...
531    {'facility': 'CENTRAL NEW YORK DDSO - HERITAGE...
532    {'facility': 'LONG ISLAND DDSO - CENTER FOR FA...
Length: 533, dtype: object

In [26]:
# Get the connector.
from connection.database import Database

# Build the connection options from the configured credentials first, then
# initialize the connection with them.
connection_options = Database.make_options(
    hostname=settings.db.auth.hostname,
    username=settings.db.auth.username,
    password=settings.db.auth.password,
    port=settings.db.auth.port,
)
Database.initialize(connection_options)

OK


MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin')

In [27]:
# Presumably selects the database that subsequent `Database` calls operate on.
# NOTE(review): the notebook text names the `providers` database, but this
# call passes 'providers_demo' — confirm which one is intended.
Database.use('providers_demo')

OK


Database(MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'providers_demo')

In [28]:
# Destination name: the first key of `db`.
destination = [*db.keys()][0]
# Plain list of the per-row records held in `df_out`.
data = df_out.tolist()
# Display the first record as a sanity check before inserting.
data[0]

{'facility': 'METRO NEW YORK DDSO - UNIQUE LIFE INC.',
 'keywords': 'COMMUNITY HABILITATION,DAY HABILITATION,FAMILY SUPPORT SERVICES,INDIVIDUAL SUPPORT SERVICES,SELF-DIRECTION SERVICES',
 'info': {'phone': '347-449-7890',
  'website': {'url': 'www.uniquelifeinc.org',
   'subdomain': 'www',
   'hostname': 'uniquelifeinc',
   'domain': 'org'}},
 'address': {'street': {'line1': '2197 NEW ENGLAND THRUWAY', 'line2': ''},
  'coordinates': {'latitude': 40.88232658333333, 'longitude': -73.829770125},
  'city': 'BRONX',
  'county': 'BRONX',
  'state': 'NY',
  'zipcode': 10475}}

In [29]:
# Best-effort ingestion: a failed insert is reported rather than aborting the
# notebook run.
try:
    Database.insert_many(destination, data)
except Exception as e:
    # `Exception` rather than `BaseException`, so KeyboardInterrupt and
    # SystemExit still propagate and the kernel can be interrupted.
    print(e)

Inserting data into services collection.


- Extract the coordinates from the location field (if present).
- Split coordinates into 'longitude' and 'latitude' features. 


- Determine disability category for row based on values.
- Determine service category based on rows.