# 1. Settings

In [1]:
import pandas as pd
import requests
import bs4
import re
import time
from os import listdir
from PyPDF2 import PdfFileReader

# 2. Metadata file

The Department of Social and Health Services' [assisted living facilities locator](https://fortress.wa.gov/dshs/adsaapps/lookup/bhpublookup.aspx) provides a CSV file that contains the metadata of all the available PDF reports for **assisted living facilities**, including the URLs where they are stored and, therefore, where we can download them from.

We have already downloaded this file. It is saved as *source_data/Listings/ALF_Listings.csv*.

In [2]:
# Load the facility listings as downloaded; `df_orig` is never modified below,
# so the raw data stays available for comparison.
df_orig = pd.read_csv('../source_data/Listings/ALF_Listings.csv')

# Create a workable copy of the DF
df = df_orig.copy()


def _standardize_col_name(col_name):
    """Return `col_name` converted to lower snake_case.

    An underscore is inserted between a word character and the capital letter
    that follows it (e.g. 'FacilityPOC' -> 'Facility_PO_C'), the result is
    lower-cased, and every whitespace character or double underscore is
    replaced by a single underscore.
    """
    col_name = re.sub(r'(\w)([A-Z])', r'\1_\2', col_name).lower()
    # Raw string: '\s' inside a plain string literal is an invalid escape sequence.
    return re.sub(r'\s|__', '_', col_name)


# Standardize the column names
col_names = [_standardize_col_name(col_name) for col_name in df.columns]
df.columns = col_names

## Getting to know the metadata file

In [3]:
# Preview the first five rows to check the standardized column names
df.head(n=5)

Unnamed: 0,speciality,contract,check_type,license_number,location_number,fac_instance_id,facility_type,facility_name,facility_status,location_address,...,speciality_code,contract_code,facility_po_c,licensed_bed_count,license_expiration_date,check_complete_date,check_type.1,has_reports?,reports_location,unnamed:_31
0,,Assisted Living SOW,,2334,60008896,43697,BH,COVENTRY HOUSE,OP,430 N 2nd Ave,...,,1002XP-12,"Montemayor, Sarah H.",55,3/31/2021 12:00:00 AM,,,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/B...,
1,,Assisted Living SOW,,1713,60001105,30864,BH,ROSE GARDEN ESTATES,OP,406 S Cascade St,...,,1002XP-12,"Semingson, Alice",26,4/30/2020 12:00:00 AM,,,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/B...,
2,,Enhanced Adult Res Care and Ex Comm Svcs,,2039,60009516,39377,BH,AVALON PROGRESSIVE CARE LLC,OP,1937 2ND AVE,...,,1036XP-12,"Galles, Carol A.",12,7/31/2020 12:00:00 AM,,,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/B...,
3,,"Enhanced Adult Residential Care, Assisted Livi...",,2022,60000693,40054,BH,EVERGREEN ESTATES RETIREMENT & ASSISTED LIVING...,OP,1215 Evergreen Ct,...,,"1006XP-12 , 1002XP-12 , 1007XP-12","Camerlo, Peter A.",46,9/30/2020 12:00:00 AM,,,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/B...,
4,,Assisted Living Facility (ALF),,2154,60008773,41832,BH,Amber Hills,OP,125 N Wamba Rd,...,,1002XP-12,"McDaniel, Thomas",44,11/30/2020 12:00:00 AM,,,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/B...,


### What field do we use to ID facilities?

In [4]:
# Baseline row count of the listings dataframe
num_rows_orig = len(df)

print('Number of rows =', num_rows_orig)

# A column can identify facilities only if it has as many distinct values as there are rows
for candidate_id in ('facility_name', 'fac_instance_id', 'license_number', 'location_number'):
    print(f'Number of unique {candidate_id} =', df[candidate_id].nunique())

Number of rows = 536
Number of unique facility_name = 531
Number of unique fac_instance_id = 536
Number of unique license_number = 536
Number of unique location_number = 536


It seems that several variables uniquely identify each facility. We will use **fac_instance_id** as the unique ID for each facility.

### Subsetting: Only facilities that have reports

See how many facilities have reports and how many don't

In [5]:
# Absolute number of facilities per 'has_reports?' value (missing values included)
df['has_reports?'].value_counts(normalize=False, dropna=False)

Yes    507
No      29
Name: has_reports?, dtype: int64

In [6]:
# Same breakdown, expressed as a share of all facilities
df['has_reports?'].value_counts(normalize=True, dropna=False)

Yes    0.945896
No     0.054104
Name: has_reports?, dtype: float64

So there are **536** unique assisted living facilities, and **507** (**95%**) of them have reports.

Let's break down the dataframe, based on whether a facility has reports or not.

In [7]:
# NOTE: this cell reassigns `df`, so it is not idempotent. Running it a second
# time without re-running the cells above leaves `df_no_reports` empty.

# Make a DF for the facilities with no reports (index restarted at 0)
df_no_reports = df[df['has_reports?'] == 'No'].reset_index(drop=True)

# Keep in `df` only the facilities that do have reports (index restarted at 0)
df = df[df['has_reports?'] == 'Yes'].reset_index(drop=True)

### Consistency test

In [8]:
# Confirm that the facilities marked as having no reports show no query link.
# The comparison with '' is done row by row and only then reduced with .all();
# reducing first and comparing the single result to '' does not test each row.
# Missing values are treated as "no link".
no_report_links = df_no_reports['reports_location'].fillna('').astype(str).str.strip()
assert (no_report_links == '').all(), \
    'Some facilities marked as having no reports still have a reports_location'

# Confirm no rows were lost during the dataframe split
assert len(df) + len(df_no_reports) == num_rows_orig, \
    'Rows were lost while splitting the dataframe'

# 3. Downloading the reports

In [None]:
# Download every non-fire report of every facility in `df` and keep one metadata
# record per downloaded PDF. The records are collected in a plain list and turned
# into the `df_reports` dataframe once, at the end (or when the loop is interrupted).
report_columns = ['facility_name', 'fac_instance_id', 'location_number', 'license_number',
                  'contract', 'pdf_name', 'rep_type', 'rep_date', 'download_record']

pdf_downld_path = '/Volumes/files/COVID19/DSHS_Facility_Reports_Manuel/reports_pdfs/AL_2431/'

base_url = 'https://fortress.wa.gov'
request_timeout = 60   # seconds to wait for the server before giving up on a request
download_pause = 3     # seconds to wait after each PDF request

report_records = []

try:
    # Each iteration of this first loop corresponds to a single facility
    for index, row in df.iterrows():

        # Request the page that contains all the links to the reports of a single facility.
        try:
            page = requests.get(row['reports_location'], timeout=request_timeout)
            page.raise_for_status()
        except requests.RequestException as err:
            print('Skipping facility', row['fac_instance_id'], '- could not load its report list:', err)
            continue

        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        # The list of links to each of the reports is contained in a 'div' with id='content_results'
        table = soup.find('div', {"id": "content_results"})
        if table is None:
            print('Skipping facility', row['fac_instance_id'], "- no 'content_results' section found")
            continue

        # Each iteration of this second loop corresponds to a single PDF report (from a single facility)
        for li in table.find_all('li'):

            link = li.find('a')
            if link is None or not link.get('href') or not link.contents:
                print('Skipping list item without a usable link for facility', row['fac_instance_id'])
                continue

            pdf_url = link.get('href')
            link_text = str(link.contents[0])

            # The link text is expected to contain 'MM/YYYY - <report type>'
            type_match = re.search(r'\d{2}\/\d{4}\s-\s(.*)$', link_text)
            date_match = re.search(r'(\d{2}\/\d{4})', link_text)
            if type_match is None or date_match is None:
                print('Skipping link with unexpected text:', repr(link_text))
                continue
            rep_type = type_match.group(1)
            rep_date = date_match.group(1)

            # We will not download fire inspection reports this time, so they are
            # skipped before any request for the PDF is sent.
            if 'fire' in rep_type.lower():
                continue

            pdf_name = pdf_url.split("/")[-1]

            # Send the request for the PDF report
            try:
                pdf = requests.get(base_url + pdf_url, timeout=request_timeout)
                pdf.raise_for_status()
            except requests.RequestException as err:
                print('Could not download', pdf_name, '-', err)
                time.sleep(download_pause)
                continue

            # Save down the PDF and name it using 'pdf_name'; the 'with' block
            # guarantees the file is closed once written.
            with open(pdf_downld_path + pdf_name, 'wb') as pdf_file:
                pdf_file.write(pdf.content)

            # Save the report's metadata
            report_records.append({'facility_name': row['facility_name'],
                                   'fac_instance_id': row['fac_instance_id'],
                                   'location_number': row['location_number'],
                                   'license_number': row['license_number'],
                                   'contract': row['contract'],
                                   'pdf_name': pdf_name,
                                   'rep_type': rep_type,
                                   'rep_date': rep_date,
                                   'download_record': time.ctime()})

            # Pause between PDF requests so the server is not flooded
            time.sleep(download_pause)
finally:
    # Built even if the loop is interrupted, so the metadata of the PDFs that
    # were already saved is not lost.
    df_reports = pd.DataFrame(report_records, columns=report_columns)

In [None]:
# Persist the per-report metadata table, without the dataframe index
reports_metadata_csv = '../output_data/reports_metadata/reports_metadata_alf.csv'
df_reports.to_csv(reports_metadata_csv, index=False)