# A. Settings

In [1]:
import pandas as pd
import requests
import bs4
import re
import time
from os import listdir
from PyPDF2 import PdfFileReader

In [2]:
# Base folder used by the rest of the notebook: the listing CSV is read from it,
# the PDFs are written to its 'PDFs/' subfolder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = '/Users/mvilla/Documents/repos/covid19_rcf_20200313_big_data/NH/'

# B. Metadata file

The Department of Social and Health Services' [Nursing Home Facilities Locator](https://fortress.wa.gov/dshs/adsaapps/lookup/NHPubLookup.aspx) provides a CSV file that contains the metadata of all the available PDF reports for **nursing home facilities**, including the URLs where they are stored, and therefore where we can download them from.

We have already downloaded this file. It is saved as *source_data/NHF_Listings.csv*

In [3]:
# Pull the original data
df_orig = pd.read_csv(path + 'NHF_Listing_BETA.csv')

# Create a workable copy of the DF so that df_orig keeps the data exactly as loaded
df = df_orig.copy()
# Normalise the headers: trim, lowercase, and turn every whitespace character into '_'.
# The pattern is a raw string and regex=True is passed explicitly: pandas >= 2.0
# defaults to regex=False, which would look for the literal text '\s' and leave
# the spaces in the column names untouched.
df.columns = df.columns.str.strip().str.lower().str.replace(r'\s', '_', regex=True)

## Getting to know the metadata file

In [4]:
# Preview the first rows of the metadata frame (column names already normalised)
df.head()

Unnamed: 0,nf_location_num,nf_loc_region_cde,nf_loc_street_address,nf_loc_city,nf_loc_zip_cde,nf_loc_phone_num,nf_loc_fax_num,nf_mailing_address,nf_mailing_city,nf_mailing_state,...,nf_ssps,rcsunit,total_beds_nf_bed_count,xviii_beds_nf_bed_count,xix_beds_nf_bed_count,t1819_beds_nf_bed_count,fain_id,nf_bed_type_desc,has_reports?,reports_location
0,40990,3,2811 NE 139TH ST,VANCOUVER,986862724,3605745247,3605749126,2811 NE 139TH ST,VANCOUVER,WA,...,,C,120,,,120,45544,Title 18/19,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/N...
1,35900,2,555 16th Ave,Seattle,98122,2063248200,2067098457,555 16th Ave,Seattle,WA,...,,H,103,0.0,0.0,103,45975,Title 18/19,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/N...
2,40040,3,3309 45th Street Ct,Gig Harbor,98335,2538588688,2538588683,3309 45th Street Ct,Gig Harbor,WA,...,,B,120,,,120,45542,Title 18/19,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/N...


### What field do we use to ID facilities?

In [5]:
# Row count of the metadata frame; kept for the consistency test later on.
num_rows_orig = len(df)

print('Number of rows =', num_rows_orig)
# A column whose number of distinct values equals the row count can serve as a facility ID.
for candidate in ('nf_location_num', 'fain_id', 'nf_name'):
    print(f'Number of unique {candidate} =', df[candidate].nunique())

Number of rows = 3
Number of unique nf_location_num = 3
Number of unique fain_id = 3
Number of unique nf_name = 3


It seems that several variables uniquely identify each facility. We will use **fain_id** as the unique ID for each facility.

### Subsetting: Only facilities that have reports

How many facilities have reports and how many don't?

In [6]:
# Number of facilities per 'has_reports?' value; dropna=False keeps missing values as their own row
df['has_reports?'].value_counts(dropna=False)

Yes    3
Name: has_reports?, dtype: int64

In [7]:
# Same breakdown expressed as a share of all facilities (normalize=True)
df['has_reports?'].value_counts(dropna=False, normalize=True)

Yes    1.0
Name: has_reports?, dtype: float64

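So there are **208** unique nursing home facilities, out of which **201** (**97%**) have reports. (Note: the cell outputs shown above list only 3 facilities, all with reports, so they do not reproduce these figures.)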

Let's break down the dataframe, based on whether a facility has reports or not.

In [8]:
# Split the metadata on the 'has_reports?' flag
has_reports_flag = df['has_reports?']

# Facilities flagged 'No' go to their own frame, re-indexed from 0
df_no_reports = df.loc[has_reports_flag == 'No'].reset_index(drop=True)

# Facilities flagged 'Yes' replace df, also re-indexed from 0
df = df.loc[has_reports_flag == 'Yes'].reset_index(drop=True)

### Consistency test

In [9]:
# Confirm that the facilities marked as having no reports show no query link
# NOTE(review): this check is disabled. As written it compares the single value
# returned by .all() with '', rather than testing each link, so it needs to be
# rewritten before it is switched back on.
# assert df_no_reports['reports_location'].str.strip().all() == ''

# Confirm no rows were lost during the dataframe split
# (the two frames only keep rows whose 'has_reports?' is exactly 'Yes' or 'No',
# so any other value, e.g. a missing one, makes this assertion fail)
assert len(df) + len(df_no_reports) == num_rows_orig

In [10]:
# Display the facilities that have reports.
# NOTE(review): this renders the whole frame — consider df.head() on a large listing.
df

Unnamed: 0,nf_location_num,nf_loc_region_cde,nf_loc_street_address,nf_loc_city,nf_loc_zip_cde,nf_loc_phone_num,nf_loc_fax_num,nf_mailing_address,nf_mailing_city,nf_mailing_state,...,nf_ssps,rcsunit,total_beds_nf_bed_count,xviii_beds_nf_bed_count,xix_beds_nf_bed_count,t1819_beds_nf_bed_count,fain_id,nf_bed_type_desc,has_reports?,reports_location
0,40990,3,2811 NE 139TH ST,VANCOUVER,986862724,3605745247,3605749126,2811 NE 139TH ST,VANCOUVER,WA,...,,C,120,,,120,45544,Title 18/19,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/N...
1,35900,2,555 16th Ave,Seattle,98122,2063248200,2067098457,555 16th Ave,Seattle,WA,...,,H,103,0.0,0.0,103,45975,Title 18/19,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/N...
2,40040,3,3309 45th Street Ct,Gig Harbor,98335,2538588688,2538588683,3309 45th Street Ct,Gig Harbor,WA,...,,B,120,,,120,45542,Title 18/19,Yes,https://fortress.wa.gov/dshs/adsaapps/lookup/N...


# C. Downloading nursing home reports

In [11]:
# As we download each report, we keep some of its metadata. The records are
# collected in a plain list and turned into the 'df_reports' DataFrame once, after
# the loop (DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call).
report_columns = ['fain_id', 'nf_name', 'pdf_name', 'rep_type', 'rep_date', 'download_record']
report_records = []

pdf_downld_path = path + 'PDFs/'

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the loop forever.
REQUEST_TIMEOUT = 60

# The link text is parsed with two patterns:
#   rep_date -> first 'two digits / four digits' token found in the text
#   rep_type -> everything that follows '<two digits>/<four digits> - ' up to the end
rep_type_pattern = re.compile(r'\d{2}\/\d{4}\s-\s(.*)$')
rep_date_pattern = re.compile(r'(\d{2}\/\d{4})')

# Each iteration of this first loop corresponds to a single facility
for index, row in df.iterrows():

    # Request the page that contains the links to all the reports of a single facility.
    page = requests.get(row['reports_location'], timeout=REQUEST_TIMEOUT)
    # Stop on HTTP errors instead of parsing an error page.
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, 'html.parser')

    # The list of links to each of the reports is contained in a 'div' with id='content_results'
    table = soup.find('div', {"id": "content_results"})
    if table is None:
        raise RuntimeError("No 'content_results' section found at " + str(row['reports_location']))

    # Each iteration of this second loop corresponds to a single PDF report (from a single facility)
    for li in table.find_all('li'):

        # List items without a link cannot point to a report, so they are skipped.
        anchor = li.find('a')
        if anchor is None or not anchor.get('href'):
            continue

        pdf_url = anchor.get('href')
        link_text = anchor.contents[0]

        # Obtain some metadata from the link before deciding whether to download
        pdf_name = pdf_url.split("/")[-1]
        type_match = rep_type_pattern.search(link_text)
        date_match = rep_date_pattern.search(link_text)
        if type_match is None or date_match is None:
            raise ValueError('Unexpected report link text: ' + repr(link_text))
        rep_type = type_match.groups()[0]
        rep_date = date_match.groups()[0]

        # We will not download fire inspection reports this time, so no request
        # is sent for them at all.
        if 'fire' in rep_type.lower():
            continue

        # Send the request for the PDF report; an HTTP error stops the run rather
        # than letting an error page be saved under a .pdf name.
        pdf = requests.get('https://fortress.wa.gov' + pdf_url, timeout=REQUEST_TIMEOUT)
        pdf.raise_for_status()

        # Save down the PDF; the 'with' block closes the file handle right away.
        with open(pdf_downld_path + pdf_name, 'wb') as pdf_file:
            pdf_file.write(pdf.content)

        # Save the report's metadata
        report_records.append({'fain_id': row['fain_id'],
                               'nf_name': row['nf_name'],
                               'pdf_name': pdf_name,
                               'rep_type': rep_type,
                               'rep_date': rep_date,
                               'download_record': time.ctime()})

        # Pause between downloads so we do not hammer the server.
        time.sleep(1)

# Passing the column list keeps the expected columns even when nothing was downloaded.
df_reports = pd.DataFrame(report_records, columns=report_columns)

In [12]:
# Write the per-report metadata into the project folder. The location is built
# from `path` instead of repeating the absolute folder, so a change to `path`
# moves this output together with the inputs.
df_reports.to_csv(path + 'metadata_BETA.csv', index=False)