# SETTINGS

In [1]:
import pandas as pd
from sodapy import Socrata
import re

# IMPORT DATA

### DSHS Data

In [2]:
# Original listings: CSV file downloaded directly from the ALTSA site.
df_listing = pd.read_csv('../A_source_data/Listings/NHF_Listing.csv', dtype='str')
# Normalise the headers: trim, lower-case, and turn each whitespace character into "_".
# The pattern is a raw string and regex=True is explicit because pandas >= 2.0 treats
# the pattern as a literal by default, which would leave the whitespace untouched.
df_listing.columns = (
    df_listing.columns.str.strip().str.lower().str.replace(r'\s', '_', regex=True)
)

# Metadata of results: data scraped from the ALTSA website each time a PDF was downloaded.
df_meta = pd.read_csv('../C_output_data/reports_metadata/reports_metadata_nhf.csv', dtype='str')
# Same header normalisation as for the listings above.
df_meta.columns = (
    df_meta.columns.str.strip().str.lower().str.replace(r'\s', '_', regex=True)
)

# Reports contents: data scraped from enforcement letters.
# report_date is parsed as a datetime on load; the headers are used as they appear in the file.
df_pdfs = pd.read_csv('../C_output_data/reports_contents/nhf_enforcement_letters.csv',
                     parse_dates=['report_date'])

In [3]:
# Tally the scraped reports by type; dropna=False also counts rows with no type.
df_meta.loc[:, 'rep_type'].value_counts(dropna=False)

Investigation                         1359
Enforcement Letter                     859
Inspection                             584
Informal Dispute Resolution Letter     406
Registered Nurse Exception Letter      132
Name: rep_type, dtype: int64

In [4]:
# Distinct PDF names of the inspection / investigation reports listed in the metadata.
ii_meta = df_meta.loc[
    df_meta['rep_type'].isin(['Investigation', 'Inspection']),
    'pdf_name',
].unique()

# Distinct PDF names of the enforcement letters listed in the metadata.
letters_meta = df_meta.loc[
    df_meta['rep_type'].isin(['Enforcement Letter']),
    'pdf_name',
].unique()

# Distinct PDF names found in the scraped letter contents.
letters_pdf = df_pdfs.loc[:, 'pdf_name'].unique()

In [5]:
# The metadata and the scraped contents must describe exactly the same letters:
# same number of distinct PDF names, and each collection contained in the other.
assert len(letters_pdf) == len(letters_meta)

assert set(letters_meta) <= set(letters_pdf)
assert set(letters_pdf) <= set(letters_meta)

print(len(letters_meta))

859


In [41]:
# Distinct fain_id values that own at least one inspection / investigation PDF.
providers_ii = df_meta.loc[df_meta['pdf_name'].isin(ii_meta), 'fain_id'].unique()

# Distinct fain_id values that own at least one enforcement-letter PDF.
providers_letters = df_meta.loc[df_meta['pdf_name'].isin(letters_meta), 'fain_id'].unique()

# fain_id values present in both groups.
providers_both = set(providers_ii) & set(providers_letters)

print('Number or providers with inspections/investigations  =', len(providers_ii))
print('Number or providers with enforcement letters =', len(providers_letters))

print('Number or providers with both type of reports  =', len(providers_both))

Number or providers with inspections/investigations  = 198
Number or providers with enforcement letters = 182
Number or providers with both type of reports  = 179


In [36]:
# fain_id values that have enforcement letters but no inspection / investigation.
letters_only_ids = set(providers_letters) - set(providers_ii)

# Every metadata row that belongs to one of those fain_id values.
df_orphans = df_meta.loc[df_meta['fain_id'].isin(letters_only_ids)]
df_orphans

Unnamed: 0,fain_id,nf_name,pdf_name,rep_type,rep_date,download_record
334,45544,Manor Care Health Services - Salmon Creek,"Manor Care - Salmon Creek (GG, CMP, CF) 12 17 ...",Enforcement Letter,Dec-19,Tue Mar 17 15:36:11 2020
1224,45975,Seattle Medical Post Acute Care,"Seattle Medical Post Acute (GG, CMP, CF) 2 10 ...",Enforcement Letter,Feb-20,Tue Mar 17 16:15:24 2020
1754,45542,Manor Care Health Services (Gig Harbor),"Manor Care Gig Harbor (FP, E prior E) 11 27 19...",Enforcement Letter,Dec-19,Tue Mar 17 16:40:16 2020


In [38]:
# Sum mulct_agg over the scraped letters whose PDF name appears in df_orphans.
is_orphan_letter = df_pdfs['pdf_name'].isin(df_orphans['pdf_name'])
df_pdfs.loc[is_orphan_letter, 'mulct_agg'].sum()

5000.0