# Settings

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

In [2]:
# Directory from which the WAC code lookup CSV is read further down.
output_path = '../output_data/wac_codes/'

# Original data

In [3]:
# Load the extracted report contents. df_orig stays untouched; all filtering
# below happens on the working copy df.
df_orig = pd.read_csv(
    '../output_data/reports_contents/nhf_inspections_contents_center.csv'
)
df = df_orig.copy()

# Number of distinct pdf names before any filtering.
total_reports = df_orig['pdf_name'].nunique()

In [4]:
# Report the size of the unfiltered data set.
n_records = len(df)
n_pdf_names = df['pdf_name'].nunique()
print('We start with', n_records, 'records. Inisde those records, there are', n_pdf_names, 'unique pdf names.')

We start with 29474 records. Inisde those records, there are 584 unique pdf names.


In [5]:
# Peek at the first rows of the raw data.
df.head(n=5)

Unnamed: 0,pdf_name,page,num_tables,num_rows,num_cells_row3,tag_list,deficiency_tag,deficiency_text,correction_tag,correction_text,completion_date
0,ARLINGTON HEALTH AND REHABILITATION YOGA12.pdf,1,0,,,,,,,,
1,ARLINGTON HEALTH AND REHABILITATION YOGA12.pdf,2,0,,,,,,,,
2,ARLINGTON HEALTH AND REHABILITATION YOGA12.pdf,3,1,4.0,5.0,['{L 000}'],{L 000}\nState Form 2,WAC - Initial Comments\nNote: According to RC...,{L 000},,
3,AVALON CARE CENTER - PULLMAN COMM11.pdf,1,1,4.0,5.0,"['F 000', 'NATURE']",LABORATORY,Y DIRECTOR'S OR PROVIDER/SUPPLIER REPRESENTATI...,NATURE,TITLE,(X6) DATE
4,AVALON HEALTH & REHABILITATION CENTER - PASCO ...,1,0,,,,,,,,


In [6]:
# Peek at the last rows of the raw data.
df.tail(n=5)

Unnamed: 0,pdf_name,page,num_tables,num_rows,num_cells_row3,tag_list,deficiency_tag,deficiency_text,correction_tag,correction_text,completion_date
29469,r willow springs care and rehab survey 4-5-201...,24,0,,,,,,,,
29470,r willow springs care and rehab survey 4-5-201...,25,0,,,,,,,,
29471,r willow springs care and rehab survey 4-5-201...,26,0,,,,,,,,
29472,r willow springs care and rehab survey 4-5-201...,27,0,,,,,,,,
29473,r willow springs care and rehab survey 4-5-201...,28,0,,,,,,,,


# Reducing

## Records with *num_tables* == 0

In [7]:
# Distribution of the number of tables per record (NaN counted too).
df.loc[:, 'num_tables'].value_counts(dropna=False)

0    15057
1    14417
Name: num_tables, dtype: int64

Let's confirm that records where 'num_tables' == 0 are nothing but noise.

In [8]:
# Select the records without any table and keep only the extracted-content columns.
is_tableless = df['num_tables'].eq(0)
df_noise = df.loc[is_tableless].drop(columns=['pdf_name', 'page', 'num_tables'])

# A column reported as True below is entirely NaN for these records.
print(df_noise.shape[0])
df_noise.isna().all()

15057


num_rows           True
num_cells_row3     True
tag_list           True
deficiency_tag     True
deficiency_text    True
correction_tag     True
correction_text    True
completion_date    True
dtype: bool

Indeed they are. Away with them.

In [9]:
# The helper frame used for the all-NaN check is no longer needed.
del df_noise

# Keep only the records with at least one table. The explicit .copy() makes df
# an independent frame, so the in-place column drops further down do not act on
# a slice of the previous frame (which raises pandas' SettingWithCopyWarning).
df = df[df['num_tables'] > 0].copy()

## Eliminating obsolete fields

### *num_tables* & *num_rows*

Let's see if *num_tables* and *num_rows* have any useful information left in them. Otherwise, we can drop them.

In [10]:
# Both columns must hold a single constant value for them to be droppable.
assert df['num_tables'].eq(1).all()
assert df['num_rows'].eq(4).all()

Indeed, they are no longer useful. Away!

In [11]:
# Both columns are constant at this point, so drop them. Rebinding df to the
# result (instead of inplace=True) avoids mutating a frame that was obtained
# from a boolean filter.
df = df.drop(columns=['num_tables', 'num_rows'])

Now let's see if we can also eliminate *num_cells_row3*:

### *num_cells_row3*

In [12]:
# Distribution of the number of cells in the third table row (NaN counted too).
df.loc[:, 'num_cells_row3'].value_counts(dropna=False)

5.0    14411
2.0        6
Name: num_cells_row3, dtype: int64

Let's see the six cases where *num_cells_row3* == 2:

In [13]:
# Show the records whose third table row has only two cells.
df.loc[df['num_cells_row3'].eq(2)]

Unnamed: 0,pdf_name,page,num_cells_row3,tag_list,deficiency_tag,deficiency_text,correction_tag,correction_text,completion_date
1903,R Aldercrest Health & Rehab Center complaint 0...,111,2.0,[' F 585'],F 585,Grievances\nCFR(s): 483.10(j)(1)-(4)\n§483.10(...,,,
1904,R Aldercrest Health & Rehab Center complaint 0...,112,2.0,[' F 585'],F 585,Continued From Page 1\nresidents' rights is co...,,,
2560,R Avalon Care Center - Pullman survey L2D611 7...,69,2.0,[' F 661'],F 661,Discharge Summary\nCFR(s): 483.21(c)(2)(i)-(iv...,,,
2561,R Avalon Care Center - Pullman survey L2D611 7...,70,2.0,[' F 661'],F 661,Continued From Page 1\nReference (WAC) 388-97/...,,,
2609,R Avalon Care Center- Othello LLC PD8D11 Inspe...,48,2.0,[' F 645'],F 645,PASARR Screening for MD & ID\nCFR(s): 483.20(k...,,,
2610,R Avalon Care Center- Othello LLC PD8D11 Inspe...,49,2.0,[' F 645'],F 645,Continued From Page 1\nhealth needs. Findings ...,,,


As expected, in those cases we simply did not have any data for *correction_tag*, *correction_text* and *completion_date*. The variable is not useful anymore.

In [14]:
# Drop the helper column. Rebinding df to the result (instead of inplace=True)
# avoids mutating a frame that was obtained from a boolean filter.
df = df.drop(columns=['num_cells_row3'])

### Our slimmed-down DF:

In [15]:
# Summarise what is left after filtering and which share of the initial
# pdf names it covers.
n_pdf_names = df['pdf_name'].nunique()
coverage_pct = round(n_pdf_names / total_reports * 100)

print(df.shape)
print('We now have', len(df), 'records and', n_pdf_names, 'unique pdf names.')
print('This means we managed to extract infromation from', coverage_pct, '% of the', total_reports, 'inspection reports for nursing home facilities.')

(14417, 8)
We now have 14417 records and 342 unique pdf names.
This means we managed to extract infromation from 59 % of the 584 inspection reports for nursing home facilities.


In [16]:
# First rows of the slimmed-down frame.
df.head(n=5)

Unnamed: 0,pdf_name,page,tag_list,deficiency_tag,deficiency_text,correction_tag,correction_text,completion_date
2,ARLINGTON HEALTH AND REHABILITATION YOGA12.pdf,3,['{L 000}'],{L 000}\nState Form 2,WAC - Initial Comments\nNote: According to RC...,{L 000},,
3,AVALON CARE CENTER - PULLMAN COMM11.pdf,1,"['F 000', 'NATURE']",LABORATORY,Y DIRECTOR'S OR PROVIDER/SUPPLIER REPRESENTATI...,NATURE,TITLE,(X6) DATE
5,AVALON HEALTH & REHABILITATION CENTER - PASCO ...,2,"['{F 000}', 'NATURE']",LABORATORY,Y DIRECTOR'S OR PROVIDER/SUPPLIER REPRESENTATI...,NATURE,TITLE,(X6) DATE
6,AVAMERE OLYMPIC REHABILITATION OF SEQUIM JX6T1...,1,"['F 000', 'NATURE']",LABORATORY,Y DIRECTOR'S OR PROVIDER/SUPPLIER REPRESENTATI...,NATURE,TITLE,(X6) DATE
8,AVAMERE TRANSITIONAL CARE OF PUGET SOUND M9QP1...,2,"['{E 000}', '{F 000}', 'NATURE']",LABORATORY,Y DIRECTOR'S OR PROVIDER/SUPPLIER REPRESENTATI...,NATURE,TITLE,(X6) DATE


In [17]:
# Last rows of the slimmed-down frame.
df.tail(n=5)

Unnamed: 0,pdf_name,page,tag_list,deficiency_tag,deficiency_text,correction_tag,correction_text,completion_date
28340,WESLEY HOMES HEALTH CENTER I5XT12.pdf,3,['{L 000}'],{L 000}\nState Form 2,WAC - Initial Comments\nNote: According to RC...,{L 000},,
28432,r avamere bellingham healthcare and rehab - su...,53,['L1080'],L1080,WAC 388-97-1080 Nursing Services\n(1) The nurs...,L1080,,4/21/17
28433,r avamere bellingham healthcare and rehab - su...,54,['L1080'],L1080,Continued From page 1\nnursing home's geograph...,L1080,,
28434,r avamere bellingham healthcare and rehab - su...,55,"['L1080', 'L1380']",L1380,WAC 388-97-1380 Tuberculosis - Testing \nRequi...,L1380,,4/21/17
28435,r avamere bellingham healthcare and rehab - su...,56,['L1380'],L1380,Continued From page 3\ntesting within three da...,L1380,An audit was completed to ensure all \nemploye...,


# WAC codes

The field *deficiency_text* contains, among a lot of other information, the codes from **WA State Legislature Title 388, Chapter 97 (Nursing Homes)** that were found to be deficient in the facility in question. We want to identify every mention of such a code in that field and extract it.

In [18]:
# Isolate the 'deficiency_text' column, indexed by 'pdf_name'. set_index builds
# a new object, so neither df nor the Series stored in it is modified (assigning
# to .index on df['deficiency_text'] would mutate a Series taken from df).
deficiency_text = df.set_index('pdf_name')['deficiency_text']

# Using regex, extract every WAC citation from the indexed series.
# - The pattern is a raw string, so '\s' and '\d' reach the regex engine without
#   Python treating them as (invalid) string escape sequences.
# - Only the numeric part is captured: '\s?' also accepts a citation written
#   without a space or with another whitespace character after 'WAC', and those
#   variants must not end up in the code itself.
# Note: extractall returns a dataframe indexed by ('pdf_name', 'match').
wac_numbers = deficiency_text.str.extractall(r'WAC\s?(\d+-\d+-\d+)')

df_wac = wac_numbers.reset_index().drop(columns=['match'])
df_wac.columns = ['pdf_name', 'wac']

# Rebuild every code in the single canonical form 'WAC <title>-<chapter>-<section>'.
df_wac['wac'] = 'WAC ' + df_wac['wac']

In [19]:
# First ten extracted codes.
df_wac.head(n=10)

Unnamed: 0,pdf_name,wac
0,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-0640
1,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1000
2,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1060
3,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1060
4,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1620
5,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1620
6,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-0640
7,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1680
8,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1080
9,R ALASKA GARDENS HEALTH AND REHABILITATION FKJ...,WAC 388-97-1060


In [20]:
# Last ten extracted codes.
df_wac.tail(n=10)

Unnamed: 0,pdf_name,wac
953,R brookfield health and rehab of cascadia or4u...,WAC 388-97-1480
954,R highland health and rehabilitation c21j11 su...,WAC 388-97-0080
955,R highland health and rehabilitation c21j11 su...,WAC 388-97-1060
956,R highland health and rehabilitation c21j11 su...,WAC 388-97-1260
957,R highland health and rehabilitation c21j11 su...,WAC 388-97-1020
958,R highland health and rehabilitation c21j11 su...,WAC 388-97-0260
959,R highland health and rehabilitation c21j11 su...,WAC 388-97-1480
960,r avamere bellingham healthcare and rehab - su...,WAC 388-97-1080
961,r avamere bellingham healthcare and rehab - su...,WAC 388-97-1380
962,r avamere bellingham healthcare and rehab - su...,WAC 388-97-1360


## Add WAC metadata

In another script, we scrape the [site from the WA State Legislature that contains the descriptions of the codes corresponding to Title 388, Chapter 97 (Nursing Homes)](https://apps.leg.wa.gov/wac/default.aspx?cite=388-97). We now add that information to our data frame.

In [21]:
# Read the WAC section descriptions and prefix every section number with 'WAC '
# so that 'ttl_chp_sec' has the same form as the codes in df_wac['wac'].
df_temp = (
    pd.read_csv(output_path + 'wac_codes_df_t338c97.csv')
    .assign(ttl_chp_sec=lambda codes: 'WAC ' + codes['ttl_chp_sec'])
)

In [22]:
# First rows of the WAC section lookup table.
df_temp.head(n=5)

Unnamed: 0,sub_chp_num,sub_chp_name,section,ttl_chp_sec,ttl_chp_sec_desc
0,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES",Definitions,WAC 388-97-0001,Definitions.
1,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES","Admission, Transfer and Discharge",WAC 388-97-0020,Nursing facility care.
2,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES","Admission, Transfer and Discharge",WAC 388-97-0040,Discrimination prohibited.
3,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES","Admission, Transfer and Discharge",WAC 388-97-0060,Nursing facility admission and payment require...
4,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES","Admission, Transfer and Discharge",WAC 388-97-0080,Discharge planning.


In [23]:
# Left-join the section metadata onto every extracted code; codes without a
# matching 'ttl_chp_sec' get NaN in the metadata columns.
wac_lookup = df_temp.set_index('ttl_chp_sec')
df_wac = df_wac.join(wac_lookup, on='wac', how='left')

In [24]:
df_wac.head()

Unnamed: 0,pdf_name,wac,sub_chp_num,sub_chp_name,section,ttl_chp_sec_desc
0,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-0640,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES",Resident Rights,Prevention of abuse.
1,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1000,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES",Resident Assessment and Plan of Care,Resident assessment.
2,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1060,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES",Quality of Care,Quality of care.
3,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1060,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES",Quality of Care,Quality of care.
4,R St. Francis of Bellingham Survey QMTV11 12-...,WAC 388-97-1620,SUBCHAPTER I,"RESIDENT RIGHTS, CARE AND RELATED SERVICES",Administration,General administration.


In [25]:
# Number of extracted codes per section (NaN = no section metadata matched),
# preceded by the total across all sections.
section_counts = df_wac['section'].value_counts(dropna=False)
print(section_counts.sum())
section_counts

963


Infection Control                                                                       190
Quality of Care                                                                         166
Administration                                                                          128
Nursing Services                                                                        122
Resident Rights                                                                          88
Resident Assessment and Plan of Care                                                     64
Quality of Life                                                                          54
Specialized Habilitative and Rehabilitative Services                                     35
Admission, Transfer and Discharge                                                        27
NaN                                                                                      22
Preadmission Screening and Resident Review (PASRR) in Medicaid Certified Facilit

## Add report metadata

In [26]:
# Discard the WAC lookup table and reuse the name for the report metadata.
del df_temp
df_temp = pd.read_csv(
    '../output_data/reports_metadata/reports_metadata_nhf.csv'
)

In [27]:
# Inspect the report metadata that was just loaded.
df_temp

Unnamed: 0,fain_id,nf_name,pdf_name,rep_type,rep_date,download_record
0,43484,"AVALON CARE CENTER - OTHELLO, LLC","Avalon Care Center - Othello (G, CMP, CF) 5 3 ...",Enforcement Letter,May-19,Tue Mar 17 15:24:14 2020
1,43484,"AVALON CARE CENTER - OTHELLO, LLC","Avalon Care Center - Othello (Hx E, prior D) 8...",Enforcement Letter,Aug-18,Tue Mar 17 15:24:15 2020
2,43484,"AVALON CARE CENTER - OTHELLO, LLC",R Avalon Care Center- Othello LLC PD8D11 Inspe...,Inspection,Jun-19,Tue Mar 17 15:24:22 2020
3,43484,"AVALON CARE CENTER - OTHELLO, LLC",R Avalon Care Center - Othello LLC complaint H...,Inspection,Sep-18,Tue Mar 17 15:24:25 2020
4,43484,"AVALON CARE CENTER - OTHELLO, LLC",R avalon care center - othello llc i1ye11 surv...,Inspection,Sep-17,Tue Mar 17 15:24:29 2020
...,...,...,...,...,...,...
3335,43608,YAKIMA VALLEY SCHOOL,R Yakima Valley School CI 11-16-2017 - tj.pdf,Investigation,Nov-17,Tue Mar 17 17:41:33 2020
3336,43608,YAKIMA VALLEY SCHOOL,YAKIMA VALLEY SCHOOL KHMJ12.pdf,Investigation,Jun-17,Tue Mar 17 17:41:35 2020
3337,43608,YAKIMA VALLEY SCHOOL,YAKIMA VALLEY SCHOOL OSMX12.pdf,Investigation,Jun-17,Tue Mar 17 17:41:36 2020
3338,43608,YAKIMA VALLEY SCHOOL,r yakima valley school complaint 5-12-2017.pdf,Investigation,May-17,Tue Mar 17 17:41:38 2020


In [28]:
# Left-join the report metadata onto every extracted code via the pdf name.
report_lookup = df_temp.set_index('pdf_name')
df_wac = df_wac.join(report_lookup, on='pdf_name', how='left')

In [29]:
# Section distribution after adding the report metadata, preceded by the total
# number of rows in df_wac.
section_counts = df_wac['section'].value_counts(dropna=False)
print(section_counts.sum())
section_counts

963


Infection Control                                                                       190
Quality of Care                                                                         166
Administration                                                                          128
Nursing Services                                                                        122
Resident Rights                                                                          88
Resident Assessment and Plan of Care                                                     64
Quality of Life                                                                          54
Specialized Habilitative and Rehabilitative Services                                     35
Admission, Transfer and Discharge                                                        27
NaN                                                                                      22
Preadmission Screening and Resident Review (PASRR) in Medicaid Certified Facilit

In [32]:
# Discard the report metadata and reuse the name for the subset of extracted
# codes that belong to the 'Infection Control' section.
del df_temp
is_infection_control = df_wac['section'].eq('Infection Control')
df_temp = df_wac.loc[is_infection_control]

In [51]:
# Infection-control code mentions per facility: total, number of distinct
# facilities, then the 30 facilities with the most mentions.
facility_counts = df_temp['nf_name'].value_counts(dropna=False)
print(facility_counts.sum())
print(df_temp['nf_name'].nunique())
facility_counts.head(30)

190
100


Tacoma Nursing and Rehabilitation Center              10
FRANKE TOBEY JONES                                     6
Alaska Gardens Health and Rehabilitation Center        4
Regency Olympia Rehabilitation And Nursing Center      4
AVALON CARE CENTER - OTHELLO, LLC                      4
WOODLAND CONVALESCENT CENTER                           4
Advanced Post Acute                                    4
LIFE CARE CENTER OF RICHLAND                           4
BEACON HILL REHABILITATION                             3
Vancouver Specialty and Rehabilitative Care            3
AVAMERE OLYMPIC REHABILITATION OF SEQUIM               3
PACIFIC CARE AND REHABILITATION                        3
Sequim Health & Rehabilitation Center                  3
COLVILLE TRIBAL CONVALESCENT CENTER                    3
Good Samaritan Society - Stafholt                      3
PUYALLUP NURSING AND REHABILITATION CENTER             3
Manor Care Health Services (Tacoma)                    3
PROVIDENCE MARIANWOOD          

In [50]:
# Write the per-facility counts of infection-control code mentions to disk.
facility_counts_frame = df_temp['nf_name'].value_counts(dropna=False).to_frame()
facility_counts_frame.to_csv('../output_data/infection_control_violations_by_facility.csv')