# What this script does

We create a dataset of all deficiencies committed by WA-based nursing homes from 2017.

# I. SETTINGS

In [None]:
import pandas as pd
from os import listdir
import re

# II. IMPORT DATA

## Deficiencies (CMS)

In [None]:
# Folder holding the downloaded CMS "Full Statement of Deficiencies" Excel workbooks
source_path = '../../covid19_nursing_homes_big_data/Full-Statement-of-Deficiencies-June-2020/'
file_list = listdir(source_path)

# Weed out any files in the folder that are not Excel workbooks (.xlsx).
# The pattern is a raw string so that `\.` is a regex escape, not an invalid
# Python string escape.
file_list = [file for file in file_list if re.search(r'\.xlsx$', file)]
file_list = pd.Series(file_list)

# Read every workbook first and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration and
# starts from an empty frame, which newer pandas versions warn about.
frames = []
for file in file_list:
    print(file)
    frames.append(pd.read_excel(source_path + file,
                                header=0,
                                # Only the first 13 columns of each sheet are kept
                                usecols=range(0, 13),
                                # Read identifier-like fields as objects instead of
                                # letting pandas infer a numeric dtype for them
                                dtype={'facility_id': object, 'zip': object, 'deficiency_tag': object}))

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder holds no workbooks (same result the original loop produced).
df_sod_orig = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
del frames

# Save down a CSV version
df_sod_orig.to_csv('../../covid19_nursing_homes_big_data/cms_sod_txt.csv', index=False)

## F-Tags (CMS)

The SOD dataframe above contains the tag code for each deficiency recorded, but it doesn't contain the general group that each of those tags belongs to. That information is contained in [this list of the revised F-tags](https://www.cms.gov/Medicare/Provider-Enrollment-and-Certification/GuidanceforLawsAndRegulations/Downloads/List-of-Revised-FTags.pdf).

A version is in this [F-Tag crosswalk Excel file](https://www.cms.gov/Medicare/Provider-Enrollment-and-Certification/GuidanceforLawsAndRegulations/Downloads/F-Tag-Crosswalk.xlsx). This is the data we are importing now and that we will add to the SOD dataset later in the script:

In [None]:
# Load the F-tag crosswalk: columns A through H of the "Sortable by Tags" sheet
df_tags_orig = pd.read_excel(
    io='../A_source_data/CMS/LTC FTags_Phase 2_Crosswalk.xlsx',
    sheet_name='Sortable by Tags',
    usecols='A:H',
)

# Replace the spreadsheet headers with short names, one per imported column
df_tags_orig.columns = [
    'tag',
    'sqc_tag?',
    'tag_title',
    'cfr',
    'tag_group',
    'phase3',
    'tag_old',
    'moved_text',
]

So now we have a data frame that contains all the deficiencies found in all surveys carried out, and another dataframe that contains detailed information about the tags used to classify those deficiencies. We need to join both dataframes.

In [None]:
# Inspect the column names of the SOD dataframe (bare expression, nothing is
# assigned; as the last line of a notebook cell its value is shown as output)
df_sod_orig.columns

## Severity code descriptions

The SOD dataframe also contains codes for the severity of each deficiency, but not a description of the severity level of each of those codes. Those descriptions can be found in the document [Design for Nursing Home Compare
Five-Star Quality Rating System:
Technical Users’ Guide](https://www.cms.gov/Medicare/Provider-Enrollment-and-Certification/CertificationandComplianc/downloads/usersguide.pdf). The following mapping is based on that document:

In [None]:
# Lookup table: one row per scope/severity letter (A through L) with its
# description, written as "<harm level> - <Isolated | Pattern | Widespread>"
severity = pd.DataFrame(
    [
        ('A', 'No actual harm with potential for minimal harm - Isolated'),
        ('B', 'No actual harm with potential for minimal harm - Pattern'),
        ('C', 'No actual harm with potential for minimal harm - Widespread'),
        ('D', 'No actual harm with potential for more than minimal harm that is not immediate jeopardy - Isolated'),
        ('E', 'No actual harm with potential for more than minimal harm that is not immediate jeopardy - Pattern'),
        ('F', 'No actual harm with potential for more than minimal harm that is not immediate jeopardy - Widespread'),
        ('G', 'Actual harm that is not immediate jeopardy - Isolated'),
        ('H', 'Actual harm that is not immediate jeopardy - Pattern'),
        ('I', 'Actual harm that is not immediate jeopardy - Widespread'),
        ('J', 'Immediate jeopardy to resident health or safety - Isolated'),
        ('K', 'Immediate jeopardy to resident health or safety - Pattern'),
        ('L', 'Immediate jeopardy to resident health or safety - Widespread'),
    ],
    columns=['scope_severity', 'severity_desc'],
)
severity

# Consistency test: every code present in the SOD data must have a row in the
# lookup table built above
assert set(df_sod_orig['scope_severity']) <= set(severity['scope_severity'])

# III. REDUCING: WA STATE

In [None]:
# Size of the full dataset before any filtering
print(df_sod_orig.shape)

# Keep only the rows whose state is WA (boolean indexing returns a new frame,
# so df_sod_orig itself is left untouched)
df_sod_wa = df_sod_orig[df_sod_orig['state'] == 'WA']
print(df_sod_wa.shape)

df_sod_wa = (
    df_sod_wa
    # Attach the description that matches each scope/severity code
    .join(severity.set_index('scope_severity'), on='scope_severity', how='left')
    # Parse the inspection date into a proper datetime column
    .assign(inspection_dt=lambda frame: pd.to_datetime(frame['inspection_date']))
    # Discard the fields that are no longer needed, including the raw date
    .drop(columns=['address', 'city', 'state', 'zip', 'inspection_date'])
    # Remove exact duplicate rows and renumber the index from zero
    .drop_duplicates()
    .reset_index(drop=True)
    # Shorter column names that are easier to work with
    .rename(columns={'deficiency_tag': 'tag',
                     'scope_severity': 'severity_code'})
)

print(df_sod_wa.shape)

In [None]:
# Display the WA-only deficiencies dataframe (bare expression, nothing is assigned)
df_sod_wa

# IV. KEYWORD SEARCH

In [None]:
from pathlib import Path

# Export the WA deficiencies (full inspection text included) to the current
# user's Downloads folder. Path.home() replaces a hard-coded /Users/<name>
# prefix so the cell also runs under other accounts.
df_sod_wa.to_csv(Path.home() / 'Downloads' / 'full_text_sod_wa.csv', index=False)

In [None]:
# Phrases to look for in the inspection narratives (compared in lowercase)
kwds = [
    "resident council",
    "call light",
    "short staff",
    "resident and family interviews",
    "staff interviews",
    "resident interviews",
]

In [None]:
# For each keyword, list the 20 facilities with the most deficiency rows whose
# inspection text mentions it.
for k in kwds:
    # Case-insensitive literal substring match. na=False treats rows with no
    # inspection text as non-matches; without it the mask contains NaN and
    # boolean indexing raises. regex=False keeps the phrase literal.
    mask = df_sod_wa['inspection_text'].str.lower().str.contains(k, regex=False, na=False)
    df_temp = df_sod_wa[mask]
    print(k)
    print(df_temp['facility_name'].value_counts(dropna=False).head(20))
    print('\r')

In [None]:
# Same phrases combined into one regex alternation ("a|b|c")
kwds = '|'.join([
    'resident council',
    'call light',
    'short staff',
    'resident and family interviews',
    'staff interviews',
    'resident interviews',
])

In [None]:
# Keep the deficiencies whose inspection text matches the `kwds` regex pattern
# (case-insensitive through lowercasing). na=False treats rows with missing
# text as non-matches instead of putting NaN in the boolean mask.
kwd_mask = df_sod_wa['inspection_text'].str.lower().str.contains(kwds, regex=True, na=False)
df_sod_wa_kwd = df_sod_wa[kwd_mask]
df_sod_wa_kwd = df_sod_wa_kwd.sort_values(['facility_name', 'inspection_dt'])

# Select and order the output columns. Each column is listed once: repeating
# 'tag' and 'severity_code' produced duplicate columns in the frame and the
# exported CSV. The explicit selection also makes a separate drop() of the
# unwanted columns unnecessary.
df_sod_wa_kwd = df_sod_wa_kwd[['facility_name', 'facility_id', 'tag', 'severity_code',
                               'inspection_text', 'inspection_dt']]
df_sod_wa_kwd

In [None]:
from pathlib import Path

# Export the keyword-matched subset to the current user's Downloads folder.
# Path.home() replaces a hard-coded /Users/<name> prefix so the cell also runs
# under other accounts.
df_sod_wa_kwd.to_csv(Path.home() / 'Downloads' / 'full_text_sod_wa_kwds.csv', index=False)