# Settings

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

# Original data

In [2]:
# Read the raw extraction output once; `df_orig` is kept untouched and the
# reduction steps operate on the `df` copy.
raw_contents_path = '../output_data/reports_contents/nhf_inspections_contents_perifery.csv'
df_orig = pd.read_csv(raw_contents_path)
df = df_orig.copy()

In [3]:
# Report the starting size of the data set (message typo "Inisde" corrected).
n_records = len(df)
n_pdf_names = df['pdf_name'].nunique()
print('We start with', n_records, 'records. Inside those records, there are', n_pdf_names, 'unique pdf names.')

We start with 29474 records. Inisde those records, there are 584 unique pdf names.


In [4]:
# Peek at the first five records.
df.head(n=5)

Unnamed: 0,pdf_name,num_pages,page,num_tables,omb_no,provider_id,survey_comp_date,provider_name,provider_address,form_id,event_id,facility_id
0,ARLINGTON HEALTH AND REHABILITATION YOGA12.pdf,3,1,0,,,,,,,,
1,ARLINGTON HEALTH AND REHABILITATION YOGA12.pdf,3,2,0,,,,,,,,
2,ARLINGTON HEALTH AND REHABILITATION YOGA12.pdf,3,3,1,Washington,WA18800,05/29/2018,,,,,
3,AVALON CARE CENTER - PULLMAN COMM11.pdf,1,1,1,0938-0391,505246,09/22/2017,AVALON CARE CENTER - PULLMAN,"NORTHWEST 1310 DEANE, PULLMAN, WA 99163",,,
4,AVALON HEALTH & REHABILITATION CENTER - PASCO ...,2,1,0,,,,,,,,


In [5]:
# Peek at the last five records.
df.tail(n=5)

Unnamed: 0,pdf_name,num_pages,page,num_tables,omb_no,provider_id,survey_comp_date,provider_name,provider_address,form_id,event_id,facility_id
29469,r willow springs care and rehab survey 4-5-201...,28,24,0,,,,,,,,
29470,r willow springs care and rehab survey 4-5-201...,28,25,0,,,,,,,,
29471,r willow springs care and rehab survey 4-5-201...,28,26,0,,,,,,,,
29472,r willow springs care and rehab survey 4-5-201...,28,27,0,,,,,,,,
29473,r willow springs care and rehab survey 4-5-201...,28,28,0,,,,,,,,


# Reducing

### Records with num_tables == 0

Let's confirm that records where 'num_tables' == 0 are nothing but noise.

In [6]:
# Extracted content fields; if all of them are NaN, a record carries no
# information beyond the pdf/page bookkeeping columns.
content_columns = [
    'omb_no', 'provider_id', 'survey_comp_date', 'provider_name',
    'provider_address', 'form_id', 'event_id', 'facility_id',
]
df_noise = df.loc[df['num_tables'] == 0, content_columns]

# True only when every content field of every zero-table record is NaN.
df_noise.isna().all().all()

True

Indeed they are. Away with them.

In [7]:
# The helper frame was only needed for the all-NaN check.
del df_noise

# Keep only the records that carry at least one table.
has_tables = df['num_tables'].gt(0)
df = df[has_tables]

# Consistency test: every surviving record should report exactly one table.
assert df['num_tables'].eq(1).all()

In [8]:
# Progress report after removing the zero-table records.
n_records = len(df)
n_pdf_names = df['pdf_name'].nunique()
print('We now have', n_records, 'records and', n_pdf_names, 'unique pdf names.')

We now have 14417 records and 342 unique pdf names.


### Standardizing form_id

In [9]:
# Frequency of each form_id value, with missing values counted as their own bucket.
form_id_counts = df['form_id'].value_counts(dropna=False)
form_id_counts

CMS-2567(02-99)    12641
NaN                 1004
6899                 770
CMS2567(0299)          2
Name: form_id, dtype: int64

It seems obvious there is only one correct form ID: **CMS-2567(02-99)**

There are a couple of cases where the only thing missing is the dashes, so we correct for that:

In [10]:
# Restore the dashes in the form ID; regex=False makes the parentheses match
# literally instead of being read as a regex group.
df = df.assign(
    form_id=df['form_id'].str.replace('CMS2567(0299)', 'CMS-2567(02-99)', regex=False)
)

In [11]:
# Re-count form_id values (NaN included) to confirm the dash-less variant is gone.
form_id_counts = df['form_id'].value_counts(dropna=False)
form_id_counts

CMS-2567(02-99)    12643
NaN                 1004
6899                 770
Name: form_id, dtype: int64

The cases where the value is **6899** are also incorrect. We deal with them later in the notebook.

In [12]:
# Standardizing form_id must not change the number of records or pdf names.
n_records = len(df)
n_pdf_names = df['pdf_name'].nunique()
print('We still have', n_records, 'records and', n_pdf_names, 'unique pdf names.')

We still have 14417 records and 342 unique pdf names.


### Records whose only difference is the page number

Next, we can eliminate all records that, except for the page number, are otherwise the same.

In [13]:
# Keep the first record of every group that matches on all columns except 'page'.
is_first_occurrence = ~df.drop(columns=['page']).duplicated()
df = df[is_first_occurrence]

# pdf names that still appear in more than one record, with their frequency,
# as a two-column frame ('pdf_name', 'freq') on a fresh 0..n-1 index.
pdf_name_counts = df['pdf_name'].value_counts()
repeated_pdf_names = (
    pdf_name_counts[pdf_name_counts > 1]
    .rename_axis('pdf_name')
    .reset_index(name='freq')
)

In [14]:
# Progress report after collapsing records that differ only by page number.
n_records = len(df)
n_pdf_names = df['pdf_name'].nunique()
n_repeated = len(repeated_pdf_names)
print('We are left with', n_records, 'records, containing', n_pdf_names, 'unique pdf names, out of which', n_repeated, 'appear more than once.')

We are left with 535 records, containing 342 unique pdf names, out of which 173 appear more than once.


### Records where 'omb_no' == 'Washington'

Let's look at the data of some random pdf names to see if we can detect some patterns:

In [15]:
# Inspect the records of one repeated pdf name.
# The draw is seeded so that Restart & Run All always shows the same example;
# change `random_state` to look at a different pdf.
x = repeated_pdf_names['pdf_name'].sample(n=1, random_state=0).iloc[0]
print(x)
df[df['pdf_name'] == x]

R RIVERSIDE NURSING & REHAB CTR complaint TNPN11 03-13-2019-st.pdf


Unnamed: 0,pdf_name,num_pages,page,num_tables,omb_no,provider_id,survey_comp_date,provider_name,provider_address,form_id,event_id,facility_id
17428,R RIVERSIDE NURSING & REHAB CTR complaint TNPN...,50,2,1,0938-0391,505358,03/13/2019,RIVERSIDE NURSING & REHAB CTR,"1305 ALEXANDER STREET, CENTRALIA, WA 98531",CMS-2567(02-99),TNPN11,WA18200
17472,R RIVERSIDE NURSING & REHAB CTR complaint TNPN...,50,46,1,Washington,WA18200,03/13/2019,,,6899,,


It seems likely those records where omb_no == 'Washington' are just noise. Let's confirm:

In [16]:
# Records whose 'omb_no' field reads 'Washington'.
df_noise = df.loc[df['omb_no'].eq('Washington')]

# Per column: True when that column is NaN in every one of those records.
descriptive_columns = ['provider_name', 'provider_address', 'form_id', 'event_id', 'facility_id']
df_noise.loc[:, descriptive_columns].isna().all()

provider_name        True
provider_address     True
form_id             False
event_id             True
facility_id          True
dtype: bool

Except for 'form_id', all fields are full of NaN values. Let's take a look at 'form_id' to see if its non-NaN values are also just noise.

In [17]:
# form_id values found among the 'Washington' records, NaN included.
noise_form_id_counts = df_noise['form_id'].value_counts(dropna=False)
noise_form_id_counts

6899    144
NaN      38
Name: form_id, dtype: int64

Indeed. We can do away with them:

In [18]:
# Drop the records whose 'omb_no' reads 'Washington' (NaN values are kept).
df = df.loc[df['omb_no'].ne('Washington')]

# Rebuild the table of pdf names that still appear more than once:
# columns 'pdf_name' and 'freq', on a fresh 0..n-1 index.
del repeated_pdf_names
pdf_name_counts = df['pdf_name'].value_counts()
repeated_pdf_names = (
    pdf_name_counts[pdf_name_counts > 1]
    .rename_axis('pdf_name')
    .reset_index(name='freq')
)

In [19]:
# Progress report after removing the 'Washington' records.
n_records = len(df)
n_pdf_names = df['pdf_name'].nunique()
n_repeated = len(repeated_pdf_names)
print('We are down to', n_records, 'records and', n_pdf_names, 'unique pdf names, out of which', n_repeated, 'appear more than once.')

We are down to 353 records and 333 unique pdf names, out of which 20 appear more than once.


### Repeated PDF names

Let's see how the df looks for only those repeated pdf names:

In [20]:
# All records that belong to a pdf name appearing more than once.
is_repeated = df['pdf_name'].isin(repeated_pdf_names['pdf_name'])
df_repeated = df.loc[is_repeated]
df_repeated

Unnamed: 0,pdf_name,num_pages,page,num_tables,omb_no,provider_id,survey_comp_date,provider_name,provider_address,form_id,event_id,facility_id
1794,R Aldercrest Health & Rehab Center complaint 0...,115,2,1,0938-0391,505236,09/13/2019,ALDERCREST HEALTH & REHAB CENTER,"21400 72ND AVENUE WEST, EDMONDS, WA 98026",,,
1903,R Aldercrest Health & Rehab Center complaint 0...,115,111,1,SURVEY,505236,9/13/2019,ALDERCREST HEALTH & REHAB CENTER,"21400 72ND AVENUE WEST, EDMONDS, WA",,,
2493,R Avalon Care Center - Pullman survey L2D611 7...,70,2,1,0938-0391,505246,07/27/2018,AVALON CARE CENTER - PULLMAN,"NORTHWEST 1310 DEANE, PULLMAN, WA 99163",CMS-2567(02-99),L2D611,WA21200
2560,R Avalon Care Center - Pullman survey L2D611 7...,70,69,1,SURVEY,505246,7/27/2018,AVALON CARE CENTER - PULLMAN,"NORTHWEST 1310 DEANE, PULLMAN, WA",,L2D611,
2563,R Avalon Care Center- Othello LLC PD8D11 Inspe...,49,2,1,0938-0391,505255,05/03/2019,AVALON CARE CENTER - OTHELLO LLC,"495 NORTH THIRTEENTH STREET, OTHELLO, WA 99344",CMS-2567(02-99),PD8D11,WA19800
2609,R Avalon Care Center- Othello LLC PD8D11 Inspe...,49,48,1,SURVEY,505255,5/3/2019,AVALON CARE CENTER - OTHELLO LLC,"495 NORTH THIRTEENTH STREET, OTHELLO, WA",,PD8D11,
3028,R Avamere Olympia Rehabilitation of Sequim com...,82,2,1,0938-0391,505327,12/27/2018,AVAMERE OLYMPIC REHABILITATION OF SEQUIM,"1000 5TH AVENUE SOUTH, SEQUIM, WA 98382",CMS-2567(02-99),FI7N12,WA26500
3032,R Avamere Olympia Rehabilitation of Sequim com...,82,6,1,0938-0391,505327,10/24/2018,AVAMERE OLYMPIC REHABILITATION OF SEQUIM,"1000 5TH AVENUE SOUTH, SEQUIM, WA 98382",CMS-2567(02-99),FI7N11,WA26500
4145,R Bethany at Pacific complaint U3RT11 03-06-20...,93,2,1,0938-0391,505404,05/01/2019,BETHANY AT PACIFIC,"916 PACIFIC AVENUE 3RD-5TH FLOORS, EVERETT, WA...",CMS-2567(02-99),U3RT12,WA02200
4149,R Bethany at Pacific complaint U3RT11 03-06-20...,93,6,1,0938-0391,505404,03/06/2019,BETHANY AT PACIFIC,"916 PACIFIC AVENUE 3RD-5TH FLOORS, EVERETT, WA...",CMS-2567(02-99),U3RT11,WA02200


The first thing that looks clear is that the three records where 'omb_no' == 'SURVEY' are noise.

In [21]:
# Drop the records whose 'omb_no' reads 'SURVEY' (NaN values are kept).
df = df.loc[df['omb_no'].ne('SURVEY')]

# Rebuild the table of pdf names that still appear more than once:
# columns 'pdf_name' and 'freq', on a fresh 0..n-1 index.
pdf_name_counts = df['pdf_name'].value_counts()
repeated_pdf_names = (
    pdf_name_counts[pdf_name_counts > 1]
    .rename_axis('pdf_name')
    .reset_index(name='freq')
)

In [22]:
# Progress report after removing the 'SURVEY' records.
n_records = len(df)
n_pdf_names = df['pdf_name'].nunique()
n_repeated = len(repeated_pdf_names)
print('We are further down to', n_records, 'records and', n_pdf_names, 'unique pdf names, out of which', n_repeated, 'appear more than once.')

We are further down to 350 records and 333 unique pdf names, out of which 17 appear more than once.


In [23]:
# Persist the reduced data set; the index (original row labels) is written
# as the first, unnamed CSV column.
reduced_contents_path = '../output_data/reports_contents/nhf_inspections_contents_perifery_reduced.csv'
df.to_csv(reduced_contents_path)