In [1]:
import pandas as pd
import re
import os
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import yapf.yapflib.yapf_api

## Combining the different spreadsheets into one

In [None]:
all_nh_data = pd.DataFrame()
for csv in os.listdir("merged_data"):
    if "text" in csv:
        file_name = "merged_data/{}".format(csv)
        nh_1 = pd.read_csv(file_name, index_col=0, dtype=str)
        nh = pd.concat([all_nh_data, nh_1])

### Saving inspection text separately

In [53]:
nh_inspection = nh[['facility_id', 'eventid',
                    'inspection_text', 'deficiency_tag', 'scope_severity']]
nh_insp = pa.Table.from_pandas(nh_inspection)
pq.write_table(nh_insp, "nh_insp_txt.parquet")

## Clean up and data type conversions

In [5]:
nh.drop(columns=['address', 'facility_name',
        'phone', 'inspection_text'], inplace=True)
nh.drop(columns=['overall_rating', 'overall_rating_fn', 'survey_rating',
                 'survey_rating_fn', 'quality_rating', 'quality_rating_fn',
                 'ls_quality_rating', 'ls_quality_rating_fn', 'ss_quality_rating',
                 'ss_quality_rating_fn', 'staffing_rating', 'staffing_rating_fn',
                 'rn_staffing_rating', 'rn_staffing_rating_fn'],
        inplace=True)

In [6]:
nh['zip'] = nh['zip'].astype(int)
nh['filedate'] = pd.to_datetime(nh['filedate'])
nh['inspection_date'] = pd.to_datetime(nh['inspection_date'])
nh['participation_date'] = pd.to_datetime(nh['participation_date'])
nh['ccrc_facil'] = nh['ccrc_facil'].astype(bool)
cols2int = ['cmplnt_cnt', 'fine_cnt',
            'fine_tot', 'payden_cnt', 'tot_penlty_cnt']
cols2int += ['aidhrd', 'vochrd', 'rnhrd',
             'totlichrd', 'tothrd', 'pthrd', 'incident_cnt']
cols2int += ['snf vbp ranking', 'achievement score',
             'improvement score', 'incentive payment multiplier']
cols2int += ['bedcert', 'restot', 'deficiency_tag']
nh[cols2int] = nh[cols2int].apply(pd.to_numeric)

In [77]:
nh.iloc[:,32:41]=nh.iloc[:,32:41].apply(pd.to_numeric, errors='coerce',axis=1)

**View of subsets of columns \n----------\n")**

In [None]:
def fncy(x): return x+10 if (x % 10) > 0 else x


i = 0
print("View of subsets of columns \n----------\n")
for col in range(0, fncy(len(nh.columns)), 10):
    if col != 0:
        print(f"^^^Columns from ^^^{i}-->{col}^^^^")
        print("----------")
        print(nh.iloc[:2, i:col])
        print("\n")
    i = col

In [159]:
nh_data = pa.Table.from_pandas(nh)
pq.write_table(nh_data, "nh_data_subset.parquet")

## Explore NH data - working with nh_data_subset.parquet file

In [None]:
nh1=pq.read_table("nh_data_subset.parquet")

def_tag = pd.read_csv(
    '/Users/sandeep/Downloads/NH_CitationDescriptions_Mar2023.csv', encoding='latin-1')

def_tag_pa = pa.Table.from_pandas(def_tag)

def_tag_pa['Deficiency Tag Number'].type

def_tag.head()

nh1 = nh1.append_column(
    'year_inspection', pa.compute.year(nh1['inspection_date']))

nh1 = nh1.append_column('deficiency_tag1', pa.compute.cast(
    nh1['deficiency_tag'], pa.int64()))

nh1 = nh1.join(def_tag_pa, 'deficiency_tag1', 'Deficiency Tag Number')

nh1.group_by(['year_inspection']
             ).aggregate([("bedcert", "count")]
                         ).combine_chunks().sort_by('year_inspection'
                                                    ).to_pandas()

a_table = nh1.group_by(['Deficiency Category', 'year_inspection']
                       ).aggregate([("bedcert", "count")]
                                   ).combine_chunks().sort_by('year_inspection'
                                                              ).to_pandas()

#a_table[a_table['year_inspection']==2019].plot(x='Deficiency Category', y='bedcert_count', kind='bar')

a_table.iloc[20:40]

pd.pivot_table(a_table, index='Deficiency Category',
               columns='year_inspection',
               values='bedcert_count')

a_table.sort_values(['bedcert_count'], ascending=False)

In [408]:
nh_pd = nh1.to_pandas()

In [418]:
nh_pd.loc[(nh_pd['Deficiency Category'] == "Quality of Life and Care Deficiencies") &
          (nh_pd['year_inspection'] == 2015), 'facility_id'].unique().shape

(1242,)

In [435]:
nh1 = nh1.append_column('Deficiency Category_new', nh1.column(
    'Deficiency Category').dictionary_encode())

In [436]:
nh1.schema

facility_id: string
state: string
zip: int64
inspection_date: timestamp[us]
deficiency_tag: string
scope_severity: string
complaint: string
standard: string
eventid: string
county_ssa: string
county_name: string
ownership: string
bedcert: double
restot: double
certification: string
inhosp: string
lbn: string
participation_date: timestamp[us]
ccrc_facil: bool
sffstatus: string
oldsurvey: string
chow_last_12mos: string
resfamcouncil: string
sprinkler_status: string
staffing_flag: string
pt_staffing_flag: string
aidhrd: double
vochrd: double
rnhrd: double
totlichrd: double
tothrd: double
pthrd: double
cm_aide: double
cm_lpn: double
cm_rn: double
cm_total: double
adj_aide: double
adj_lpn: double
adj_rn: double
adj_total: double
weighted_all_cycles_score: double
incident_cnt: double
cmplnt_cnt: double
fine_cnt: double
fine_tot: double
payden_cnt: double
tot_penlty_cnt: double
filedate: timestamp[us]
snf vbp ranking: double
achievement score: double
improvement score: double
incentive paymen

## working with Quality of Life and Care Deficiencies

In [515]:
nh_qc=nh1.filter(pa.compute.equal(nh1['Deficiency Category_new'],  
                                  "Quality of Life and Care Deficiencies"))

In [510]:
nh_qc=nh_qc.remove_column(nh_qc.schema.get_field_index('Deficiency Category'))
nh_qc=nh_qc.remove_column(nh_qc.schema.get_field_index('deficiency_tag'))

In [517]:
p_nh_qc=nh_qc.to_pandas()

In [553]:
th_yr = pd.pivot_table(p_nh_qc, index='state', columns='year_inspection', 
               values='Deficiency Category_new', aggfunc='count', 
               margins=True).sort_values("All",
                                         ascending=False).iloc[:,3:6]

In [598]:
grwth1716=(th_yr[2017]/th_yr[2016]-1)*100
grwth1817=(th_yr[2018]/th_yr[2017]-1)*100

In [603]:
grwth = pd.DataFrame(data=[grwth1716, grwth1817]).transpose()
# grwth=pd.DataFrame(, columns=['grwth1716'])
mst_ch = pd.merge(th_yr, grwth, left_index=True, right_index=True)

In [613]:
mst_ch.sort_values(2018, ascending=False)

Unnamed: 0_level_0,2016,2017,2018,0,1
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
All,20864.0,24837.0,30271.0,19.04237,21.878649
CA,2384.0,3021.0,3632.0,26.719799,20.225091
IL,1246.0,1881.0,2832.0,50.963082,50.558214
OH,915.0,1161.0,2028.0,26.885246,74.677003
TX,1162.0,1543.0,1848.0,32.788296,19.766688
PA,1031.0,1368.0,1539.0,32.686712,12.5
MI,731.0,1162.0,1459.0,58.960328,25.55938
IN,834.0,1143.0,1408.0,37.05036,23.184602
WA,535.0,782.0,1107.0,46.168224,41.560102
FL,845.0,875.0,1016.0,3.550296,16.114286


In [620]:
pa.compute.value_counts(nh_qc['year_inspection'])

<pyarrow.lib.StructArray object at 0x7fbe618161a0>
-- is_valid: all not null
-- child 0 type: int64
  [
    2016,
    2017,
    2018,
    2019,
    2015,
    2014,
    2013
  ]
-- child 1 type: int64
  [
    20864,
    24837,
    30271,
    5918,
    2807,
    59,
    2
  ]

<pyarrow.lib.ChunkedArray object at 0x7fbdf2ba06d0>
[
  [
    2016,
    2017,
    2017,
    2018,
    2016,
    ...
    2017,
    2018,
    2018,
    2018,
    2018
  ],
  [
    2019,
    2019,
    2019,
    2019,
    2015,
    ...
    2016,
    2017,
    2017,
    2018,
    2019
  ],
...,
  [
    2016,
    2016,
    2016,
    2016,
    2017,
    ...
    2016,
    2016,
    2017,
    2017,
    2017
  ],
  [
    2016,
    2016,
    2017,
    2017,
    2017,
    ...
    2016,
    2018,
    2016,
    2018,
    2018
  ]
]