# Nursing Home Data Analysis using PySpark ML
Using NH 2567 deficiency tags data, merge with Facility-level Characteristics to study `(TBD)`  
https://www.kaggle.com/code/fatmakursun/pyspark-ml-tutorial-for-beginners  

**Date: June 1, 2024**  
**Author: Sandeep**  


In [1]:
%load_ext jupyter_black

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import when, col, udf, pandas_udf, PandasUDFType
import pyspark.sql.functions as F
from collections import Counter
from pyspark.sql.types import *
import statsmodels.api as sm
import numpy as np

In [3]:
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

In [31]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import (
    VectorAssembler,
    StandardScaler,
    StringIndexer,
    OneHotEncoder,
)
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
# Visualization
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_colwidth", 400)

from matplotlib import rcParams

sns.set(context="notebook", style="whitegrid", rc={"figure.figsize": (18, 4)})
rcParams["figure.figsize"] = 18, 4

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [6]:
# setting random seed for notebook reproducability
rnd_seed = 23
np.random.seed = rnd_seed
np.random.set_state = rnd_seed

In [7]:
spark = SparkSession.builder.appName("NH_data_analysis").getOrCreate()
spark.sql("set spark.sql.legacy.timeParserPolicy=CORRECTED")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/04 12:22:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[key: string, value: string]

In [8]:
sc = spark.sparkContext
sc

In [9]:
nh_data = spark.read.parquet("nh_insp_char_2019_2022")

                                                                                

In [10]:
nh_data.count()

60668

In [11]:
nh_data = nh_data.dropDuplicates()
nh_data_d = nh_data.dropna(how="any")
nh_data_d.count()

24/06/04 12:22:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

44867

## ML experiments

The test-drive outcome is the `number of tags` that a facility receives per year - more tags is bad. Using a cross-section of data from 2022, which spans all states, and NHs from all states that have been inspected.

In [145]:
n2022 = nh_data_d.filter(nh_data_d.insp_year == 2022)

In [146]:
featureCols = [
    "state",
    "no_of_tags_peryear",
    "severe_tags_applied",
    "fac_times_inpsected",
    "count_nh_insp_st_yr",
    "ownership",
    "bedcert",
    "restot",
    # "chow_last_12mos",
    "resfamcouncil",
    "aidhrd",
    "vochrd",
    "rnhrd",
    # "totlichrd",
    # "tothrd",
    "cm_aide",
    "cm_lpn",
    "cm_rn",
    # "adj_aide",
    # "adj_lpn",
    # "adj_rn",
]

In [147]:
n2022 = n2022.select(featureCols)

In [148]:
# inps = ["ownership", "chow_last_12mos", "resfamcouncil", "state"]
# outs = ["ownIndex", "lst12Index", "resfamIndx", "stIndex"]

inps = ["ownership", "resfamcouncil", "state"]
outs = ["ownIndex", "resfamIndx", "stIndex"]

indexer = StringIndexer(inputCols=inps, outputCols=outs)
# indexed.show()

In [149]:
n2022_t = indexer.fit(n2022).transform(n2022)

                                                                                

In [150]:
featureCols.remove("state")
featureCols.remove("resfamcouncil")
featureCols.remove("ownership")

In [151]:
featureCols += outs

In [152]:
len(featureCols)  # + len(st_names)

15

In [153]:
featureCols

['no_of_tags_peryear',
 'severe_tags_applied',
 'fac_times_inpsected',
 'count_nh_insp_st_yr',
 'bedcert',
 'restot',
 'aidhrd',
 'vochrd',
 'rnhrd',
 'cm_aide',
 'cm_lpn',
 'cm_rn',
 'ownIndex',
 'resfamIndx',
 'stIndex']

In [154]:
n2022_t = n2022_t.select(featureCols)

In [155]:
# target col is "no_of_tags_peryear"
featureCols.remove("no_of_tags_peryear")

In [157]:
encoder = OneHotEncoder(inputCols=["stIndex"], outputCols=["stIndex2"])
model = encoder.fit(n2022_t)
n2022_t = model.transform(n2022_t)
# encoded.show()

In [159]:
featureCols.remove("stIndex")

In [160]:
assembler = VectorAssembler(
    inputCols=featureCols + ["stIndex2"],
    outputCol="features",
)

In [161]:
assembled_df = assembler.transform(n2022_t.na.drop())

In [162]:
# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

In [163]:
# Fit the DataFrame to the scaler
scaled_df = standardScaler.fit(assembled_df).transform(assembled_df)

                                                                                

In [164]:
# scaled_df.select(["features", "features_scaled"]).show()

In [165]:
# Inspect the result
# scaled_df.select("features", "features_scaled").show(10, truncate=False)

In [166]:
train_data, test_data = scaled_df.randomSplit([0.8, 0.2], seed=rnd_seed)

In [44]:
train_data.head()

                                                                                

Row(no_of_tags_peryear=1, severe_tags_applied=0, fac_times_inpsected=1, count_nh_insp_st_yr=15, bedcert=67, restot=66.2, aidhrd=2.42317, vochrd=0.33724, rnhrd=1.90859, cm_aide=1.9972, cm_lpn=0.70855, cm_rn=0.3291, ownIndex=0.0, resfamIndx=0.0, stIndex=49.0, stIndex2=SparseVector(52, {49: 1.0}), features=SparseVector(66, {1: 1.0, 2: 15.0, 3: 67.0, 4: 66.2, 5: 2.4232, 6: 0.3372, 7: 1.9086, 8: 1.9972, 9: 0.7086, 10: 0.3291, 13: 49.0, 63: 1.0}), features_scaled=SparseVector(66, {1: 0.5335, 2: 0.0461, 3: 1.1251, 4: 1.4146, 5: 4.1827, 6: 0.9324, 7: 4.4687, 8: 12.4074, 9: 8.5604, 10: 2.8153, 13: 4.2442, 63: 27.8747}))

In [167]:
# Initialize `lr`
lr = LinearRegression(
    featuresCol="features_scaled",
    labelCol="no_of_tags_peryear",
    predictionCol="pr_no_of_tags_peryear",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    standardization=False,
)

In [168]:
linearModel = lr.fit(train_data)

                                                                                

In [169]:
linearModel.coefficients.toArray(), 0, linearModel.intercept

(array([ 2.3457793 ,  2.57524982, -0.10826511,  0.        ,  0.        ,
        -0.11830393,  0.        , -0.12504005,  0.        ,  0.        ,
         0.        , -0.07487621,  0.        , -0.3070102 ,  0.        ,
         0.        , -0.0359105 ,  0.1049263 ,  0.        ,  0.        ,
         0.26635876,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.15359992,  0.        ,  0.261833  ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.30943394,  0.138388  ,  0.        ,  0.        ,  0.        ,
        -0.10954851,  0.02858452,  0.        ,  0.01627647,  0.34803056,
        -0.02597418,  0.19275402,  0.        ,  0.        ,  0.09022829,
         0.        ,  0.        ,  0.        ,  0.22451154,  0.06393095,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]),
 0,
 4.071445574081553)

In [170]:
len(["intercept"] + featureCols + st_names)

66

In [None]:
st_names = pd.DataFrame.from_records(
    n2022_t.select("stIndex2").schema[0].metadata["ml_attr"]["attrs"]["binary"]
)["name"].to_list()

In [174]:
coeff_df = pd.DataFrame(
    {
        "Features": ["intercept"] + featureCols + st_names,
        "Co-efficients": np.insert(
            linearModel.coefficients.toArray(), 0, linearModel.intercept
        ),
    }
)
coeff_df = coeff_df[["Features", "Co-efficients"]]

In [178]:
coeff_df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65
Features,intercept,severe_tags_applied,fac_times_inpsected,count_nh_insp_st_yr,bedcert,restot,aidhrd,vochrd,rnhrd,cm_aide,cm_lpn,cm_rn,ownIndex,resfamIndx,TX,CA,OH,IL,PA,FL,IN,MI,MO,NY,IA,NC,MN,WI,MA,LA,GA,KS,TN,AR,OK,NJ,VA,WA,NE,CO,CT,SC,AZ,MS,OR,WV,AL,MD,ME,KY,NM,RI,SD,NH,NV,UT,ND,MT,HI,WY,VT,ID,DE,AK,DC,PR
Co-efficients,4.071446,2.345779,2.57525,-0.108265,0.0,0.0,-0.118304,0.0,-0.12504,0.0,0.0,0.0,-0.074876,0.0,-0.30701,0.0,0.0,-0.03591,0.104926,0.0,0.0,0.266359,0.0,0.0,0.0,0.0,-0.1536,0.0,0.261833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.309434,0.138388,0.0,0.0,0.0,-0.109549,0.028585,0.0,0.016276,0.348031,-0.025974,0.192754,0.0,0.0,0.090228,0.0,0.0,0.0,0.224512,0.063931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
predictions = linearModel.transform(test_data)

In [180]:
predandlabels = predictions.select("pr_no_of_tags_peryear", "no_of_tags_peryear")

In [181]:
predandlabels.show()



+---------------------+------------------+
|pr_no_of_tags_peryear|no_of_tags_peryear|
+---------------------+------------------+
|    4.916221030969628|                 1|
|    4.896801203638635|                 1|
|    4.674808507206015|                 1|
|    4.656281836282518|                 1|
|    4.472395093864433|                 1|
|     4.76982263707548|                 1|
|   4.7489794612979805|                 1|
|     3.85533027488467|                 1|
|   3.8648587461438493|                 1|
|   4.4662897347663515|                 1|
|    7.335722528744217|                 1|
|   4.7184271007616285|                 1|
|    4.902871205925292|                 1|
|    4.744696730258871|                 1|
|    4.714747484826725|                 1|
|    4.914179403287346|                 1|
|    4.967096986023156|                 1|
|    4.835774233522725|                 1|
|    4.176707499803331|                 1|
|   6.7186629463795455|                 1|
+----------

                                                                                

In [182]:
# Get the RMSE
print("RMSE: {0}".format(linearModel.summary.rootMeanSquaredError))
print("MAE: {0}".format(linearModel.summary.meanAbsoluteError))
# Get the R2
print("R2: {0}".format(linearModel.summary.r2))

RMSE: 5.343239933126266
MAE: 3.91668541843798
R2: 0.40941305767644687


In [183]:
evaluator = RegressionEvaluator(
    predictionCol="pr_no_of_tags_peryear",
    labelCol="no_of_tags_peryear",
    metricName="rmse",
)
print("RMSE: {0}".format(evaluator.evaluate(predandlabels)))



RMSE: 5.256142291936482


                                                                                

## learn

In [278]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

In [279]:
data = [
    (Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
    (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
    (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
    (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),),
]
df = spark.createDataFrame(data, ["features"])

In [280]:
df.show()

                                                                                

+--------------------+
|            features|
+--------------------+
|(4,[0,3],[1.0,-2.0])|
|   [4.0,5.0,0.0,3.0]|
|   [6.0,7.0,0.0,8.0]|
| (4,[0,3],[9.0,1.0])|
+--------------------+



In [281]:
r1 = Correlation.corr(df, "features").head()

24/06/03 14:11:01 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.


In [282]:
print("Pearson correlation matrix:\n" + str(r1[0]))

Pearson correlation matrix:
DenseMatrix([[1.        , 0.05564149,        nan, 0.40047142],
             [0.05564149, 1.        ,        nan, 0.91359586],
             [       nan,        nan, 1.        ,        nan],
             [0.40047142, 0.91359586,        nan, 1.        ]])


In [283]:
r2 = Correlation.corr(df, "features", "spearman").head()


print("Spearman correlation matrix:\n" + str(r2[0]))

Spearman correlation matrix:
DenseMatrix([[1.        , 0.10540926,        nan, 0.4       ],
             [0.10540926, 1.        ,        nan, 0.9486833 ],
             [       nan,        nan, 1.        ,        nan],
             [0.4       , 0.9486833 ,        nan, 1.        ]])


24/06/03 14:11:27 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.


In [290]:
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

df = sc.parallelize(
    [
        Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
        Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0)),
    ]
).toDF()

# create summarizer for multiple metrics "mean" and "count"
summarizer = Summarizer.metrics("mean", "count")

# compute statistics for multiple metrics with weight
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

# compute statistics for multiple metrics without weight
df.select(summarizer.summary(df.features)).show(truncate=False)

# compute statistics for single metric "mean" with weight
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

# compute statistics for single metric "mean" without weight
df.select(Summarizer.mean(df.features)).show(truncate=False)

                                                                                

+-----------------------------------+
|aggregate_metrics(features, weight)|
+-----------------------------------+
|{[1.0,1.0,1.0], 1}                 |
+-----------------------------------+

+--------------------------------+
|aggregate_metrics(features, 1.0)|
+--------------------------------+
|{[1.0,1.5,2.0], 2}              |
+--------------------------------+

+--------------+
|mean(features)|
+--------------+
|[1.0,1.0,1.0] |
+--------------+

+--------------+
|mean(features)|
+--------------+
|[1.0,1.5,2.0] |
+--------------+



In [292]:
df.show()

                                                                                

+------+-------------+
|weight|     features|
+------+-------------+
|   1.0|[1.0,1.0,1.0]|
|   0.0|[1.0,2.0,3.0]|
+------+-------------+



                                                                                