In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Notebook-wide pandas configuration.
# NOTE: setting `chained_assignment` to None silences pandas'
#       SettingWithCopyWarning globally, so writes to a copy go unreported.
pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# NOTE: Older scikit-learn releases do not provide
#       `SimpleImputer.get_feature_names_out`, which is needed to get the
#       feature names out of the ColumnTransformer (all of the other
#       transformers support this method).  The shim is installed only when
#       the method is missing, so a native implementation is never overwritten.
if not hasattr(SimpleImputer, 'get_feature_names_out'):
    SimpleImputer.get_feature_names_out = (
        lambda self, names=None: self.feature_names_in_
    )

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names.  Prefer the new name and fall back to
# the legacy one on older installations.
_DARK_STYLE = 'seaborn-v0_8-dark'
plt.style.use(_DARK_STYLE if _DARK_STYLE in plt.style.available else 'seaborn-dark')

In [None]:
# Read the provider-information CSV (path is relative to the working
# directory) and print the frame's column / dtype / non-null summary.
provider_info_path = 'NH_ProviderInfo_Jan2022.csv'
provider_info_df = pd.read_csv(provider_info_path)
provider_info_df.info()

In [None]:
# NOTE(review): this repeats the `.info()` summary already printed by the
# loading cell directly above; it produces the same output.
provider_info_df.info()

In [None]:
# Tiny two-sentence sample frame; the cell displays its single text column
# as a NumPy array.
content = [
    'Disk info : How to format, my hard disk',
    'Hard disk format problems',
]
content_df = pd.DataFrame({'some_text': content})
content_df['some_text'].to_numpy()

In [None]:
# Fit a TF-IDF vectorizer on the 'Legal Business Name' column, densify the
# resulting term matrix into a DataFrame (one column per vocabulary term) and
# show summary statistics for each term.
vectorizer = TfidfVectorizer(min_df=5, max_df=100, stop_words='english')
legal_names = provider_info_df['Legal Business Name'].to_numpy()
tfidf_matrix = vectorizer.fit_transform(legal_names)
content_prepared = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
content_prepared.describe()

In [None]:
def make_bow_transformers(bow_fields, min_df=5, max_df=100):
    """Yield one ``(name, transformer, column)`` entry per text column.

    Parameters
    ----------
    bow_fields : iterable
        Column names; each one gets its own ``TfidfVectorizer``.
    min_df, max_df :
        Forwarded unchanged to every ``TfidfVectorizer``.

    Yields
    ------
    tuple
        ``("bow_<column>", TfidfVectorizer(...), column)``, in the order the
        columns appear in ``bow_fields``.
    """
    yield from (
        (
            f"bow_{column}",
            TfidfVectorizer(min_df=min_df, max_df=max_df, stop_words='english'),
            column,
        )
        for column in bow_fields
    )

In [None]:
# Text columns that each get their own TF-IDF encoding.
bow_fields = ['Provider Name', 'Legal Business Name']

# Only the bag-of-words transformers are registered; every other column of
# the input frame is dropped.
bow_transformers = list(make_bow_transformers(bow_fields))
ct = ColumnTransformer(transformers=bow_transformers, remainder='drop')

# Fit on the full provider frame, densify, and label the columns with the
# names reported by the fitted transformer.
bow_features = ct.fit_transform(provider_info_df)
X_prepared = pd.DataFrame(
    data=bow_features.toarray(),
    columns=ct.get_feature_names_out(),
)
X_prepared.head()

In [None]:
# Columns fed to the preprocessing pipelines below.
cat_fields = ['Provider State', 'Ownership Type']
num_fields = ['Number of Certified Beds']

# Categorical columns: one-hot encode; `handle_unknown='ignore'` keeps
# transform from failing on categories that were not seen during fit.
cat_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)
# Numeric columns: fill missing values with the column median, then standardise.
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)
ct = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('cats', cat_transformer, cat_fields),
        ('nums', num_transformer, num_fields)
    ]
)

# ColumnTransformer only returns a sparse matrix when the stacked output's
# density is below its `sparse_threshold`; otherwise it returns a dense
# ndarray, which has no `.toarray()`.  Densify only when it is needed so the
# cell works in both cases.
encoded = ct.fit_transform(provider_info_df)
if hasattr(encoded, 'toarray'):
    encoded = encoded.toarray()
foo = pd.DataFrame(encoded, columns=ct.get_feature_names_out())
foo.describe()

In [None]:

# Summary statistics for the total-fines column.
fines_total = provider_info_df['Total Amount of Fines in Dollars']
fines_total.describe()

In [None]:
# Histogram of total fines over all providers, split into ten bins.
fines_total = provider_info_df['Total Amount of Fines in Dollars']
sns.histplot(fines_total, bins=10)

Note: the total-fines column probably contains outliers, which would be worth removing later.

In [None]:
# Re-plot the fines histogram using only the rows whose total fines are
# strictly below 300,000 dollars.
is_below_cutoff = provider_info_df['Total Amount of Fines in Dollars'] < 300000
foo = provider_info_df[is_below_cutoff]
sns.histplot(foo['Total Amount of Fines in Dollars'], bins=10)

In [None]:
# How often each value occurs in the residents-per-day footnote column.
residents_footnote = provider_info_df['Average Number of Residents per Day Footnote']
residents_footnote.value_counts()

In [None]:
# Columns removed to build the "prepared" frame.  `drop` returns a new frame,
# so `provider_info_df` itself keeps all of its columns.
dropped_cols = [
    'Provider Address',
    'Provider Phone Number',
    'Provider County Name',
    'Rating Cycle 1 Standard Survey Health Date',
    'Rating Cycle 2 Standard Health Survey Date',
    'Rating Cycle 3 Standard Health Survey Date',
    'Number of Fines',
    'Total Number of Penalties',
    'Location',
    'Processing Date',
]
provider_info_prepared_df = provider_info_df.drop(columns=dropped_cols)

In [None]:
# Columns treated as categorical, listed one per line.
cat_cols = [
    # Location and provider attributes
    'Provider City',
    'Provider State',
    'Provider Zip Code',
    'Provider SSA County Code',
    'Ownership Type',
    'Provider Type',
    'Provider Resides in Hospital',
    'Continuing Care Retirement Community',
    # Status flags
    'Special Focus Status',
    'Abuse Icon',
    'Most Recent Health Inspection More Than 2 Years Ago',
    'Provider Changed Ownership in Last 12 Months',
    'With a Resident and Family Council',
    'Automatic Sprinkler Systems in All Required Areas',
    # Footnote columns
    'Long-Stay QM Rating Footnote',
    'Short-Stay QM Rating Footnote',
    'Staffing Rating Footnote',
    'RN Staffing Rating Footnote',
    'Reported Staffing Footnote',
    'Physical Therapist Staffing Footnote',
    'Total nursing staff turnover footnote',
    'Registered Nurse turnover footnote',
    'Administrator turnover footnote',
]
# Cast every column listed in `cat_cols` to the pandas `category` dtype,
# writing the result back into `provider_info_df`, then print the frame
# summary again so the new dtypes are visible.
provider_info_df[cat_cols] = provider_info_df[cat_cols].astype('category')
provider_info_df.info()

In [None]:
# Names of the columns that remain once the categorical ones are removed.
provider_info_df.drop(columns=cat_cols).columns

In [None]:
# Columns treated as numeric, listed one per line.
num_cols = [
    # Capacity and ratings
    'Number of Certified Beds',
    'Average Number of Residents per Day',
    'Overall Rating',
    'Health Inspection Rating',
    'QM Rating',
    'Long-Stay QM Rating',
    'Short-Stay QM Rating',
    'Staffing Rating',
    'RN Staffing Rating',
    # Reported staffing
    'Reported Nurse Aide Staffing Hours per Resident per Day',
    'Reported LPN Staffing Hours per Resident per Day',
    'Reported RN Staffing Hours per Resident per Day',
    'Reported Licensed Staffing Hours per Resident per Day',
    'Reported Total Nurse Staffing Hours per Resident per Day',
    'Total number of nurse staff hours per resident per day on the weekend',
    'Registered Nurse hours per resident per day on the weekend',
    'Reported Physical Therapist Staffing Hours per Resident Per Day',
    # Turnover
    'Total nursing staff turnover',
    'Registered Nurse turnover',
    'Number of administrators who have left the nursing home',
    # Case-mix staffing
    'Case-Mix Nurse Aide Staffing Hours per Resident per Day',
    'Case-Mix LPN Staffing Hours per Resident per Day',
    'Case-Mix RN Staffing Hours per Resident per Day',
    'Case-Mix Total Nurse Staffing Hours per Resident per Day',
    # Adjusted staffing
    'Adjusted Nurse Aide Staffing Hours per Resident per Day',
    'Adjusted LPN Staffing Hours per Resident per Day',
    'Adjusted RN Staffing Hours per Resident per Day',
    'Adjusted Total Nurse Staffing Hours per Resident per Day',
    # Rating cycle 1
    'Rating Cycle 1 Total Number of Health Deficiencies',
    'Rating Cycle 1 Number of Standard Health Deficiencies',
    'Rating Cycle 1 Number of Complaint Health Deficiencies',
    'Rating Cycle 1 Health Deficiency Score',
    'Rating Cycle 1 Number of Health Revisits',
    'Rating Cycle 1 Health Revisit Score',
    'Rating Cycle 1 Total Health Score',
    # Rating cycle 2
    'Rating Cycle 2 Total Number of Health Deficiencies',
    'Rating Cycle 2 Number of Standard Health Deficiencies',
    'Rating Cycle 2 Number of Complaint Health Deficiencies',
    'Rating Cycle 2 Health Deficiency Score',
    'Rating Cycle 2 Number of Health Revisits',
    'Rating Cycle 2 Health Revisit Score',
    'Rating Cycle 2 Total Health Score',
    # Rating cycle 3
    'Rating Cycle 3 Total Number of Health Deficiencies',
    'Rating Cycle 3 Number of Standard Health Deficiencies',
    'Rating Cycle 3 Number of Complaint Health Deficiencies',
    'Rating Cycle 3 Health Deficiency Score',
    'Rating Cycle 3 Number of Health Revisits',
    'Rating Cycle 3 Health Revisit Score',
    'Rating Cycle 3 Total Health Score',
    # Overall survey score, incidents, complaints and citations
    'Total Weighted Health Survey Score',
    'Number of Facility Reported Incidents',
    'Number of Substantiated Complaints',
    'Number of Citations from Infection Control Inspections',
]



In [None]:
# Pairwise correlation of the numeric/boolean columns, then each column's
# correlation with the fines total, strongest positive first.
# Non-numeric columns are excluded explicitly: recent pandas releases no
# longer drop them silently inside `DataFrame.corr()` and raise instead.
numeric_df = provider_info_df.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_df.corr()
corr_matrix['Total Amount of Fines in Dollars'].sort_values(ascending=False)

In [None]:
# Read the state / US averages CSV and preview its first rows.
state_averages_path = 'NH_StateUSAverages_Jan2022.csv'
state_averages_df = pd.read_csv(state_averages_path)
state_averages_df.head()

In [None]:
# Read the MDS quality-measure CSV and preview its first rows.
# `low_memory=False` makes pandas parse the file in one pass instead of in
# chunks when inferring column dtypes.
quality_mds_path = 'NH_QualityMsr_MDS_Jan2022.csv'
quality_mds_df = pd.read_csv(quality_mds_path, low_memory=False)
quality_mds_df.head()

In [None]:
# Mean of every numeric column per 'Federal Provider Number', followed by
# summary statistics of those per-provider means.
# `numeric_only=True` is passed explicitly because recent pandas releases
# raise on non-numeric columns instead of silently dropping them.
provider_means = quality_mds_df.groupby('Federal Provider Number').mean(numeric_only=True)
provider_means.describe()

In [None]:
import numpy as np

# Draw a 5x3 matrix of standard-normal samples from a seeded generator so the
# displayed values are the same on every Restart & Run All.
rng = np.random.default_rng(42)
rng.standard_normal((5, 3))