In [75]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import plotly.express as px

from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

In [53]:
# Load the wrangled HAI dataset.
# Forward slashes are used because the previous backslash-separated literal
# relied on the invalid escape sequences "\d" and "\H" (a warning on recent
# Python versions) and only resolved on Windows; this form works everywhere.
HAI_tidy = pd.read_csv('./data/HAI_tidy_Wrangled.csv')
HAI_tidy.shape

(3096, 77)

In [54]:
# Keep only rows that have a value for HAI_2_SIR_Score, the column later
# used as the regression target.
HAI_tidy = HAI_tidy.dropna(subset=['HAI_2_SIR_Score'])
HAI_tidy.shape

(2219, 77)

In [55]:
# Remove the purely informational contact/location columns, then summarise
# the remaining columns and their non-null counts.
HAI_tidy = HAI_tidy.drop(columns=['Phone Number', 'Location'])
HAI_tidy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2219 entries, 0 to 3095
Data columns (total 75 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Facility ID                   2219 non-null   object 
 1   HAI_1_CILOWER                 1630 non-null   float64
 2   HAI_1_CIUPPER                 1936 non-null   float64
 3   HAI_1_DOPC                    2217 non-null   float64
 4   HAI_1_ELIGCASES               2217 non-null   float64
 5   HAI_1_NUMERATOR               2217 non-null   float64
 6   HAI_1_SIR_Score               1936 non-null   float64
 7   HAI_2_CILOWER                 1883 non-null   float64
 8   HAI_2_CIUPPER                 2219 non-null   float64
 9   HAI_2_DOPC                    2219 non-null   float64
 10  HAI_2_ELIGCASES               2219 non-null   float64
 11  HAI_2_NUMERATOR               2219 non-null   float64
 12  HAI_2_SIR_Score               2219 non-null   float64
 13  HAI

In [39]:
# Drop the upper and lower limit columns (names containing CILOWER or CIUPPER)
# from the HAI data.
# NOTE(review): the recorded outputs further down still list *_CILOWER /
# *_CIUPPER columns, so this cell appears not to have been part of the
# recorded run -- confirm by re-running the notebook top to bottom.
HAI_tidy = HAI_tidy.drop(columns=HAI_tidy.filter(regex='CILOWER|CIUPPER').columns)

In [57]:
# Restrict the frame to integer and floating-point columns only.
# NOTE(review): 'ZIP Code' is stored as a number and therefore survives this
# filter -- confirm it is meant to be used as a numeric feature.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
HAI_tidy = HAI_tidy.select_dtypes(include=numerics)
HAI_tidy.columns

Index(['HAI_1_CILOWER', 'HAI_1_CIUPPER', 'HAI_1_DOPC', 'HAI_1_ELIGCASES',
       'HAI_1_NUMERATOR', 'HAI_1_SIR_Score', 'HAI_2_CILOWER', 'HAI_2_CIUPPER',
       'HAI_2_DOPC', 'HAI_2_ELIGCASES', 'HAI_2_NUMERATOR', 'HAI_2_SIR_Score',
       'HAI_3_CILOWER', 'HAI_3_CIUPPER', 'HAI_3_DOPC', 'HAI_3_ELIGCASES',
       'HAI_3_NUMERATOR', 'HAI_3_SIR_Score', 'HAI_4_CILOWER', 'HAI_4_CIUPPER',
       'HAI_4_DOPC', 'HAI_4_ELIGCASES', 'HAI_4_NUMERATOR', 'HAI_4_SIR_Score',
       'HAI_5_CILOWER', 'HAI_5_CIUPPER', 'HAI_5_DOPC', 'HAI_5_ELIGCASES',
       'HAI_5_NUMERATOR', 'HAI_5_SIR_Score', 'HAI_6_CILOWER', 'HAI_6_CIUPPER',
       'HAI_6_DOPC', 'HAI_6_ELIGCASES', 'HAI_6_NUMERATOR', 'HAI_6_SIR_Score',
       'ZIP Code', 'H_CLEAN_STAR_RATING', 'H_COMP_1_STAR_RATING',
       'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING',
       'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING',
       'H_HSP_RATING_STAR_RATING', 'H_QUIET_STAR_RATING',
       'H_RECMND_STAR_RATING', 'H_STAR_RATING', '

In [58]:
# Hold out 30% of the rows as a test set (fixed seed for repeatability).
# HAI_2_SIR_Score is the target; every other remaining column is a feature.
# NOTE(review): the other HAI_2_* columns look like they come from the same
# measure as the target -- confirm they are not leaking it into the features.
X_train, X_test, y_train, y_test = train_test_split(
    HAI_tidy.drop('HAI_2_SIR_Score', axis='columns'),
    HAI_tidy['HAI_2_SIR_Score'],
    test_size=0.3,
    random_state=47,
)

In [60]:
# Baseline reference value: the average of the training target.
train_mean = y_train.mean()
train_mean

0.7137044430135226

In [61]:
# Fit a baseline that always predicts the mean of the training target.
dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)

# Take both sets of predictions from the fitted estimator so the train and
# test baselines are produced the same way (previously the test predictions
# were hand-built from a separately computed `train_mean`).
y_tr_pred = dumb_reg.predict(X_train)
y_te_pred = dumb_reg.predict(X_test)

Check the "mean" model metrics

In [62]:
# R^2 of the current predictions: (train, test)
(
    r2_score(y_train, y_tr_pred),
    r2_score(y_test, y_te_pred),
)

(0.0, -0.0010222673257951342)

In [63]:
# Mean absolute error of the current predictions: (train, test)
(
    mean_absolute_error(y_train, y_tr_pred),
    mean_absolute_error(y_test, y_te_pred),
)

(0.4233626112183842, 0.4496509574610026)

In [64]:
# Mean squared error of the current predictions: (train, test)
(
    mean_squared_error(y_train, y_tr_pred),
    mean_squared_error(y_test, y_te_pred),
)

(0.32846713157716884, 0.3541035498299305)

Simplistic regression with imputation, taken from the guided capstone

In [65]:
# Column-wise medians of the training features; these serve as the fill
# values for missing entries in both splits.
X_defaults_median = X_train.median(axis=0)
X_defaults_median

HAI_1_CILOWER         0.2250
HAI_1_CIUPPER         1.6555
HAI_1_DOPC         4544.0000
HAI_1_ELIGCASES       3.8595
HAI_1_NUMERATOR       2.0000
                     ...    
SEP_1                59.0000
SEP_SH_3HR           86.0000
SEP_SH_6HR           70.0000
SEV_SEP_3HR          81.0000
SEV_SEP_6HR          90.0000
Length: 62, dtype: float64

In [66]:
# Impute missing feature values in the train and test splits with the
# training-set medians, producing `X_tr` and `X_te`.
X_tr, X_te = (
    split.fillna(X_defaults_median) for split in (X_train, X_test)
)

In [67]:
# Fit a StandardScaler on the imputed training features only, then apply
# that same fitted scaling to both the train and the test split.
scaler = StandardScaler().fit(X_tr)
X_tr_scaled, X_te_scaled = scaler.transform(X_tr), scaler.transform(X_te)

# Peek at the top-left corner of the scaled training matrix.
X_tr_scaled[:5, :5]

array([[-0.59173428,  0.22088663, -0.36822895, -0.44907312, -0.37431823],
       [-0.20804527,  0.63770751, -0.658202  , -0.61764412, -0.59876465],
       [-0.20804527, -0.1497946 , -0.56681782, -0.56301718, -0.59876465],
       [-0.51429246,  0.55715863, -0.54568913, -0.49137681, -0.37431823],
       [-0.20804527, -1.34473325,  0.63127259,  0.27689495, -0.59876465]])

In [68]:
# Ordinary least-squares fit on the scaled, imputed training features.
lm = LinearRegression().fit(X=X_tr_scaled, y=y_train)

In [69]:
# Predict with the fitted linear model on the scaled train and test
# features, overwriting `y_tr_pred` and `y_te_pred`.
y_tr_pred, y_te_pred = map(lm.predict, (X_tr_scaled, X_te_scaled))

In [71]:
# R^2 for the median-imputed linear model: (train, test)
median_r2 = (
    r2_score(y_train, y_tr_pred),
    r2_score(y_test, y_te_pred),
)
median_r2

(0.8846634838634866, 0.8589095100010494)

In [72]:
# Mean absolute error for the median-imputed linear model: (train, test)
median_mae = (
    mean_absolute_error(y_train, y_tr_pred),
    mean_absolute_error(y_test, y_te_pred),
)
median_mae

(0.13727833787715046, 0.15456243429613584)

In [73]:
# Mean squared error for the median-imputed linear model: (train, test)
median_mse = (
    mean_squared_error(y_train, y_tr_pred),
    mean_squared_error(y_test, y_te_pred),
)

# Display the square root of each MSE (i.e. the RMSE); `median_mse` itself
# keeps the un-rooted values.
np.sqrt(median_mse)

array([0.19463878, 0.22340462])