In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import plotly.express as px

from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

In [2]:
#Import data
HAI_tidy = pd.read_csv('.\data\HAI_tidy_Wrangled.csv')
HAI_tidy.shape

(3096, 77)

In [3]:
#Drop anything with NA in the predictor col
HAI_tidy.dropna(subset=['HAI_5_SIR_Score'], inplace=True)
HAI_tidy.shape

(1715, 77)

In [4]:
#Drop info cols
HAI_tidy.drop(['Phone Number', 'Location'], axis="columns", inplace=True)
HAI_tidy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1715 entries, 0 to 3095
Data columns (total 75 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Facility ID                   1715 non-null   object 
 1   HAI_1_CILOWER                 1484 non-null   float64
 2   HAI_1_CIUPPER                 1675 non-null   float64
 3   HAI_1_DOPC                    1712 non-null   float64
 4   HAI_1_ELIGCASES               1712 non-null   float64
 5   HAI_1_NUMERATOR               1712 non-null   float64
 6   HAI_1_SIR_Score               1675 non-null   float64
 7   HAI_2_CILOWER                 1568 non-null   float64
 8   HAI_2_CIUPPER                 1689 non-null   float64
 9   HAI_2_DOPC                    1712 non-null   float64
 10  HAI_2_ELIGCASES               1712 non-null   float64
 11  HAI_2_NUMERATOR               1712 non-null   float64
 12  HAI_2_SIR_Score               1689 non-null   float64
 13  HAI

In [5]:
#Drop upper and lower limts from HAI data
HAI_tidy = HAI_tidy[HAI_tidy.columns.drop(HAI_tidy.filter(regex='CILOWER|CIUPPER|DOPC|ELIG|NUMERATOR|STAR').columns )]

In [6]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

HAI_tidy = HAI_tidy.select_dtypes(include=numerics)
HAI_tidy.columns

Index(['HAI_1_SIR_Score', 'HAI_2_SIR_Score', 'HAI_3_SIR_Score',
       'HAI_4_SIR_Score', 'HAI_5_SIR_Score', 'HAI_6_SIR_Score', 'ZIP Code',
       'H_CLEAN_LINEAR_SCORE', 'H_COMP_1_LINEAR_SCORE',
       'H_COMP_2_LINEAR_SCORE', 'H_COMP_3_LINEAR_SCORE',
       'H_COMP_5_LINEAR_SCORE', 'H_COMP_6_LINEAR_SCORE',
       'H_COMP_7_LINEAR_SCORE', 'H_HSP_RATING_LINEAR_SCORE',
       'H_QUIET_LINEAR_SCORE', 'H_RECMND_LINEAR_SCORE', 'SEP_1', 'SEP_SH_3HR',
       'SEP_SH_6HR', 'SEV_SEP_3HR', 'SEV_SEP_6HR'],
      dtype='object')

In [17]:
X_train, X_test, y_train, y_test = train_test_split(HAI_tidy.drop(columns='HAI_5_SIR_Score'), 
                                                    HAI_tidy.HAI_5_SIR_Score, test_size=0.3, 
                                                    random_state=47)
print(X_train.columns) #removed target

Index(['HAI_1_SIR_Score', 'HAI_2_SIR_Score', 'HAI_3_SIR_Score',
       'HAI_4_SIR_Score', 'HAI_6_SIR_Score', 'ZIP Code',
       'H_CLEAN_LINEAR_SCORE', 'H_COMP_1_LINEAR_SCORE',
       'H_COMP_2_LINEAR_SCORE', 'H_COMP_3_LINEAR_SCORE',
       'H_COMP_5_LINEAR_SCORE', 'H_COMP_6_LINEAR_SCORE',
       'H_COMP_7_LINEAR_SCORE', 'H_HSP_RATING_LINEAR_SCORE',
       'H_QUIET_LINEAR_SCORE', 'H_RECMND_LINEAR_SCORE', 'SEP_1', 'SEP_SH_3HR',
       'SEP_SH_6HR', 'SEV_SEP_3HR', 'SEV_SEP_6HR'],
      dtype='object')


In [8]:
#Code task 4#
#Calculate the mean of `y_train`
train_mean = y_train.mean()
train_mean

0.8090583333333325

In [9]:
#Fit the dummy regressor on the training data
dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)
y_tr_pred = dumb_reg.predict(X_train)
y_te_pred = train_mean * np.ones(len(y_test))

Check the "mean" model metrics

In [10]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.0, -0.0010541157440056015)

In [11]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(0.49731080555555557, 0.45229762135922325)

In [12]:
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(0.4560547265972222, 0.38023078641248653)

Simplistic regression with imputation, taken from the guided capstone

##### These are the values we'll use to fill in any missing values.  
X_defaults_median = X_train.median()
X_defaults_median

In [13]:
#Call `X_train` and `X_test`'s `fillna()` method, passing `X_defaults_median` as the values to use
#Assign the results to `X_tr` and `X_te`, respectively
X_tr = X_train.fillna(X_defaults_median)
X_te = X_test.fillna(X_defaults_median)

NameError: name 'X_defaults_median' is not defined

In [None]:
#Call the StandardScaler`s fit method on `X_tr` to fit the scaler
#then use it's `transform()` method to apply the scaling to both the train and test split
#data (`X_tr` and `X_te`), naming the results `X_tr_scaled` and `X_te_scaled`, respectively
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)
X_tr_scaled[:5,:5]

In [None]:
lm = LinearRegression().fit(X_tr_scaled, y_train)

In [None]:
#Code task 11#
#Call the `predict()` method of the model (`lm`) on both the (scaled) train and test data
#Assign the predictions to `y_tr_pred` and `y_te_pred`, respectively
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

In [None]:
# r^2 - train, test
median_r2 = r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)
median_r2

In [None]:
#Code task 12#
#Now calculate the mean absolute error scores using `sklearn`'s `mean_absolute_error` function
# as we did above for R^2
# MAE - train, test
median_mae = mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)
median_mae

In [None]:
#Code task 13#
#And also do the same using `sklearn`'s `mean_squared_error`
# MSE - train, test
median_mse = mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)
np.sqrt(median_mse)