In [1]:
# Common imports
import pandas as pd
import numpy as np
import os
import csv
from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Seed NumPy's global random number generator so that this notebook's output
# is stable across runs from a fresh kernel.
np.random.seed(197)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# 1) Data Processing
# a) Import the data: only keep numeric data.
# Read the provider file, keep the numeric columns, and then discard the two
# numeric columns PHONE and COUNTY_SSA.
data = pd.read_csv("ProviderInfo.csv")
dat1 = (
    data
    .select_dtypes(include='number')
    .drop(columns=['PHONE', 'COUNTY_SSA'])
)
dat1.head(10)

Unnamed: 0,ZIP,BEDCERT,RESTOT,OVERALL_RATING,SURVEY_RATING,QUALITY_RATING,STAFFING_RATING,RN_STAFFING_RATING,AIDHRD,VOCHRD,...,ADJ_AIDE,ADJ_LPN,ADJ_RN,ADJ_TOTAL,INCIDENT_CNT,CMPLNT_CNT,FINE_CNT,FINE_TOT,PAYDEN_CNT,TOT_PENLTY_CNT
0,35653.0,57.0,51.5,5.0,5.0,5.0,4.0,4.0,3.43572,1.16495,...,3.11741,1.2475,0.83853,5.13047,0.0,0.0,0.0,0.0,0.0,0.0
1,35150.0,85.0,74.2,3.0,3.0,5.0,1.0,1.0,,,...,,,,,0.0,0.0,1.0,15259.0,1.0,2.0
2,35768.0,50.0,,1.0,2.0,2.0,1.0,1.0,,,...,,,,,0.0,0.0,0.0,0.0,0.0,0.0
3,35206.0,92.0,79.8,2.0,2.0,4.0,3.0,3.0,2.32722,0.82104,...,2.40074,0.86962,0.56463,3.83026,0.0,1.0,0.0,0.0,0.0,0.0
4,35111.0,103.0,98.1,3.0,3.0,4.0,3.0,2.0,2.33617,0.92407,...,2.55126,1.08955,0.3036,3.95709,0.0,0.0,0.0,0.0,0.0,0.0
5,35611.0,149.0,119.7,5.0,3.0,5.0,4.0,3.0,2.57869,1.01443,...,2.56783,1.04823,0.46444,4.07866,0.0,1.0,0.0,0.0,0.0,0.0
6,36025.0,124.0,96.0,5.0,4.0,5.0,3.0,4.0,1.99985,0.62768,...,2.12102,0.70311,0.75448,3.52979,1.0,1.0,0.0,0.0,0.0,0.0
7,35045.0,201.0,136.6,3.0,2.0,3.0,5.0,5.0,2.64483,0.86299,...,2.48334,0.98515,0.93497,4.28916,0.0,0.0,0.0,0.0,0.0,0.0
8,35611.0,170.0,137.6,1.0,1.0,3.0,3.0,2.0,2.66372,1.09313,...,2.53573,1.18122,0.38165,4.09048,0.0,0.0,0.0,0.0,0.0,0.0
9,36092.0,121.0,112.9,3.0,3.0,5.0,1.0,1.0,2.39901,0.99975,...,2.38173,1.08929,0.26696,3.76695,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# b) Remove any rows that have a NaN value.
# Build the cleaned frame in a single non-mutating chain: `dat1` is left
# exactly as the previous cell displayed it, so this cell can be re-run
# safely and always produces the same `cleaned_df`.
cleaned_df = (
    dat1
    .replace(["NaN", 'NaT'], np.nan)   # normalise textual placeholders to real NaN
    .dropna(how='any', axis=0)         # drop every row with at least one missing value
)
cleaned_df.head(10)

Unnamed: 0,ZIP,BEDCERT,RESTOT,OVERALL_RATING,SURVEY_RATING,QUALITY_RATING,STAFFING_RATING,RN_STAFFING_RATING,AIDHRD,VOCHRD,...,ADJ_AIDE,ADJ_LPN,ADJ_RN,ADJ_TOTAL,INCIDENT_CNT,CMPLNT_CNT,FINE_CNT,FINE_TOT,PAYDEN_CNT,TOT_PENLTY_CNT
0,35653.0,57.0,51.5,5.0,5.0,5.0,4.0,4.0,3.43572,1.16495,...,3.11741,1.2475,0.83853,5.13047,0.0,0.0,0.0,0.0,0.0,0.0
3,35206.0,92.0,79.8,2.0,2.0,4.0,3.0,3.0,2.32722,0.82104,...,2.40074,0.86962,0.56463,3.83026,0.0,1.0,0.0,0.0,0.0,0.0
4,35111.0,103.0,98.1,3.0,3.0,4.0,3.0,2.0,2.33617,0.92407,...,2.55126,1.08955,0.3036,3.95709,0.0,0.0,0.0,0.0,0.0,0.0
5,35611.0,149.0,119.7,5.0,3.0,5.0,4.0,3.0,2.57869,1.01443,...,2.56783,1.04823,0.46444,4.07866,0.0,1.0,0.0,0.0,0.0,0.0
6,36025.0,124.0,96.0,5.0,4.0,5.0,3.0,4.0,1.99985,0.62768,...,2.12102,0.70311,0.75448,3.52979,1.0,1.0,0.0,0.0,0.0,0.0
7,35045.0,201.0,136.6,3.0,2.0,3.0,5.0,5.0,2.64483,0.86299,...,2.48334,0.98515,0.93497,4.28916,0.0,0.0,0.0,0.0,0.0,0.0
8,35611.0,170.0,137.6,1.0,1.0,3.0,3.0,2.0,2.66372,1.09313,...,2.53573,1.18122,0.38165,4.09048,0.0,0.0,0.0,0.0,0.0,0.0
9,36092.0,121.0,112.9,3.0,3.0,5.0,1.0,1.0,2.39901,0.99975,...,2.38173,1.08929,0.26696,3.76695,0.0,0.0,0.0,0.0,0.0,0.0
10,35674.0,109.0,78.9,4.0,4.0,2.0,4.0,5.0,1.58755,0.55164,...,1.84108,0.6387,1.26169,3.65833,0.0,0.0,0.0,0.0,0.0,0.0
12,36535.0,154.0,122.3,4.0,2.0,5.0,4.0,5.0,1.94081,0.60088,...,2.0891,0.68041,0.9912,3.70024,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# c) Split into train / test set using an 80/20 split.
# The seed is pinned on the call itself (same value as the NumPy seed set in
# the first cell) so that re-running this cell on its own always reproduces
# the same partition instead of depending on the state of the global random
# number generator at the moment the cell happens to be executed.
features = cleaned_df.drop(columns='OVERALL_RATING')
target = cleaned_df['OVERALL_RATING']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=197
)

In [15]:
# d) Scale all input features.
# Fit the scaler on the training features and wrap the scaled array in a
# DataFrame that carries the feature names (the frame gets a fresh 0..n-1 index).
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train)
X_train_norm = pd.DataFrame(train_scaled_values, columns=X_train.columns.tolist())
X_train_norm.head(10)

Unnamed: 0,ZIP,BEDCERT,RESTOT,SURVEY_RATING,QUALITY_RATING,STAFFING_RATING,RN_STAFFING_RATING,AIDHRD,VOCHRD,RNHRD,...,ADJ_AIDE,ADJ_LPN,ADJ_RN,ADJ_TOTAL,INCIDENT_CNT,CMPLNT_CNT,FINE_CNT,FINE_TOT,PAYDEN_CNT,TOT_PENLTY_CNT
0,1.029334,-0.158091,-0.828219,0.145672,0.823997,-0.96692,-0.292591,-1.995858,0.388789,-0.48563,...,-1.622387,0.640139,-0.484452,-1.005118,0.255041,-0.173447,0.719005,0.277398,-0.278219,0.474697
1,0.006643,-1.111541,-1.125039,-1.409187,-1.631336,-0.134157,1.28191,-1.384977,-1.284875,0.392617,...,-1.286131,-1.223552,0.985258,-0.963293,0.575425,0.622529,2.000427,1.238041,-0.278219,1.501988
2,1.489125,-1.325246,-1.395222,-1.409187,-0.812892,1.531369,1.28191,0.287078,-1.029505,5.326699,...,0.513368,-1.421891,3.895242,2.360976,-0.065342,-0.571435,-0.562417,-0.262013,-0.278219,-0.552593
3,-1.449884,6.614698,7.733899,1.700531,0.823997,-0.134157,-0.292591,0.497013,-0.660159,0.146074,...,-0.110655,-1.146983,-0.257236,-0.668896,-0.385726,-0.571435,-0.562417,-0.262013,-0.278219,-0.552593
4,1.566316,-0.799204,-0.725474,0.923102,-0.812892,-1.799683,-1.867091,0.707242,0.990511,-0.395884,...,0.694836,1.096696,-0.450913,0.668684,-0.385726,-0.438772,-0.562417,-0.262013,-0.278219,-0.552593
5,-0.263787,-0.289601,-0.128028,-1.409187,-0.812892,-0.96692,-0.292591,-0.806549,-0.627744,-0.003864,...,-0.949105,-0.906741,-0.282903,-1.092737,-0.385726,0.887854,0.719005,3.651453,-0.278219,0.474697
6,0.384172,0.614533,0.490347,0.923102,-1.631336,-0.134157,0.494659,-0.874466,0.173857,0.140179,...,-1.189994,0.106285,0.179175,-0.699013,-0.065342,0.357204,-0.562417,-0.262013,-0.278219,-0.552593
7,-1.399728,0.236441,0.509374,0.923102,0.823997,-0.134157,-0.292591,-0.43909,0.396371,-0.303622,...,-0.362763,0.310635,-0.478993,-0.331974,-0.385726,-0.571435,-0.562417,-0.262013,-0.278219,-0.552593
8,0.951098,0.532339,0.298175,-0.631758,0.005552,-1.799683,-1.867091,-0.249351,0.983946,-0.930296,...,-0.427284,0.506727,-1.203473,-0.718239,0.255041,0.091878,0.719005,0.298418,-0.278219,0.474697
9,-0.681477,-0.371795,-0.263119,0.923102,0.823997,-0.96692,-1.079841,-1.069946,0.539467,-0.782932,...,-0.771932,0.712536,-0.889399,-0.652939,-0.385726,-0.571435,-0.562417,-0.262013,-0.278219,-0.552593


In [17]:
# 2. Model #1: Logistic Regression
# a) Continue from step d in Problem 1 (same scaled training data): build a
#    LogisticRegression() model that predicts "OVERALL_RATING".
log_reg = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    random_state=0,
)
log_reg.fit(X_train_norm, y_train)

# Mean accuracy on the data the model was trained on.
log_reg.score(X_train_norm, y_train)

0.8530699871189351

In [18]:
# b) For error evaluation, start by calculating the score (returns the mean accuracy).
# Scale the test set with the scaler that was fitted on the training data so
# both sets go through exactly the same transformation. Re-deriving the
# statistics with pandas does not match it: DataFrame.std() defaults to the
# sample standard deviation (ddof=1) whereas StandardScaler uses the
# population one (ddof=0), and dividing by the std of a zero-variance column
# would yield NaN/inf.
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,   # same feature names the model was fitted with
    index=X_test.index,       # keep the original row labels of the test set
)
log_reg.score(X_test_scaled, y_test)

0.8396291208791209

In [26]:
# c) Calculate the confusion matrix and classification report (both are in sklearn.metrics)
# Predict on the scaled test set, compute both summaries, then print them.
y_pred = pd.DataFrame(log_reg.predict(X_test_scaled))
conf_mat = metrics.confusion_matrix(y_test, y_pred)
class_report = metrics.classification_report(y_test, y_pred)
print(conf_mat)
print(class_report)

[[253  96   0   0   0]
 [ 56 409  73   0   0]
 [  0  64 320  81   0]
 [  0   0  23 632  49]
 [  0   0   0  25 831]]
             precision    recall  f1-score   support

        1.0       0.82      0.72      0.77       349
        2.0       0.72      0.76      0.74       538
        3.0       0.77      0.69      0.73       465
        4.0       0.86      0.90      0.88       704
        5.0       0.94      0.97      0.96       856

avg / total       0.84      0.84      0.84      2912



In [29]:
# 3. Model #2: PCA(n_components = 2) + Logistic Regression
# a) We will now transform the X_train & X_test data using PCA with 2 components.

# import PCA object from sklearn
from sklearn.decomposition import PCA

# limit PCA object to 2 components; the seed keeps the decomposition
# repeatable if a randomized SVD solver is selected
pca_two = PCA(n_components=2, random_state=0)

# Fit the PCA once and project the training data in the same call. Fitting a
# second time is redundant and, with a randomized solver, could produce
# components that differ from the ones used to build X_train_pca.
X_train_pca = pca_two.fit_transform(X_train_norm)

# `pca_fit` is the name used later to transform the test data; it refers to
# the same fitted estimator.
pca_fit = pca_two

In [30]:
# b) Fit a Logistic Regression model on the PCA-transformed training data (X_train_pca).
unfitted_pca_clf = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    random_state=0,
)
log_reg_pca = unfitted_pca_clf.fit(X_train_pca, y_train)

In [43]:
# c) Calculate the same error metrics as those from Model #1.
# Project the scaled test set with the fitted PCA, predict, and then report
# accuracy, confusion matrix and classification report in that order.
X_test_PCA = pca_fit.transform(X_test_scaled)
y_pred_PCA = pd.DataFrame(log_reg_pca.predict(X_test_PCA))

accuracy_pca = log_reg_pca.score(X_test_PCA, y_test)
conf_mat_pca = metrics.confusion_matrix(y_test, y_pred_PCA)
class_report_pca = metrics.classification_report(y_test, y_pred_PCA)

print(accuracy_pca)
print(conf_mat_pca)
print(class_report_pca)

0.3784340659340659
[[130 189   1  10  19]
 [ 98 253   1  29 157]
 [ 36 193   2  34 200]
 [ 23 191   2  46 442]
 [  4 136   1  44 671]]
             precision    recall  f1-score   support

        1.0       0.45      0.37      0.41       349
        2.0       0.26      0.47      0.34       538
        3.0       0.29      0.00      0.01       465
        4.0       0.28      0.07      0.11       704
        5.0       0.45      0.78      0.57       856

avg / total       0.35      0.38      0.31      2912



In [45]:
# 4. Model #3: PCA(n_components = 16) + Logistic Regression
# a) We will now transform the X_train & X_test data using PCA with 16 components.
# limit PCA object to 16 components; the seed keeps the decomposition
# repeatable if a randomized SVD solver is selected
pca_16 = PCA(n_components=16, random_state=0)

# Fit the PCA once and project the training data in the same call. Fitting a
# second time is redundant and, with a randomized solver, could produce
# components that differ from the ones used to build X_train_pca16.
X_train_pca16 = pca_16.fit_transform(X_train_norm)
pca_fit16 = pca_16  # same fitted estimator, kept under its existing name

# b) Then use the transformed data (X_train_pca16) to fit a Logistic Regression model.
log_reg_pca16 = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train_pca16, y_train)

# predict values with 16 component PCA
X_test_PCA16 = pca_fit16.transform(X_test_scaled)
y_pred_PCA16 = pd.DataFrame(log_reg_pca16.predict(X_test_PCA16))

# c) Calculate the same error metrics as those from Model #1
print(log_reg_pca16.score(X_test_PCA16, y_test))
print(metrics.confusion_matrix(y_test, y_pred_PCA16))
print(metrics.classification_report(y_test, y_pred_PCA16))

0.8334478021978022
[[254  95   0   0   0]
 [ 53 418  67   0   0]
 [  0  52 342  71   0]
 [  0   0  34 608  62]
 [  0   0   0  51 805]]
             precision    recall  f1-score   support

        1.0       0.83      0.73      0.77       349
        2.0       0.74      0.78      0.76       538
        3.0       0.77      0.74      0.75       465
        4.0       0.83      0.86      0.85       704
        5.0       0.93      0.94      0.93       856

avg / total       0.83      0.83      0.83      2912



5. Between Model #2 and Model #3, which performed the best? 
Model #3 performed much better than Model #2. Model #2 kept only 2 orthogonal (principal) components, which resulted in an overall test accuracy of less than 40% (about 0.38), which isn't very good. In Model #3, however, we reduced the number of inputs from 27 to 16 principal components, and the model performed almost the same as Model #1 (test accuracy of about 0.83 versus 0.84).