In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline
import matplotlib 
import numpy as np

# Location of the raw data CSV.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file when running the notebook elsewhere.
RAW_CSV_PATH = "C:/Users/dtcarroll/Desktop/2016 Oct SAS pack/Raw.csv"

# Read in the raw data CSV.
raw = pd.read_csv(RAW_CSV_PATH)

# Prepare the data to train a classification model: the target is the
# "Star Rating" column, the features are every remaining column except the
# PROVIDER_ID column.
Y = raw["Star Rating"]
X = raw.drop(["Star Rating", "PROVIDER_ID"], axis=1)

# Split data into train/test (33% held out, fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)
print(X_train.shape)
print(X_test.shape)

# Most ML solutions do not handle NaN values effectively - replace them with
# the mean of the column.
# NOTE(review): `Imputer` only exists in older scikit-learn releases; newer
# releases provide `sklearn.impute.SimpleImputer` instead - confirm against
# the installed version before upgrading.
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# The column means are learned from the training split only and then applied
# to both splits, so no statistics from the test split leak into the
# preprocessing. (A second imputer fitted on X_test used to be created here
# but was never applied; it has been removed.)
imp = imp.fit(X_train)
X_train = imp.transform(X_train)
print(X_train.shape)
X_test = imp.transform(X_test)
print(X_test.shape)

(3053, 127)
(1504, 127)
(3053L, 127L)
(1504L, 127L)


In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import  RidgeClassifierCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_rand


# Candidate classifiers. Every model is fitted inside the evaluation loop
# below, so none of them is fitted at construction time (the gradient
# boosting model used to be fitted here as well, i.e. trained twice).
clf0 = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=1)

clf1 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                  max_depth=1, random_state=0)

# NOTE(review): no random_state is passed here, unlike the other seeded
# models - results may vary slightly between runs; consider seeding.
clf2 = AdaBoostClassifier(n_estimators=200)

clf3 = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto', leaf_size=30, p=2,
                            metric='minkowski', metric_params=None, n_jobs=1)

clf4 = RandomForestClassifier(n_estimators=200, max_depth=None,
                              min_samples_split=2, random_state=0)

# Named clf5 (previously `cl5`) to follow the clfN pattern of its siblings.
clf5 = RidgeClassifierCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False,
                         scoring=None, cv=None, class_weight=None)

# Hard-voting ensemble over the MLP, gradient boosting and AdaBoost models.
eclf = VotingClassifier(estimators=[('MLP', clf0), ('GB', clf1), ('ADA', clf2)], voting='hard')

# Keep each estimator next to its display label so the two cannot drift apart.
labelled_models = [
    (clf0, 'Multi Layer Perceptron'),
    (clf1, 'Gradient Boosting'),
    (clf2, 'Ada Boost'),
    (clf3, 'K-Nearest Neighbors'),
    (clf4, 'Random Forest Classifier'),
    (clf5, 'Ridge Classifier - CV'),
    (eclf, 'Ensemble'),
]

# Fit each model on the training split and report its accuracy on the
# held-out test split.
for clf, label in labelled_models:
    clf.fit(X_train, y_train)
    predict = clf.predict(X_test)
    score = accuracy_score(y_test, predict)
    print("Accuracy: %0.2f [%s]" % (score, label))

Accuracy: 0.55 [Multi Layer Perceptron]
Accuracy: 0.76 [Gradient Boosting]
Accuracy: 0.59 [Ada Boost]
Accuracy: 0.52 [K-Nearest Neighbors]
Accuracy: 0.76 [Random Forest Classifier]
Accuracy: 0.67 [Ridge Classifier - CV]
Accuracy: 0.62 [Ensemble]


In [23]:
# Gradient Boosting and Random Forest gave the best accuracy in the model
# comparison, so let's look at Gradient Boosting in particular - this time
# with max_depth = 2 instead of 1.

clf1 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                  max_depth=2, random_state=0)

# Fit exactly once (the model used to be fitted at construction and then
# fitted again on the same data, doubling the training time for no benefit).
clf1.fit(X_train, y_train)
predict = clf1.predict(X_test)
score = accuracy_score(y_test, predict)

# %-formatting with the print(...) call form works on both Python 2 and
# Python 3 and matches the print calls used in the rest of the notebook.
print("Accuracy :  %s" % score)

Accuracy :  0.792553191489


In [24]:
# Run the fitted model over the whole dataset.
# NOTE: most of these rows were part of the training split, so agreement
# between these predictions and the actual rating is not an unbiased accuracy
# estimate - use the held-out test score for that.
#
# The imputed copy gets its own name instead of overwriting X, so X keeps
# referring to the original feature DataFrame and this cell can be re-run
# without changing its input.
X_all_imputed = imp.transform(X)
final_predict = clf1.predict(X_all_imputed)

In [25]:
# Number of predictions produced; expected to equal the number of rows in raw.
len(final_predict)

4557

In [26]:
# Number of rows in the raw data, for comparison with len(final_predict).
len(raw)

4557

In [27]:
# Store the predictions as a new column on raw (modifies raw in place).
# NOTE(review): this relies on final_predict being in the same row order as raw.
raw['Predicted Star Rating'] = final_predict

In [28]:
# Preview the first rows, now including the 'Predicted Star Rating' column.
raw.head()

Unnamed: 0,PROVIDER_ID,HAI_1_DEN,HAI_2_DEN,HAI_3_DEN,HAI_4_DEN,HAI_5_DEN,HAI_6_DEN,HAI_1,HAI_2,HAI_3,...,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,AMI_7A,AMI_7A_DEN,CAC_3,CAC_3_DEN,Star Rating,Predicted Star Rating
0,10001,9.198,20.321,5.347,1.721,7.111,47.018,1.74,1.427,1.87,...,0.042,278.0,0.151,273.0,,,,,3,3
1,10005,3.767,10.965,2.594,,1.441,17.293,1.062,0.456,0.386,...,,,,,,,,,3,3
2,10006,17.577,23.383,3.226,,3.173,33.115,0.341,0.641,0.31,...,0.041,139.0,0.152,134.0,,,,,2,2
3,10007,,1.547,,,,3.699,,0.0,,...,,,,,,,,,3,3
4,10008,,,,,,1.711,,,,...,,,,,,,,,3,3


In [29]:
# Select the row(s) of raw whose PROVIDER_ID equals 20026.
provider_mask = raw['PROVIDER_ID'] == 20026
anmc = raw.loc[provider_mask]

In [30]:
# Display the selected provider row(s), including actual and predicted star rating.
anmc


Unnamed: 0,PROVIDER_ID,HAI_1_DEN,HAI_2_DEN,HAI_3_DEN,HAI_4_DEN,HAI_5_DEN,HAI_6_DEN,HAI_1,HAI_2,HAI_3,...,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,AMI_7A,AMI_7A_DEN,CAC_3,CAC_3_DEN,Star Rating,Predicted Star Rating
95,20026,5.819,5.277,4.554,,1.974,24.518,0.687,0.948,1.098,...,,,,,,,0.92,12.0,2,2
