## ML with Medicare data

# scikit-learn + EC2

Recommended instance type: c5.4xlarge

# Download data

In [2]:
# Copy the sparse matrix (.npz) and the lookup table (.csv) from S3 into the working directory;
# the following cells load both files by these local names.
!aws s3 cp s3://rikturr/2015_partB_sparse.npz .
!aws s3 cp s3://rikturr/2015_partB_lookup.csv .

download: s3://rikturr/2015_partB_sparse.npz to ./2015_partB_sparse.npz
download: s3://rikturr/2015_partB_lookup.csv to ./2015_partB_lookup.csv


In [1]:
import scipy.sparse as sp
import pandas as pd
import numpy as np

# Single seed reused by the train/test split and by every RandomForestClassifier below.
random_state = 42
# Lookup table; its provider_type column is the classification target.
labels = pd.read_csv('2015_partB_lookup.csv')
# Sparse feature matrix; rows are assumed to line up positionally with `labels` — TODO confirm.
features = sp.load_npz('2015_partB_sparse.npz')

In [2]:
# Peek at the first rows of the label lookup table.
labels.head()

Unnamed: 0,npi,provider_type
0,1003000126,Internal Medicine
1,1003000142,Anesthesiology
2,1003000407,Family Practice
3,1003000522,Family Practice
4,1003000530,Internal Medicine


In [3]:
# Show the matrix summary (shape, dtype, stored-element count, storage format).
features

<516476x4206 sparse matrix of type '<class 'numpy.float64'>'
	with 5596950 stored elements in Compressed Sparse Column format>

In [4]:
# Densify only the first row for a quick look; the full matrix stays sparse.
features[0].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

# Do some machine learning!

### create train/test split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the shared seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels['provider_type'],
    test_size=0.3,
    random_state=random_state,
)

In [6]:
# Sanity check: the training split should keep about 70% of the rows and every feature column.
x_train.shape

(361533, 4206)

In [7]:
# Sanity check: there must be exactly one label per training row.
y_train.shape

(361533,)

### train Random Forest classifier

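(While this cell runs, watch the instance's resource utilization, e.g. with `htop` — training is spread across all CPU cores.)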

In [8]:
%%time
from sklearn.ensemble import RandomForestClassifier
from multiprocessing import cpu_count

rf = RandomForestClassifier(n_estimators=20, n_jobs=cpu_count(), random_state=random_state)
rf.fit(x_train, y_train)

CPU times: user 8min 46s, sys: 18.7 s, total: 9min 5s
Wall time: 32.1 s


### look at results

In [9]:
from sklearn.metrics import accuracy_score

# Predict on the held-out split; the bare last expression displays the overall accuracy.
predicted = rf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=predicted)

0.7061887274675203

In [10]:
from sklearn.metrics import classification_report

# The report comes back as one pre-formatted string, so print() is used to keep its column layout.
print(classification_report(y_test, predicted))

                      precision    recall  f1-score   support

      Anesthesiology       0.98      0.98      0.98     10030
          Cardiology       0.90      0.91      0.91      6206
Diagnostic Radiology       0.99      0.99      0.99      8716
  Emergency Medicine       0.80      0.84      0.82     12199
     Family Practice       0.52      0.56      0.54     24208
   Internal Medicine       0.66      0.66      0.66     29214
  Nurse Practitioner       0.51      0.55      0.53     23458
       Ophthalmology       0.99      0.98      0.99      5181
  Orthopedic Surgery       0.80      0.87      0.83      6338
  Physical Therapist       1.00      1.00      1.00     13009
 Physician Assistant       0.49      0.35      0.41     16384

           micro avg       0.71      0.71      0.71    154943
           macro avg       0.79      0.79      0.79    154943
        weighted avg       0.70      0.71      0.70    154943



In [11]:
# Confusion matrix of the held-out split. Both axes are named explicitly so the table
# says which side is the true label and which is the model's prediction (without this,
# the column axis is shown as the uninformative default `col_0`).
pd.crosstab(y_test, predicted, rownames=['actual'], colnames=['predicted'])

col_0,Anesthesiology,Cardiology,Diagnostic Radiology,Emergency Medicine,Family Practice,Internal Medicine,Nurse Practitioner,Ophthalmology,Orthopedic Surgery,Physical Therapist,Physician Assistant
provider_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Anesthesiology,9790,0,3,7,22,55,104,1,17,2,29
Cardiology,4,5631,6,3,44,313,152,1,4,0,48
Diagnostic Radiology,8,9,8617,3,13,11,24,0,3,0,28
Emergency Medicine,8,4,10,10287,499,223,414,1,17,0,736
Family Practice,9,25,29,584,13595,5375,3489,5,139,0,958
Internal Medicine,47,437,17,282,5878,19155,2723,5,68,8,594
Nurse Practitioner,50,74,10,558,4001,2438,13011,10,225,7,3074
Ophthalmology,1,0,0,2,8,5,57,5088,2,1,17
Orthopedic Surgery,21,1,3,6,42,56,277,2,5510,6,414
Physical Therapist,0,0,0,1,0,0,2,0,0,13006,0


# Do lots of ML!

### save result files to s3

In [12]:
import boto3
import io

# Module-level S3 resource handle; to_csv_s3 below reads this global name.
s3 = boto3.resource('s3')

def to_csv_s3(df, key, bucket, index=False):
    """Serialize ``df`` to CSV text and store it as an S3 object.

    Args:
        df: DataFrame to serialize.
        key: Object key to write inside ``bucket``.
        bucket: Name of the destination bucket.
        index: Whether the DataFrame index is written as the first CSV column.

    The CSV is built entirely in memory and sent with a single ``put`` call on
    the module-level ``s3`` resource; no local file is created.
    """
    # With no target given, to_csv returns the CSV as a string.
    csv_text = df.to_csv(index=index)
    target = s3.Object(bucket, key)
    target.put(Body=csv_text)

In [13]:
# Tiny throwaway frame used to exercise the S3 upload helper.
test_df = pd.DataFrame({'col1': [1, 2]})
test_df

Unnamed: 0,col1
0,1
1,2


In [34]:
# Smoke-test the helper: upload the toy frame as test.csv.
to_csv_s3(test_df, 'test.csv', 'rikturr-private')

In [35]:
# Pull the object back down to confirm the upload actually landed in the bucket.
!aws s3 cp s3://rikturr-private/test.csv .

download: s3://rikturr-private/test.csv to ./test.csv           


In [36]:
# Show the start of the downloaded file; with index=False it should hold only the col1 header and values.
!head test.csv

col1
1
2


### run an experiment

In [None]:
%%time
from sklearn.model_selection import cross_validate

ntrees = [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# ntrees = [2, 3]
out = pd.DataFrame()

for nt in ntrees:
    rf = RandomForestClassifier(n_estimators=nt, n_jobs=cpu_count(), random_state=random_state)
    
    scores = cross_validate(estimator=rf, X=x_train, y=y_train, 
                            scoring=['accuracy', 'balanced_accuracy'], cv=3, n_jobs=1)
    scores['ntrees'] = nt
    out = pd.concat([out, pd.DataFrame(scores)])

    to_csv_s3(out, 'rf_trees/results.csv', 'rikturr-private')



In [None]:
# Halt the machine once the cells above have finished (presumably to stop paying for the EC2 instance).
# WARNING: this ends the kernel session; all in-memory state is lost.
!sudo shutdown -h now