# Keras + EC2

Recommended instance type: p2.xlarge

In [1]:
!aws s3 cp s3://rikturr/2015_partB_sparse.npz .
!aws s3 cp s3://rikturr/2015_partB_lookup.csv .

download: s3://rikturr/2015_partB_sparse.npz to ./2015_partB_sparse.npz
download: s3://rikturr/2015_partB_lookup.csv to ./2015_partB_lookup.csv


In [5]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.19.1-cp36-cp36m-manylinux1_x86_64.whl (12.4MB)
    100% |████████████████████████████████| 12.4MB 110kB/s eta 0:00:01
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.19.1


In [2]:
import scipy.sparse as sp
import pandas as pd
import numpy as np

# Seed passed to every train/test split below so the partitions are repeatable.
random_state = 42

# Feature matrix (scipy sparse .npz) and the lookup table (CSV) downloaded from S3.
features = sp.load_npz('2015_partB_sparse.npz')
labels = pd.read_csv('2015_partB_lookup.csv')

  (fname, cnt))
  (fname, cnt))


In [3]:
# Preview the first rows of the lookup table (npi and provider_type columns).
labels.head()

Unnamed: 0,npi,provider_type
0,1003000126,Internal Medicine
1,1003000142,Anesthesiology
2,1003000407,Family Practice
3,1003000522,Family Practice
4,1003000530,Internal Medicine


In [4]:
# Show the sparse matrix summary (shape, dtype, stored elements, storage format).
features

<516476x4206 sparse matrix of type '<class 'numpy.float64'>'
	with 5596950 stored elements in Compressed Sparse Column format>

In [9]:
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

# One-hot encode the provider type from its integer category codes.
y = to_categorical(labels['provider_type'].astype('category').cat.codes)

# Split the RAW features first. Fitting the scaler / variance filter on the full
# matrix (as was done before) lets test and validation statistics leak into the
# preprocessing. train_test_split picks rows from the sample count and the seed
# only, so the row partition is the same as when the transformed matrix was split.
raw_trainval, raw_test, y_trainval, y_test = train_test_split(features,
                                                              y,
                                                              test_size=0.3,
                                                              random_state=random_state)

raw_train, raw_val, y_train, y_val = train_test_split(raw_trainval,
                                                      y_trainval,
                                                      test_size=0.1,
                                                      random_state=random_state)

# Learn the scaling factors and the zero-variance mask from the training rows only,
# then apply that fitted pipeline unchanged to the held-out sets.
pipeline = Pipeline([('scale', MaxAbsScaler()), ('zero_var', VarianceThreshold(0))])
x_train = pipeline.fit_transform(raw_train)
x_val = pipeline.transform(raw_val)
x_test = pipeline.transform(raw_test)

# Kept for backward compatibility in case other cells still reference the fully
# transformed matrix; it now uses the train-fitted pipeline.
preprocessed = pipeline.transform(features)

In [11]:
# Check the (rows, features) shape of the training matrix.
x_train.shape

(325379, 4206)

In [12]:
# Check the (rows, classes) shape of the one-hot training targets.
y_train.shape

(325379, 11)

In [13]:
from keras.models import *
from keras.layers import *

# Fully connected classifier: input -> 500 -> 100 -> 50 ('encoded') -> softmax over the classes.
n_features = x_train.shape[1]
n_classes = y_train.shape[1]

input_layer = Input(shape=(n_features,))

# Stack the ReLU hidden layers in order; only the last one gets an explicit name
# (name=None lets Keras auto-name the layer, as it does when no name is given).
x = input_layer
for units, layer_name in [(500, None), (100, None), (50, 'encoded')]:
    x = Dense(units, activation='relu', name=layer_name)(x)

output_layer = Dense(n_classes, activation='softmax')(x)

model = Model(input_layer, output_layer)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 4206)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 500)               2103500   
_________________________________________________________________
dense_5 (Dense)              (None, 100)               50100     
_________________________________________________________________
encoded (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_6 (Dense)              (None, 11)                561       
Total params: 2,159,211
Trainable params: 2,159,211
Non-trainable params: 0
_________________________________________________________________


In [15]:
from keras.callbacks import TensorBoard
# Callback that writes training logs to the directory TensorBoard is later pointed at.
tensorboard_log_dir = '/tmp/tensorboard'
tensorboard = TensorBoard(log_dir=tensorboard_log_dir)

To run TensorBoard (its port, 6006 by default, must be open in the instance's security group):
`tensorboard --logdir=/tmp/tensorboard --host=0.0.0.0`

In [17]:
def sparse_generator(x, y=None, batch_size=32, seed=None):
    """Endlessly yield dense mini-batches cut from a sparse matrix.

    Rows are visited in blocks of ``batch_size``; the final block of a pass may
    be shorter. After the last row the generator wraps around and starts a new
    pass, so it never terminates on its own.

    Parameters
    ----------
    x : sparse matrix
        Must support row indexing with an integer array and ``.toarray()``.
    y : array-like, optional
        Targets aligned row-for-row with ``x``. When given, the row order is
        reshuffled at the start of every pass and ``(x_batch, y_batch)`` tuples
        are yielded. When omitted, rows keep their original order and only
        ``x_batch`` is yielded.
    batch_size : int, default 32
        Maximum number of rows per batch; must be at least 1.
    seed : int, optional
        Seed for a private shuffle RNG. With the default ``None`` the global
        ``np.random`` state is used, exactly as before.

    Yields
    ------
    numpy.ndarray or tuple of (numpy.ndarray, array-like)
        The densified batch, paired with its targets when ``y`` is given.

    Raises
    ------
    ValueError
        On the first ``next()`` call, if ``batch_size < 1``, ``x`` has no rows,
        or ``y`` does not have one entry per row of ``x``.
    """
    n_rows = x.shape[0]

    # Without these checks a bad call would yield empty batches forever
    # or silently pair rows with the wrong targets.
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got %r' % (batch_size,))
    if n_rows == 0:
        raise ValueError('x has no rows to generate batches from')
    if y is not None and len(y) != n_rows:
        raise ValueError('x and y disagree on the number of rows: %d vs %d'
                         % (n_rows, len(y)))

    # Fall back to the module-level RNG so existing np.random.seed() calls keep working.
    rng = np.random if seed is None else np.random.RandomState(seed)

    index = np.arange(n_rows)
    start = 0
    while True:
        # Reshuffle once per pass, and only when targets are supplied.
        if start == 0 and y is not None:
            rng.shuffle(index)

        batch = index[start:start + batch_size]

        if y is not None:
            yield x[batch].toarray(), y[batch]
        else:
            yield x[batch].toarray()

        start += batch_size
        if start >= n_rows:
            start = 0  # wrap around: begin the next pass

In [19]:
# Train with plain SGD on the one-hot targets, logging to TensorBoard.
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
batch_size = 32

# Number of batches needed to cover every row once, i.e. ceil(rows / batch_size).
# Integer arithmetic keeps the step counts as ints; the previous `/` expression
# produced floats.
train_steps = (x_train.shape[0] + batch_size - 1) // batch_size
val_steps = (x_val.shape[0] + batch_size - 1) // batch_size

model.fit_generator(sparse_generator(x_train, y_train, batch_size),
                    epochs=10,
                    steps_per_epoch=train_steps,
                    validation_data=sparse_generator(x_val, y_val, batch_size),
                    validation_steps=val_steps,
                    callbacks=[tensorboard])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f155bd815f8>