# Playing with a Keras model

In [1]:
%pylab inline
import numpy as np
import csv
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from sklearn import utils
from preprocess import loadPreData

'''
    This demonstrates how to reach a 0.80 ROC AUC score (local 4-fold validation)
    in the Kaggle Nile virus prediction challenge. 

    The model trains in a few seconds on CPU.
'''

Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.


'\n    This demonstrates how to reach a 0.80 ROC AUC score (local 4-fold validation)\n    in the Kaggle Nile virus prediction challenge. \n\n    The model trains in a few seconds on CPU.\n'

In [2]:
# Load the preprocessed data set. Later cells read its 'train', 'labels'
# and 'test' entries.
data = loadPreData()

Index([u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent'],
      dtype='object')
Index([u'Id', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy'],
      dtype='object')


In [3]:
# Training features as a float array. `.values` replaces the deprecated
# `DataFrame.get_values()` and is available in every pandas version.
X = data['train'].values.astype(float)
# Raw class labels, kept for scoring (roc_auc_score needs the 1-D labels).
y = data['labels']
# One-hot encoded labels, the target format for the softmax output layer.
Y = np_utils.to_categorical(y)

array([[ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.]])

In [4]:
# Width of the network input: one unit per feature column of X.
input_dim = X.shape[1]
# Number of softmax outputs; assumes the one-hot label matrix Y has two
# columns (binary target) -- TODO confirm against Y.shape[1].
output_dim = 2

In [5]:
def scoreAUC(y,probs):
    """Approximate the ROC AUC by sweeping 100 evenly spaced thresholds.

    A sample is predicted positive when its class-0 probability
    (``probs[:, 0]``) falls below the current threshold.

    Parameters
    ----------
    y : array-like, shape (n_samples,)
        Binary labels. Integer, float and boolean encodings are accepted;
        any non-zero value counts as positive.
    probs : array-like, shape (n_samples, n_classes)
        Predicted class probabilities. Only column 0 is read.

    Returns
    -------
    dict
        ``'score'``: trapezoidal area under the sampled ROC curve;
        ``'fpr'`` and ``'tpr'``: arrays with one entry per threshold.
    """
    # Work on boolean masks so float-encoded labels do not break the
    # bitwise operators below.
    positives = np.asarray(y).astype(bool)
    negatives = ~positives
    n_pos = float(np.sum(positives))
    n_neg = float(np.sum(negatives))
    class0 = np.asarray(probs)[:, 0]

    thresholds = np.linspace(0., 1., num=100)
    last = len(thresholds) - 1
    tprs = []
    tnrs = []
    for i, p in enumerate(thresholds):
        if i == last:
            # The final threshold is inclusive so that samples with a class-0
            # probability of exactly 1.0 are flagged too; otherwise the curve
            # can stop short of (1, 1) and the area is underestimated.
            preds = class0 <= p
        else:
            preds = class0 < p
        tprs.append(np.sum(positives & preds) / n_pos)
        tnrs.append(np.sum(negatives & ~preds) / n_neg)

    xs = 1 - np.array(tnrs)   # false positive rate
    ys = np.array(tprs)       # true positive rate
    # Trapezoidal rule over consecutive ROC points.
    dxs = xs[1:] - xs[:-1]
    ays = .5*(ys[1:] + ys[:-1])
    auc = np.sum(ays*dxs)
    return {'score':auc,'fpr':xs,'tpr':ys}

In [6]:
def build_model(input_dim, output_dim):
    """Create and compile the feed-forward classifier used in this notebook.

    Architecture: two hidden blocks (Dense(32) -> ReLU -> Dropout(0.5))
    followed by a Dense(output_dim) layer with a softmax activation. All
    dense layers use the 'random_uniform' kernel initializer.

    Parameters
    ----------
    input_dim : int
        Number of input features expected by the first dense layer.
    output_dim : int
        Number of units in the softmax output layer.

    Returns
    -------
    The compiled Sequential model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    init = 'random_uniform'
    stack = [
        # First hidden block; only this layer needs the input width.
        Dense(32, input_dim=input_dim, kernel_initializer=init),
        Activation('relu'),
        Dropout(0.5),
        # Second hidden block.
        Dense(32, kernel_initializer=init),
        Activation('relu'),
        Dropout(0.5),
        # Output layer producing class probabilities.
        Dense(output_dim, kernel_initializer=init),
        Activation('softmax'),
    ]

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
    return net

In [71]:
def testModel():
    """Sanity-check ``build_model`` on a synthetic, linearly separable task.

    Draws 5000 training and 1000 validation points uniformly from the unit
    cube in 5 dimensions, labels each point by the sign of a random linear
    combination of its features, and fits a fresh model for 10 epochs.

    Returns
    -------
    dict
        ``'model'``: the fitted model; ``'X_valid'``: the validation inputs;
        ``'y_valid'``: the one-hot encoded validation labels.
    """
    def sigmoid(x):
        # Logistic function; sigmoid(x) > 0.5 is equivalent to x > 0.
        return 1/(1+np.exp(-x))

    X = (np.random.rand(5000,5))
    # One random weight per feature in (-100, 100], repeated for every row.
    covs = np.tile(100.*(1.-2.*np.random.rand(5)),(5000,1))
    y = np.sum(X*covs,axis=1)

    y =  (sigmoid(y)>0.5).astype(float)
    print(y.shape)
    # Pass the class count explicitly: if the random weights make every label
    # 0, inferring it from the data would yield a single column and no longer
    # match the two-unit output layer.
    y = np_utils.to_categorical(y, 2)

    X_valid = np.random.rand(1000,5)

    # Same weights as for training (all rows of covs are identical).
    y_valid = np.sum(covs[:1000,:]*X_valid,axis=1)
    y_valid =  (sigmoid(y_valid)>0.5).astype(float)
    y_valid = np_utils.to_categorical(y_valid, 2)

    model = build_model(5, 2)
    print('Data input shape {}, data output shape {}'.format(X.shape,y.shape))
    # `epochs` is the Keras 2 name of the former `nb_epoch` argument and
    # matches the cross-validation cell.
    model.fit(X, y, epochs=10, batch_size=32, validation_data=(X_valid, y_valid), verbose=True)
    return {'model':model,'X_valid':X_valid,'y_valid':y_valid}

In [72]:
# Run the synthetic sanity check; `res` holds the fitted model together with
# the validation inputs and one-hot validation labels.
res = testModel()

(5000,)
Data input shape (5000, 5), data output shape (5000, 2)
Train on 5000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [73]:
# Class probabilities predicted by the sanity-check model for its synthetic
# validation inputs (one row per sample, one column per class).
res['model'].predict_proba(res['X_valid'])

  32/1000 [..............................] - ETA: 5s

array([[  8.18462521e-02,   9.18153763e-01],
       [  1.00000000e+00,   2.92689997e-08],
       [  9.99997497e-01,   2.45466754e-06],
       ..., 
       [  9.96360719e-01,   3.63926683e-03],
       [  4.45448160e-01,   5.54551840e-01],
       [  9.99990821e-01,   9.21040009e-06]], dtype=float32)

In [9]:
# K-fold cross-validation of the classifier, reporting the ROC AUC per fold
# and the average over all folds.

nb_folds = 4
kfolds = KFold(len(y), nb_folds)
av_roc = 0.
f = 0
for train, valid in kfolds:
    print('---'*20)
    print('Fold {}'.format(f))
    print('---'*20)
    f += 1
    X_train = X[train]
    X_valid = X[valid]
    Y_train = Y[train]
    Y_valid = Y[valid]
    y_valid = y[valid]
    print("fold has {} WNV present".format(np.sum(Y_train[:,1])))
    # Keras expects `class_weight` as a dict mapping class index -> weight
    # (e.g. {0: 1., 1: 50.}), whereas compute_class_weight returns a plain
    # array ordered like `classes`; convert so the weights are actually used.
    classes = np.unique(y[train])
    weights = utils.class_weight.compute_class_weight('balanced', classes, y[train])
    class_weight = {int(c): w for c, w in zip(classes, weights)}
    print("Class weight: {}".format(class_weight))
    print("Building model...")
    # A fresh model per fold so no weights leak between folds.
    model = build_model(input_dim, output_dim)

    print("Training model...")

    model.fit(X_train, Y_train, epochs=20, batch_size=32, validation_data=(X_valid, Y_valid),
              class_weight=class_weight,verbose=True)
    valid_preds = model.predict_proba(X_valid)
    # Column 1 is the predicted probability of the positive class.
    valid_preds = valid_preds[:, 1]
    roc = metrics.roc_auc_score(y_valid, valid_preds)
    print("ROC: {}".format(roc))
    av_roc += roc

print('Average ROC: {}'.format(av_roc/nb_folds))

------------------------------------------------------------
('Fold', 0)
------------------------------------------------------------
fold has 366.0 WNV present
Class weight: [  0.52435778  10.7636612 ]
Building model...
Training model...
Train on 7879 samples, validate on 2627 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
  32/2627 [..............................] - ETA: 3s('ROC:', 0.60732895057219383)
------------------------------------------------------------
('Fold', 1)
------------------------------------------------------------
fold has 495.0 WNV present
Class weight: [ 0.53351842  7.95858586]
Building model...
Training model...
Train on 7879 samples, validate on 2627 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoc

In [8]:
# Sum of the predicted class-1 probabilities over X_valid, i.e. the expected
# number of positives according to the model. Relies on `model` and `X_valid`
# being bound by another cell (the cross-validation loop assigns both).
np.sum(model.predict_proba(X_valid)[:,1])



264.66864

In [10]:
# Refit a fresh model on the full training set for the submission.
Y = np_utils.to_categorical(y)
# Compute the class weights on the full label vector instead of reusing the
# value left over from the last cross-validation fold, and pass them as the
# {class index: weight} dict that Keras expects.
full_classes = np.unique(y)
full_weights = utils.class_weight.compute_class_weight('balanced', full_classes, y)
class_weight = {int(c): w for c, w in zip(full_classes, full_weights)}
model = build_model(input_dim, output_dim)
# NOTE(review): cross-validation trains for 20 epochs but this fit uses 1;
# confirm that a single epoch is intended for the final model.
model.fit(X, Y, epochs=1, batch_size=32, verbose=1,class_weight=class_weight)

Epoch 1/1


<keras.callbacks.History at 0x122a225d0>

In [11]:
# Test features as a float array, prepared the same way as the training
# matrix X (`.values` replaces the deprecated `DataFrame.get_values()`).
X_sub = data['test'].values.astype(float)

In [12]:
# Predict class probabilities for the test features.
subs = model.predict_proba(X_sub)
# Display column 0 of the predictions.
# NOTE(review): the cross-validation cell scores column 1 as the positive
# class; confirm which column a submission should use.
subs[:,0]



array([ 0.94704348,  0.94707501,  0.94701201, ...,  0.93396246,
        0.93398762,  0.93389934], dtype=float32)

In [102]:
# Fraction of training labels equal to 1 (the mean of the boolean mask).
np.mean(data['labels'] == 1)

0.052446221206929371

In [129]:
# Shape of the current validation split; print() with a single argument
# produces the same output under Python 2 and Python 3.
print(X_valid.shape)

(5253, 43)


In [133]:
# Select the 'Date' column label if present (empty Index otherwise). Uses a
# full-length boolean mask rather than a one-element list, which only
# triggered an indexing warning.
data['train'].columns[data['train'].columns == 'Date']

  result = getitem(key)


Index([], dtype='object')