In [3]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import time
from sklearn import metrics

In [4]:
vbc = pd.read_csv("../data/vb_data_3_categZone.csv")
vb = pd.read_csv("../data/vb_data_3_numZone.csv")
print(len(vb), 'lines loaded')

146050 lines loaded


In [5]:
X = vb.drop(['Season', 'GameID', 'PlayerTeam', 'PlayerName', 'RewardDistance', 'RewardValue'], axis=1)
cols = [col for col in list(X.columns) if X[col].dtype == 'object']
X = pd.get_dummies(data=X, columns = cols)
Y = vb.RewardValue
print(len(X.columns), 'columns in dataframe')

204 columns in dataframe


In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X_n = scaler.transform(X)

In [7]:
# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(200, input_dim=204, kernel_initializer='normal', activation='relu'))
    model.add(Dense(20, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='tanh'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [7]:
kfold = KFold(n_splits=5)
X_tr = {}
y_tr = {}
X_val = {}
y_val = {}
k = 0
for tr_idx, val_idx in kfold.split(X):
    X_tr[k] = X.iloc[tr_idx]
    y_tr[k] = Y.iloc[tr_idx]
    X_val[k] = X.iloc[val_idx]
    y_val[k] = Y.iloc[val_idx]
    k += 1
print('Split complete.')
models = [baseline_model() for i in range(5)]
print('Models generated.')

Split complete.




Models generated.


In [12]:
for n in range(25):
    t = time.time()
    print('Epoch', n+1)
    scores = []
    for k in range(5):
        X_train = X_tr[k]
        X_valid = X_val[k]
        y_train = y_tr[k]
        y_valid = y_val[k]
        model = models[k]
        
        model.fit(X_train, y_train, epochs=1, shuffle=False, verbose=0, batch_size=64)
        y = model.predict(X_valid)
        scores.append(metrics.mean_squared_error(y_valid,y))
        
    print("Time elapsed:", time.time()-t)
    print("Score mean:", np.mean(scores))
        

#model.fit(X_n,Y,validation_split=0.2, epochs=20, shuffle=True, verbose=1, callbacks=[])

Epoch 1
Time elapsed: 16.139482259750366
Score mean: 0.6522400407360897
Epoch 2
Time elapsed: 15.269855499267578
Score mean: 0.6404054861137791
Epoch 3
Time elapsed: 14.931118488311768
Score mean: 0.6361113393328477
Epoch 4
Time elapsed: 15.008963108062744
Score mean: 0.6328264099011712
Epoch 5
Time elapsed: 15.26750659942627
Score mean: 0.628726791523796
Epoch 6
Time elapsed: 15.225233316421509
Score mean: 0.6273099413392984
Epoch 7
Time elapsed: 14.975981712341309
Score mean: 0.6270094393768403
Epoch 8
Time elapsed: 15.243955612182617
Score mean: 0.6263003127347523
Epoch 9
Time elapsed: 15.204801559448242
Score mean: 0.6254043085632721
Epoch 10
Time elapsed: 15.23819613456726
Score mean: 0.6268102942064017
Epoch 11
Time elapsed: 15.289322853088379
Score mean: 0.6256943419277629
Epoch 12
Time elapsed: 15.050822734832764
Score mean: 0.6264673125002007
Epoch 13
Time elapsed: 15.242889642715454
Score mean: 0.6284831227347792
Epoch 14
Time elapsed: 15.122716188430786
Score mean: 0.6308123

KeyboardInterrupt: 

In [185]:
model = baseline_model()
model.fit(X_n,Y,epochs=9,verbose=1,shuffle=False, batch_size=64)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<tensorflow.python.keras.callbacks.History at 0x7f5d0ffc1128>

In [187]:
from sklearn import metrics
yp = model.predict(X_n)
print(metrics.mean_squared_error(Y,yp))

0.6114518472302143


In [27]:
import time
t = time.time()
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Result: %.4f (%.2f) MSE" % (results.mean(), results.std()))
print(results)
print("Time elapsed:", time.time()-t)

Result: -0.9239 (0.15) MSE
[-0.99801213 -0.99808782 -0.99778056 -0.62329292 -1.00216949]
Time elapsed: 212.7288966178894


In [177]:
import pickle
pickle.dump(yp, open('nn_regr.pkl', 'wb'))

In [10]:
t = time.time()
kfold = KFold(n_splits=5)

for k in range(1,16):
    estimator = KerasRegressor(build_fn=baseline_model, epochs=k, batch_size=64, verbose=0)
    results = cross_val_score(estimator, X_n, Y, cv=kfold)
    print("CV Result for %d epochs: %.5f (%.2f) MSE" % (k, results.mean(), results.std()))
    
    estimator = KerasRegressor(build_fn=baseline_model, epochs=k, batch_size=64, verbose=0)
    estimator.fit(X,Y)
    yp = estimator.predict(X)
    print("Result for %d epochs: %.5f MSE" % (k, metrics.mean_squared_error(Y,yp)))
    print("Time elapsed:", time.time()-t)
    print()

CV Result for 1 epochs: -0.64906 (0.01) MSE
Result for 1 epochs: 0.65829 MSE
Time elapsed: 27.53672742843628



KeyboardInterrupt: 

In [22]:
from sklearn import metrics
estimator = KerasRegressor(build_fn=baseline_model, epochs=9, batch_size=64, verbose=1)
estimator.fit(X,Y)
yp = estimator.predict(X)
print(metrics.mean_squared_error(Y,yp))

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
0.6060967747455163
