In [1]:
## from blog:
## https://towardsdatascience.com/building-a-deep-learning-model-using-keras-1548ca149d37

In [15]:
import pandas as pd
import numpy as np

In [3]:
train_df = pd.read_csv('data/hourly_wages_data.csv')

train_df.head()

Unnamed: 0,wage_per_hour,union,education_yrs,experience_yrs,age,female,marr,south,manufacturing,construction
0,5.1,0,8,21,35,1,1,0,1,0
1,4.95,0,9,42,57,1,1,0,1,0
2,6.67,0,12,1,19,0,0,0,1,0
3,4.0,0,12,4,22,0,0,0,0,0
4,7.5,0,12,17,35,0,1,0,0,0


In [4]:
# get independent features

train_X = train_df.drop(['wage_per_hour'], axis=1)

train_X.head()

Unnamed: 0,union,education_yrs,experience_yrs,age,female,marr,south,manufacturing,construction
0,0,8,21,35,1,1,0,1,0
1,0,9,42,57,1,1,0,1,0
2,0,12,1,19,0,0,0,1,0
3,0,12,4,22,0,0,0,0,0
4,0,12,17,35,0,1,0,0,0


In [5]:
## get dependent variable

train_y = train_df[['wage_per_hour']]

train_y.head()

Unnamed: 0,wage_per_hour
0,5.1
1,4.95
2,6.67
3,4.0
4,7.5


In [6]:
from keras.models import Sequential
from keras.layers import Dense

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
model = Sequential()
## According to Keras documentation: the Sequential model is a linear stack of layers.

In [9]:
# get number of columns in training data:
n_cols = train_X.shape[1]

In [10]:
# add layers
model.add(Dense(10, activation = 'relu', input_shape=(n_cols,)))
model.add(Dense(10, activation = 'relu'))
model.add(Dense(1))

In [11]:
# compile model using mse as a measure of model performance
model.compile(optimizer='adam', loss = 'mean_squared_error')

In [12]:
# Set up early stopping to stop the number of times the data gets run through the model.
# We use three to say that if the data gets passed through the model 3 times and the model stops improving
# we will stop the model fitting process.

from keras.callbacks import EarlyStopping

#set early stopping monitor so the model stops training when it won't imrpove anymore.
early_stopping_monitor = EarlyStopping(patience = 3)

In [13]:
# train model
model.fit(train_X, train_y, validation_split = 0.2, epochs=30, callbacks = [early_stopping_monitor])

Train on 427 samples, validate on 107 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x182b260cf8>

In [49]:
# test model on what our training data is. the blog just assumed some random test data, but let's see how accurate
# we are with the train data.

train_y_predictions = model.predict(train_X)
print("accuracy :", np.mean((1 - abs((np.array(train_y) - train_y_predictions)/np.array(train_y))) * 100))

accuracy : 58.31915865056455


In [50]:
## train new model with higher capacity

model_mc = Sequential()

In [51]:
# add layers
model_mc.add(Dense(200, activation = 'relu', input_shape = (n_cols,)))
model_mc.add(Dense(200, activation = 'relu'))
model_mc.add(Dense(200, activation = 'relu'))
model_mc.add(Dense(1))

# compile
model_mc.compile(optimizer = 'adam', loss = 'mean_squared_error')

# train
model_mc.fit(train_X, train_y, validation_split = 0.2, epochs = 30, callbacks = [early_stopping_monitor])

Train on 427 samples, validate on 107 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


<keras.callbacks.History at 0x1828e8dc50>

In [55]:
train_y_predictions = model.predict(train_X)

In [56]:
print("accuracy :", np.mean((1 - abs((np.array(train_y) - train_y_predictions)/np.array(train_y))) * 100))

accuracy : 58.31915865056455
