# Part A:  Build a Regression Model in Keras, Baseline Model with Un-normalized Data, Epochs = 50, 1 Hidden Layer with 10 Nodes

In [1]:
# Import the dataset and display a sample

import pandas as pd
import numpy as np
concrete_data = pd.read_csv('https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv')
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [2]:
# Check the size of the dataset

concrete_data.shape

(1030, 9)

In [3]:
# Looks at the stats of  dataset

concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [4]:
# Check for missing values

concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [6]:
# Split the dataset into predictors and target

concrete_data_columns = concrete_data.columns

predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
target = concrete_data['Strength'] # Strength column

In [7]:
# Quick check of the predictors dataset

predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [8]:
# Quick check of the target dataset

target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

In [9]:
# Save the number of predictors to n_cols since we will need this number when building our network.

n_cols = predictors.shape[1] # number of predictors
print(n_cols)

8


In [None]:
# define regression model

def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [36]:
# Set up the libraries
import keras
import numpy
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# create a array to hold the mean squared error
mse = numpy.zeros(shape=(50,1))

# Iterate 50 times with different sets of test and train split data
for i in range (1,50):

    # Split the data into 70% train and 30% test
    predictors_train, predictors_test, target_train, target_test = train_test_split(predictors, target, test_size=0.3, random_state=42)
    
    # build the model
    model = regression_model()
    
    # fit the model
    model.fit(predictors_train, target_train, epochs=50,verbose=0)
    
    # predict on the test set
    prediction = model.predict(np.array(predictors_test))
    
    # update the mean squared error
    mse[i-1]=mean_squared_error(target_test, prediction)  

In [38]:
# Calculate mean and standard deviation of Mean Squared Error over 50 runs of the model
mean_mse_epoch_50_8_10_1=numpy.mean(mse)
print("Mean of Mean Squared Error for 50 runs is:", mean_mse_epoch_50_8_10_1)
std_mse_epoch_50_8_10_1=numpy.std(mse)
print("Standard Deviation of Mean Squared Error for 50 runs is:", std_mse_epoch_50_8_10_1)

Mean of Mean Squared Error for 50 runs is: 333.476874216809
Standard Deviation of Mean Squared Error for 50 runs is: 428.3775773177056


# Part B:  Build a Regression Model in Keras, with Normalized Data, Epochs = 50, 1 Hidden Layer with 10 Nodes

In [39]:
# Repeat part A for normalized predictor data
# normalize the predictors dataset
predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [45]:
# Set up the libraries
import keras
import numpy
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# create a array to hold the mean squared error
mse = numpy.zeros(shape=(50,1))

# Iterate 50 times with different sets of test and train split data
for i in range (1,50):

    # Split the normalized data into 70% train and 30% test
    predictors_norm_train, predictors_norm_test, target_train, target_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=42)
    
    # build the model
    model = regression_model()
    
    # fit the model on normalized training sets
    model.fit(predictors_norm_train, target_train, epochs=50,verbose=0)
    
    # predict on the normalized test set
    prediction = model.predict(np.array(predictors_norm_test))
    
    # update the mean squared error
    mse[i-1]=mean_squared_error(target_test, prediction)  

In [46]:
# Calculate mean and standard deviation of Mean Squared Error over 50 runs of the model with normalized predictor
mean_mse_norm_epoch_50_8_10_1=numpy.mean(mse)
print("Mean of Mean Squared Error with normalized predictors for 50 runs is:", mean_mse_norm_epoch_50_8_10_1)
std_mse_norm_epoch_50_8_10_1=numpy.std(mse)
print("Standard Deviation of Mean Squared Error normalized predictors for 50 runs is:", std_mse_norm_epoch_50_8_10_1)

Mean of Mean Squared Error with normalized predictors for 50 runs is: 307.42916035317756
Standard Deviation of Mean Squared Error normalized predictors for 50 runs is: 81.2626126854015


# Discussion on Mean Squared Error in Step A and Step B

With unnormalized predictor dataset the following mean and standard deviation of mean squared error was observed:
    Mean of Mean Squared Error for 50 runs is: 333.476874216809.
    Standard Deviation of Mean Squared Error for 50 runs is: 428.3775773177056.
    

With normalized predictor dataset the following mean and standard deviation of mean squared error was observed:
    Mean of Mean Squared Error with normalized predictors for 50 runs is: 307.42916035317756.
    Standard Deviation of Mean Squared Error normalized predictors for 50 runs is: 81.2626126854015.
    
With Normalization in Step B, there is some decrease in Mean of Mean Squared Error.
With Normalization  in Step B, there is significant decrease in Standard Deviation of Mean Squared Error.

# Part C:  Build a Regression Model in Keras, with Normalized Data, Epochs = 100, 1 Hidden Layer with 10 Nodes

In [43]:
# Repeat part B using 100 epochs

# Set up the libraries
import keras
import numpy
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# create a array to hold the mean squared error
mse = numpy.zeros(shape=(50,1))

# Iterate 50 times with different sets of test and train split data
for i in range (1,50):

    # Split the normalized data into 70% train and 30% test
    predictors_norm_train, predictors_norm_test, target_train, target_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=42)
    
    # build the model
    model = regression_model()
    
    # fit the model on normalized training sets
    model.fit(predictors_norm_train, target_train, epochs=100,verbose=0)
    
    # predict on the normalized test set
    prediction = model.predict(np.array(predictors_norm_test))
    
    # update the mean squared error
    mse[i-1]=mean_squared_error(target_test, prediction)

In [44]:
# Calculate mean and standard deviation of Mean Squared Error over 50 runs of the model with normalized predictor
mean_mse_norm_epoch100_8_10_1=numpy.mean(mse)
print("Mean of Mean Squared Error with normalized predictors and Epoch = 100 for 50 runs is:", mean_mse_norm_epoch100_8_10_1)
std_mse_norm_epoch100_8_10_1=numpy.std(mse)
print("Standard Deviation of Mean Squared Error normalized predictors and Epoch = 100 for 50 runs is:", std_mse_norm_epoch100_8_10_1)

Mean of Mean Squared Error with normalized predictors and Epoch = 100 for 50 runs is: 155.09285713529664
Standard Deviation of Mean Squared Error normalized predictors and Epoch = 100 for 50 runs is: 25.194838815728886


# Discussion on Mean Squared Error in Step B and Step C

With normalized predictor dataset with Epoch = 50, the following mean and standard deviation of mean squared error was observed:
    Mean of Mean Squared Error with normalized predictors for 50 runs is: 307.42916035317756.
    Standard Deviation of Mean Squared Error normalized predictors for 50 runs is: 81.2626126854015.

With normalized predictor dataset with Epoch = 100, the following mean and standard deviation of mean squared error was observed:
    Mean of Mean Squared Error with normalized predictors and Epoch = 100 for 50 runs is: 155.09285713529664.
    Standard Deviation of Mean Squared Error normalized predictors and Epoch = 100 for 50 runs is: 25.194838815728886.
    
With increase in Epochs to 100 in Step C, there is signfificant decrease in Mean of Mean Squared Error.
With increase in Epochs to 100 in Step C, there is significant decrease in Standard Deviation of Mean Squared Error.

# Part D:  Build a Regression Model in Keras, with Normalized Data, Epochs = 100, 3 Hidden Layers with 10 Nodes each

In [47]:
# Repeat part C using 3 hidden layers, each of 10 nodes and ReLU activation function

def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [48]:
# Set up the libraries
import keras
import numpy
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# create a array to hold the mean squared error
mse = numpy.zeros(shape=(50,1))

# Iterate 50 times with different sets of test and train split data
for i in range (1,50):

    # Split the normalized data into 70% train and 30% test
    predictors_norm_train, predictors_norm_test, target_train, target_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=42)
    
    # build the model
    model = regression_model()
    
    # fit the model on normalized training sets
    model.fit(predictors_norm_train, target_train, epochs=100,verbose=0)
    
    # predict on the normalized test set
    prediction = model.predict(np.array(predictors_norm_test))
    
    # update the mean squared error
    mse[i-1]=mean_squared_error(target_test, prediction)

In [49]:
# Calculate mean and standard deviation of Mean Squared Error over 50 runs of the model with normalized predictor
# and using 3 hidden layers, each of 10 nodes and ReLU activation function

mean_mse_norm_epoch100_8_10_10_10_1=numpy.mean(mse)
print("Mean of Mean Squared Error with normalized predictors and Epoch = 100, 3 hidden layers with 10 nodes each, for 50 runs is:", mean_mse_norm_epoch100_8_10_10_10_1)
std__mse_norm_epoch100_8_10_10_10_1=numpy.std(mse)
print("Standard Deviation of Mean Squared Error normalized predictors and Epoch = 100, 3 hidden layers with 10 nodes each, for 50 runs is:", std__mse_norm_epoch100_8_10_10_10_1)

Mean of Mean Squared Error with normalized predictors and Epoch = 100, 3 hidden layers with 10 nodes each, for 50 runs is: 83.65620047049073
Standard Deviation of Mean Squared Error normalized predictors and Epoch = 100, 3 hidden layers with 10 nodes each, for 50 runs is: 24.4891512682964


# Discussion on Mean Squared Error in Step C and Step D

With normalized predictor dataset with Epoch = 100, the following mean and standard deviation of mean squared error was observed:
    Mean of Mean Squared Error with normalized predictors and Epoch = 100 for 50 runs is: 155.09285713529664.
    Standard Deviation of Mean Squared Error normalized predictors and Epoch = 100 for 50 runs is: 25.194838815728886.

With normalized predictor dataset with Epoch = 100, using using 3 hidden layers, each of 10 nodes, the following mean and standard deviation of mean squared error was observed:
    Mean of Mean Squared Error with normalized predictors and Epoch = 100, 3 hidden layers with 10 nodes each, for 50 runs is: 83.65620047049073.
    Standard Deviation of Mean Squared Error normalized predictors and Epoch = 100, 3 hidden layers with 10 nodes each, for 50 runs is: 24.4891512682964.
    
With increase number of Hidden Layers to 3 in Step D, there is signfificant decrease in Mean of Mean Squared Error.
With increase number of Hidden Layers to 3 in Step D, there is very little decrease in Standard Deviation of Mean Squared Error.
    