### Scott Bing BSSD5350 Homework #5 Question #4

##### Copyright 2019 The TensorFlow Authors.

Licensed under the Apache License, Version 2.0 (the "License");

In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Load a pandas.DataFrame

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/tutorials/load_data/pandas_dataframe"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/load_data/pandas_dataframe.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/load_data/pandas_dataframe.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/docs/site/en/tutorials/load_data/pandas_dataframe.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

This tutorial provides an example of how to load pandas dataframes into a `tf.data.Dataset`.

This tutorial uses the small [Forest Fires dataset](https://archive.ics.uci.edu/ml/datasets/forest+fires) from the UCI Machine Learning Repository. The CSV has 517 rows. Each row describes one fire record, and each column describes an attribute (grid coordinates, month and day, fire-weather indices, and weather readings). We will use this information to predict the burned `area`, which in this dataset is a regression task.

## Read data using pandas

In [2]:
import pandas as pd
import tensorflow as tf

Download the CSV file containing the forest fires dataset.

In [3]:
# Location of the forest fires CSV on the UCI repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'

# Fetch the file through Keras' download helper; `csv_file` is the value handed to pandas below.
csv_file = tf.keras.utils.get_file('forestfires.csv', url)

Read the csv file using pandas.

In [4]:
# Parse the downloaded CSV into a DataFrame (default options; first row is the header).
df = pd.read_csv(csv_file)

In [5]:
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [6]:
df.dtypes

X          int64
Y          int64
month     object
day       object
FFMC     float64
DMC      float64
DC       float64
ISI      float64
temp     float64
RH         int64
wind     float64
rain     float64
area     float64
dtype: object

Convert the `month` and `day` columns, which are `object`s in the dataframe, to discrete numerical values.

In [7]:
# Replace each string-valued column with its integer category codes.
# Categories built from strings are sorted lexically, so the codes follow
# alphabetical order (e.g. 'mar' -> 7, 'fri' -> 0), not calendar order.
for column in ('month', 'day'):
  df[column] = pd.Categorical(df[column]).codes

In [8]:
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,7,0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,10,5,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,10,2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,7,0,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,7,3,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [9]:
df.describe()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,5.758221,2.736944,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,4.373275,1.925061,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,0.0,0.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,1.0,1.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,6.0,3.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,11.0,4.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,11.0,6.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


## Load data using `tf.data.Dataset`

Use `tf.data.Dataset.from_tensor_slices` to read the values from a pandas dataframe. 

One of the advantages of using `tf.data.Dataset` is it allows you to write simple, highly efficient data pipelines. Read the [loading data guide](https://www.tensorflow.org/guide/data) to find out more.

In [10]:
# Split off the regression target. `pop` removes 'area' from `df` in place and returns it,
# so `df` holds only the feature columns from here on. Not idempotent: re-running this
# cell without reloading `df` fails because the column is already gone.
target = df.pop('area')

In [11]:
# Pair each feature row with its target value, one dataset element per DataFrame row.
# `df.values` is a single 2-D array, so the mixed int/float columns share one dtype
# (presumably float64, which would explain the autocast warning printed during training).
dataset = tf.data.Dataset.from_tensor_slices((df.values, target.values))

In [12]:
# Show the first five (features, target) pairs to sanity-check the dataset contents.
preview_template = 'Features: {}, Target: {}'
for features, label in dataset.take(5):
  print(preview_template.format(features, label))

Features: [ 7.   5.   7.   0.  86.2 26.2 94.3  5.1  8.2 51.   6.7  0. ], Target: 0.0
Features: [  7.    4.   10.    5.   90.6  35.4 669.1   6.7  18.   33.    0.9   0. ], Target: 0.0
Features: [  7.    4.   10.    2.   90.6  43.7 686.9   6.7  14.6  33.    1.3   0. ], Target: 0.0
Features: [ 8.   6.   7.   0.  91.7 33.3 77.5  9.   8.3 97.   4.   0.2], Target: 0.0
Features: [  8.    6.    7.    3.   89.3  51.3 102.2   9.6  11.4  99.    1.8   0. ], Target: 0.0


Since a `pd.Series` implements the `__array__` protocol it can be used transparently nearly anywhere you would use a `np.array` or a `tf.Tensor`.

In [13]:
tf.constant(df['month'])

<tf.Tensor: shape=(517,), dtype=int8, numpy=
array([ 7, 10, 10,  7,  7,  1,  1,  1, 11, 11, 11, 11,  1, 11, 11, 11,  7,
       10,  7,  0, 11, 11,  6,  1,  1,  1, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 10, 10, 10,  7,  5,  1,  1, 11, 11, 11, 11,  5,  7,  7, 11,
        1,  1,  1,  1, 11, 11, 10,  3,  3,  7,  7,  1,  1,  1,  1, 11, 11,
       11,  7,  7, 11,  7,  1, 11,  3,  3,  7,  1,  1,  1,  1,  1,  1,  1,
       11, 11, 11, 11,  7,  1,  7,  1,  1,  1, 11,  3,  7,  1,  1,  1,  1,
        1, 11,  4,  7,  7,  1, 11, 11,  7,  7, 11, 11,  7,  7,  7,  7,  7,
        1,  1,  1, 11, 11, 11, 10,  7, 11, 10, 10,  3,  7,  7, 11,  7,  1,
       11, 11,  5, 11, 11,  1,  1,  5,  1,  1,  7, 11,  1, 11,  6,  5,  5,
       11, 11,  1, 11,  1,  1, 11,  7,  1,  7, 11, 11,  7,  1,  1,  7,  1,
       11,  1,  1, 11,  1,  1,  0,  1, 11,  1, 11, 10,  3, 10,  1, 11,  7,
       11,  7,  7,  7,  1,  1, 11,  1,  1,  0, 11, 11, 11, 11,  7,  3, 10,
        7, 11,  1, 11, 11, 11, 10,  1, 11,  7,  7,  7, 

The `day` column is encoded the same way. After inspecting it, shuffle and batch the dataset.

In [14]:
tf.constant(df['day'])

<tf.Tensor: shape=(517,), dtype=int8, numpy=
array([0, 5, 2, 0, 3, 3, 1, 1, 5, 2, 2, 2, 0, 1, 6, 0, 2, 1, 6, 2, 5, 1,
       3, 2, 2, 3, 0, 1, 2, 3, 0, 1, 0, 3, 1, 5, 5, 0, 2, 5, 5, 2, 5, 2,
       6, 6, 1, 1, 1, 1, 4, 3, 6, 6, 4, 4, 5, 3, 1, 0, 3, 3, 4, 3, 1, 4,
       0, 0, 0, 0, 0, 0, 0, 5, 0, 4, 0, 1, 0, 5, 3, 3, 5, 6, 4, 4, 4, 4,
       3, 2, 2, 0, 3, 3, 1, 3, 2, 2, 3, 3, 3, 5, 5, 2, 2, 0, 4, 3, 2, 1,
       0, 0, 3, 1, 5, 5, 2, 2, 1, 4, 1, 1, 3, 5, 0, 3, 1, 0, 6, 3, 2, 1,
       3, 4, 5, 2, 3, 1, 5, 5, 1, 6, 0, 2, 6, 4, 1, 5, 5, 4, 0, 3, 2, 0,
       2, 3, 2, 6, 6, 0, 1, 4, 2, 2, 3, 4, 6, 6, 0, 4, 6, 6, 3, 1, 2, 2,
       4, 3, 6, 5, 3, 1, 3, 1, 0, 5, 3, 1, 2, 3, 0, 4, 5, 6, 5, 0, 4, 4,
       5, 1, 5, 3, 3, 6, 2, 4, 2, 5, 0, 4, 2, 2, 0, 1, 2, 2, 3, 1, 6, 1,
       3, 0, 1, 0, 6, 3, 1, 2, 3, 2, 6, 3, 5, 5, 2, 3, 2, 5, 2, 3, 6, 0,
       3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 4, 4, 4, 2, 2, 2, 2, 1, 0, 0, 0,
       0, 5, 5, 5, 5, 5, 5, 5, 5, 3, 6, 4, 1, 1, 1, 1, 0, 5, 3, 6, 0, 3,
      

In [15]:
# Shuffle with a buffer as large as the number of rows, then emit one example per batch.
train_dataset = dataset.shuffle(len(df)).batch(1)

In [16]:
from tensorflow.keras.layers.experimental import preprocessing



## Create and train a model

In [17]:
def get_compiled_model():
  """Build and compile a small fully-connected regression network.

  Returns:
    A `tf.keras.Sequential` model with two 10-unit ReLU hidden layers and a
    single-unit output layer with no activation, compiled with the 'adam'
    optimizer and mean squared error as both the loss and the reported metric.
  """
  model = tf.keras.Sequential()
  for _ in range(2):
    model.add(tf.keras.layers.Dense(10, activation='relu'))
  model.add(tf.keras.layers.Dense(1))

  model.compile(
      optimizer='adam',
      loss='mean_squared_error',
      metrics=['mean_squared_error'],
  )
  return model

In [18]:
# Build a fresh model and train it for 15 epochs on the shuffled, batch-size-1 dataset.
# `fit` is left as the last expression, so the returned History object is the cell output.
model = get_compiled_model()
model.fit(train_dataset, epochs=15)

Epoch 1/15


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x262c0b42708>

## Alternative to feature columns

Passing a dictionary as an input to a model is as easy as creating a matching dictionary of `tf.keras.layers.Input` layers, applying any pre-processing and stacking them up using the [functional api](../../guide/keras/functional.ipynb). You can use this as an alternative to [feature columns](../keras/feature_columns.ipynb).

In [19]:
# One scalar (shape=()) Keras input per feature column, keyed and named by column.
inputs = {}
for column_name in df.keys():
  inputs[column_name] = tf.keras.layers.Input(shape=(), name=column_name)

# Stack the scalar inputs along a new last axis so each example becomes one feature vector.
stacked_inputs = tf.stack(list(inputs.values()), axis=-1)

x = tf.keras.layers.Dense(10, activation='relu')(stacked_inputs)
output = tf.keras.layers.Dense(1)(x)

model_func = tf.keras.Model(inputs=inputs, outputs=output)

# Regression setup: mean squared error is both the loss and the reported metric.
model_func.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
)

The easiest way to preserve the column structure of a `pd.DataFrame` when used with `tf.data` is to convert the `pd.DataFrame` to a `dict`, and slice that dictionary.

In [20]:
# `to_dict('list')` gives {column name: list of values}; slicing that dict keeps every
# column as its own named tensor. Elements are grouped into batches of 16, with no shuffling.
dict_slices = tf.data.Dataset.from_tensor_slices((df.to_dict('list'), target.values)).batch(16)

In [21]:
# Peek at the first batch: a (features-dict, targets) tuple with 16 values per entry.
for dict_slice in dict_slices.take(1):
  print (dict_slice)

({'X': <tf.Tensor: shape=(16,), dtype=int32, numpy=array([7, 7, 7, 8, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6])>, 'Y': <tf.Tensor: shape=(16,), dtype=int32, numpy=array([5, 4, 4, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5])>, 'month': <tf.Tensor: shape=(16,), dtype=int32, numpy=array([ 7, 10, 10,  7,  7,  1,  1,  1, 11, 11, 11, 11,  1, 11, 11, 11])>, 'day': <tf.Tensor: shape=(16,), dtype=int32, numpy=array([0, 5, 2, 0, 3, 3, 1, 1, 5, 2, 2, 2, 0, 1, 6, 0])>, 'FFMC': <tf.Tensor: shape=(16,), dtype=float32, numpy=
array([86.2, 90.6, 90.6, 91.7, 89.3, 92.3, 92.3, 91.5, 91. , 92.5, 92.5,
       92.8, 63.5, 90.9, 92.9, 93.3], dtype=float32)>, 'DMC': <tf.Tensor: shape=(16,), dtype=float32, numpy=
array([ 26.2,  35.4,  43.7,  33.3,  51.3,  85.3,  88.9, 145.4, 129.5,
        88. ,  88. ,  73.2,  70.8, 126.5, 133.3, 141.2], dtype=float32)>, 'DC': <tf.Tensor: shape=(16,), dtype=float32, numpy=
array([ 94.3, 669.1, 686.9,  77.5, 102.2, 488. , 495.6, 608.2, 692.6,
       698.6, 698.6, 713. , 665.3, 686.5, 69

In [22]:
# Train the functional model for 15 epochs on the dict-structured batches; the feature
# dict keys are the same column names used to name the model's Input layers.
model_func.fit(dict_slices, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x262c204b388>