# Convert Pandas dataframe to Tensorflow tensor and dataset

import the required libraries

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

2023-03-25 11:02:27.375804: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


load the CSV file with Pandas

In [2]:
dataframe = pd.read_csv('../../datasets/advertising.csv')
dataframe.head(5)

Unnamed: 0,tv,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


transform the dataframe to a tensor

## Transform a Dataframe to a Dataset

In [3]:
def df_to_ds(dataframe, slice=True, shuffle=True, batch_size=5):
    features = dataframe.copy()
    label = features.pop('sales')
    if slice:
        ds = tf.data.Dataset.from_tensor_slices((dict(features), label))
    else:
        ds = tf.data.Dataset.from_tensors((dict(features), label))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [4]:
df_to_ds(dataframe)

2023-03-25 11:02:30.947451: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<BatchDataset element_spec=({'tv': TensorSpec(shape=(None,), dtype=tf.float64, name=None), 'radio': TensorSpec(shape=(None,), dtype=tf.float64, name=None), 'newspaper': TensorSpec(shape=(None,), dtype=tf.float64, name=None)}, TensorSpec(shape=(None,), dtype=tf.float64, name=None))>

In [5]:
df_to_ds(dataframe, slice=False)

<BatchDataset element_spec=({'tv': TensorSpec(shape=(None, 200), dtype=tf.float64, name=None), 'radio': TensorSpec(shape=(None, 200), dtype=tf.float64, name=None), 'newspaper': TensorSpec(shape=(None, 200), dtype=tf.float64, name=None)}, TensorSpec(shape=(None, 200), dtype=tf.float64, name=None))>

## Transform a Dataframe to a Tensor

In [6]:
tensor = tf.constant(dataframe) # this only works if all features have the same type
type(tensor)

tensorflow.python.framework.ops.EagerTensor

## From tensors

In [7]:
dataset = tf.data.Dataset.from_tensors(dataframe) # this only works if all features have the same type
type(dataset)

tensorflow.python.data.ops.dataset_ops.TensorDataset

In [8]:
dataset = tf.data.Dataset.from_tensors(tensor)
type(dataset)

tensorflow.python.data.ops.dataset_ops.TensorDataset

## From tensor slices

In [9]:
dataset = tf.data.Dataset.from_tensor_slices(dataframe) # this only works if all features have the same type
type(dataset)

tensorflow.python.data.ops.from_tensor_slices_op.TensorSliceDataset

In [10]:
dataset = tf.data.Dataset.from_tensor_slices(tensor)
type(dataset)

tensorflow.python.data.ops.from_tensor_slices_op.TensorSliceDataset

## Example

In [11]:
features = dataframe.copy()
labels = features.pop('sales')

In [12]:
features.head()

Unnamed: 0,tv,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [13]:
labels

0      22.1
1      10.4
2       9.3
3      18.5
4      12.9
       ... 
195     7.6
196     9.7
197    12.8
198    25.5
199    13.4
Name: sales, Length: 200, dtype: float64

In [14]:
(
    tf.data.Dataset.from_tensor_slices((features, labels))
        .repeat(100)
        .batch(batch_size=10, drop_remainder=True)
)

<BatchDataset element_spec=(TensorSpec(shape=(10, 3), dtype=tf.float64, name=None), TensorSpec(shape=(10,), dtype=tf.float64, name=None))>

## Notes

the `from_tensors` method combines the input and returns a dataset with a single element, wherease the `from_tensor_slices` method creates a dataset with a separate element for each row of the input tensor.

In [15]:
tensor = tf.constant([[4, 2], [5, 3]])
dataset = tf.data.Dataset.from_tensors(tensor)
dataset # [ [4, 2], [5, 3] ]

<TensorDataset element_spec=TensorSpec(shape=(2, 2), dtype=tf.int32, name=None)>

In [16]:
tensor = tf.constant([[4, 2], [5, 3]])
dataset = tf.data.Dataset.from_tensor_slices(tensor)
dataset # [4, 2], [5, 3]

<TensorSliceDataset element_spec=TensorSpec(shape=(2,), dtype=tf.int32, name=None)>

## TextLineDataset

In [17]:
def parse_csv_records(records):
    cols = tf.io.decode_csv(records, record_defaults=[[0], [0], [0], [0]])
    features = {'tv': cols[0], 'radio': cols[1], 'newspaper': cols[2]}
    label = cols[3] # sales
    return features, label

In [18]:
dataset = (
    tf.data.TextLineDataset('../../datasets/advertising.csv')
        .map(parse_csv_records)
        .shuffle(1000) # IMPORTANT: only shuffle the training data
        .repeat(20)
        .batch(100)
)
dataset

<BatchDataset element_spec=({'tv': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'radio': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'newspaper': TensorSpec(shape=(None,), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

If we have multiple files on disk, we may want to scan the directory to collect them all

In [19]:
dataset = (
    tf.data.Dataset.list_files('../../datasets/shards/*.csv')
        .flat_map(tf.data.TextLineDataset)
        .map(parse_csv_records)
        .shuffle(1000) # IMPORTANT: only shuffle the training data
        .repeat(20)
        .batch(100)
)
dataset

<BatchDataset element_spec=({'tv': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'radio': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'newspaper': TensorSpec(shape=(None,), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [20]:
train = pd.read_csv('/Users/spaccs01/.keras/datasets/train.csv')
eval = pd.read_csv('/Users/spaccs01/.keras/datasets/eval.csv')

In [21]:
train

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.2500,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.9250,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1000,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
...,...,...,...,...,...,...,...,...,...,...
622,0,male,28.0,0,0,10.5000,Second,unknown,Southampton,y
623,0,male,25.0,0,0,7.0500,Third,unknown,Southampton,y
624,1,female,19.0,0,0,30.0000,First,B,Southampton,y
625,0,female,28.0,1,2,23.4500,Third,unknown,Southampton,n


In [22]:
def parse(line):
    cols = tf.io.decode_csv(line)
    features = {
        'sex': cols[1],
        'age': cols[2],
        'n_siblings_spouses': cols[3],
        'parch': cols[4],
        'fare': cols[5],
        'class': cols[6],
        'deck': cols[7],
        'embark_town': cols[8],
        'alone': cols[9]
    }
    label = cols[0] # survived
    return features, label

In [23]:
# dataset = (
#     tf.data.TextLineDataset('/Users/spaccs01/.keras/datasets/train.csv')
#         .map(parse)
#         # .shuffle(1000) # IMPORTANT: only shuffle the training data
#         # .repeat(20)
#         .batch(5)
# )
# dataset

In [24]:
dataset = tf.data.experimental.make_csv_dataset(
    '/Users/spaccs01/.keras/datasets/train.csv',
    batch_size=5,
    label_name='survived',
    na_value="?",
    num_epochs=1,
    ignore_errors=True
)

Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.


In [25]:
for batch, label in dataset.take(1):
    for key, value in batch.items():
        print("{:20s}: {}".format(key, value.numpy()))

sex                 : [b'male' b'female' b'male' b'male' b'male']
age                 : [30. 28. 19. 32. 20.]
n_siblings_spouses  : [1 0 0 0 0]
parch               : [0 0 0 0 0]
fare                : [24.     79.2     7.8958 56.4958  9.8458]
class               : [b'Second' b'First' b'Third' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Cherbourg' b'Cherbourg' b'Southampton' b'Southampton' b'Southampton']
alone               : [b'n' b'y' b'y' b'y' b'y']
