# Convert Pandas dataframe to Tensorflow tensor and dataset

import the required libraries

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

2023-03-06 21:31:34.882031: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


load the CSV file with Pandas

In [2]:
dataframe = pd.read_csv('../datasets/advertising.csv')
dataframe.head(5)

Unnamed: 0,tv,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


transform the dataframe to a tensor

In [3]:
tensor = tf.constant(dataframe)
type(tensor)

2023-03-06 21:31:38.282586: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


tensorflow.python.framework.ops.EagerTensor

## From tensors

In [4]:
dataset = tf.data.Dataset.from_tensors(dataframe)
type(dataset)

tensorflow.python.data.ops.dataset_ops.TensorDataset

In [5]:
dataset = tf.data.Dataset.from_tensors(tensor)
type(dataset)

tensorflow.python.data.ops.dataset_ops.TensorDataset

## From tensor slices

In [6]:
dataset = tf.data.Dataset.from_tensor_slices(dataframe)
type(dataset)

tensorflow.python.data.ops.from_tensor_slices_op.TensorSliceDataset

In [7]:
dataset = tf.data.Dataset.from_tensor_slices(tensor)
type(dataset)

tensorflow.python.data.ops.from_tensor_slices_op.TensorSliceDataset

## Example

In [8]:
features = dataframe.copy()
labels = features.pop('sales')

In [9]:
(
    tf.data.Dataset.from_tensor_slices((features, labels))
        .repeat(100)
        .batch(batch_size=10, drop_remainder=True)
)

<BatchDataset element_spec=(TensorSpec(shape=(10, 3), dtype=tf.float64, name=None), TensorSpec(shape=(10,), dtype=tf.float64, name=None))>

## Notes

the `from_tensors` method combines the input and returns a dataset with a single element, wherease the `from_tensor_slices` method creates a dataset with a separate element for each row of the input tensor.

In [10]:
tensor = tf.constant([[4, 2], [5, 3]])
dataset = tf.data.Dataset.from_tensors(tensor)
dataset # [ [4, 2], [5, 3] ]

<TensorDataset element_spec=TensorSpec(shape=(2, 2), dtype=tf.int32, name=None)>

In [11]:
tensor = tf.constant([[4, 2], [5, 3]])
dataset = tf.data.Dataset.from_tensor_slices(tensor)
dataset # [4, 2], [5, 3]

<TensorSliceDataset element_spec=TensorSpec(shape=(2,), dtype=tf.int32, name=None)>

## TextLineDataset

In [21]:
def parse_csv_records(records):
    cols = tf.io.decode_csv(records, record_defaults=[[0], [0], [0], [0]])
    features = {'tv': cols[0], 'radio': cols[1], 'newspaper': cols[2]}
    label = cols[3]
    return features, label

In [23]:
dataset = tf.data.TextLineDataset('../datasets/advertising.csv')
    .map(parse_csv_records)
    .shuffle(1000) # IMPORTANT: recommended to shuffle only the training data
    .repeat(20)
    .batch(100)

If we have multiple files on disk, we may want to scan the directory to collect them all

In [26]:
tf.data.Dataset.list_files('../datasets/').flat_map_map(tf.data.TextLineDataset)

AttributeError: 'ShuffleDataset' object has no attribute 'flat_map_map'