<a href="https://colab.research.google.com/github/qamtam/Hands-on-machine-learning/blob/main/CH13_with_notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tfx==0.21.2
    print("You can safely ignore the package incompatibility errors.")
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty pony figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "data"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

[K     |████████████████████████████████| 1.1MB 4.6MB/s 
[K     |████████████████████████████████| 245kB 12.9MB/s 
[K     |████████████████████████████████| 1.9MB 19.9MB/s 
[K     |████████████████████████████████| 1.5MB 40.8MB/s 
[K     |████████████████████████████████| 3.0MB 55.6MB/s 
[K     |████████████████████████████████| 59.2MB 1.3MB/s 
[K     |████████████████████████████████| 2.4MB 51.8MB/s 
[K     |████████████████████████████████| 112kB 37.8MB/s 
[K     |████████████████████████████████| 276kB 46.7MB/s 
[K     |████████████████████████████████| 4.9MB 43.6MB/s 
[K     |████████████████████████████████| 153kB 54.4MB/s 
[K     |████████████████████████████████| 51kB 8.7MB/s 
[K     |████████████████████████████████| 61kB 8.5MB/s 
[K     |████████████████████████████████| 1.2MB 43.5MB/s 
[K     |████████████████████████████████| 153kB 58.5MB/s 
[K     |████████████████████████████████| 81kB 11.8MB/s 
[K     |████████████████████████████████| 122kB 50.5MB/s 
[K

In [None]:
#DATASET methods do NOT modify the datasets, they create new ones

#EMBEDDINGS are trainable vecors that represent categories

import tensorflow as tf
from tensorflow import keras
from tensorflow import data

X = tf.range(10) # any data tensor
dataset = data.Dataset.from_tensor_slices(X) # create a whole new dataset from any tensor
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [None]:
for item in dataset:
  print(item)

#method of iterating

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [None]:
dataset = dataset.repeat(3).batch(7) #chaining
for item in dataset:
  print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [None]:
dataset = dataset.map(lambda x : x*2) #stworzyliśmy przetransormowany dataset
for item in dataset:
  print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


In [None]:
dataset = dataset.unbatch()

In [None]:
for item in dataset:
  print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, sh

In [None]:
#prosty "sql"
dataset = dataset.filter(lambda x: x<5)

In [None]:
for item in dataset:
  print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [None]:
#shuffling with buffers
# [0..100], buf size = 5
# [0 1 2 3 4] --> 2 --> [2]
# [0 1 5 3 4] --> 4 --> [2,4]
# [0 1 5 3 6] --> ... --> ...

dataset = data.Dataset.range(10).repeat(42)
dataset = dataset.shuffle(buffer_size=10).batch(6)

for item in dataset:
  print(item)


tf.Tensor([5 7 3 8 0 1], shape=(6,), dtype=int64)
tf.Tensor([1 2 6 9 5 6], shape=(6,), dtype=int64)
tf.Tensor([4 1 9 8 5 2], shape=(6,), dtype=int64)
tf.Tensor([2 4 7 0 0 8], shape=(6,), dtype=int64)
tf.Tensor([7 6 3 6 4 5], shape=(6,), dtype=int64)
tf.Tensor([3 9 0 1 8 3], shape=(6,), dtype=int64)
tf.Tensor([3 4 9 4 2 0], shape=(6,), dtype=int64)
tf.Tensor([7 5 0 1 9 6], shape=(6,), dtype=int64)
tf.Tensor([2 8 6 7 3 2], shape=(6,), dtype=int64)
tf.Tensor([8 9 5 2 4 3], shape=(6,), dtype=int64)
tf.Tensor([7 1 5 1 8 3], shape=(6,), dtype=int64)
tf.Tensor([6 6 0 4 8 9], shape=(6,), dtype=int64)
tf.Tensor([9 2 7 0 7 4], shape=(6,), dtype=int64)
tf.Tensor([2 6 1 9 0 4], shape=(6,), dtype=int64)
tf.Tensor([7 5 4 5 5 6], shape=(6,), dtype=int64)
tf.Tensor([0 3 1 7 1 4], shape=(6,), dtype=int64)
tf.Tensor([8 5 7 2 9 3], shape=(6,), dtype=int64)
tf.Tensor([9 2 3 6 0 8], shape=(6,), dtype=int64)
tf.Tensor([1 2 8 3 8 9], shape=(6,), dtype=int64)
tf.Tensor([2 6 5 5 4 1], shape=(6,), dtype=int64)


In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
housing = fetch_california_housing()

X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, housing.target.reshape(-1,1))
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data
  


In [None]:
scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [None]:
#too large datasets we split into many files
#then we ask tf to read some of these files in parallel, maybe creating a dataset on top of that that's shuffled
#example v

def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
  housing_dir = os.path.join("datasets", "housing")
  os.makedirs(housing_dir, exist_ok=True)
  path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

  filepaths = []
  m = len(data)
  for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
    part_csv = path_format.format(name_prefix, file_idx)
    filepaths.append(part_csv)
    with open(part_csv, "wt", encoding="utf-8") as f:
      if header is not None:
        f.write(header)
        f.write("\n")
      for row_idx in row_indices:
        f.write(",".join([repr(col) for col in data[row_idx] ])) # przetłumacz kolumnę na string, wpisz do pliku i dodaj przecinek
        f.write("\n")
  return filepaths


In [None]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]

header_cols = housing.feature_names + ["MedianHouseValue"]
print(header_cols)
header = ",".join(header_cols)
print(header)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedianHouseValue']
MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue


In [None]:
for a,b in enumerate(np.array_split(np.arange(20), 3)):
  print("aa ", a)
  print("\nb ", b)

aa  0

b  [0 1 2 3 4 5 6]
aa  1

b  [ 7  8  9 10 11 12 13]
aa  2

b  [14 15 16 17 18 19]


In [None]:
import pandas as pd
#take a peek at csv
pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,2.5363,44.0,5.06414,1.058309,1184.0,3.451895,33.92,-118.25,0.959
1,3.2321,40.0,4.280822,1.050228,869.0,1.984018,34.16,-118.3,2.436
2,4.7885,36.0,5.935275,1.022654,864.0,2.796117,34.21,-118.34,3.022
3,2.4414,21.0,4.795148,1.075472,1446.0,3.897574,36.86,-120.51,0.719
4,4.2192,14.0,5.816327,1.054945,3597.0,2.823391,32.74,-116.97,1.761


In [None]:
#take a peek at csv in text mode
with open(train_filepaths[0]) as f:
  for i in range(5):
    print(f.readline(), end="")

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
2.5363,44.0,5.0641399416909625,1.0583090379008746,1184.0,3.4518950437317786,33.92,-118.25,0.959
3.2321,40.0,4.280821917808219,1.0502283105022832,869.0,1.9840182648401827,34.16,-118.3,2.436
4.7885,36.0,5.935275080906149,1.022653721682848,864.0,2.796116504854369,34.21,-118.34,3.022
2.4414,21.0,4.795148247978437,1.0754716981132075,1446.0,3.8975741239892185,36.86,-120.51,0.719


In [None]:
train_filepaths

['datasets/housing/my_train_00.csv',
 'datasets/housing/my_train_01.csv',
 'datasets/housing/my_train_02.csv',
 'datasets/housing/my_train_03.csv',
 'datasets/housing/my_train_04.csv',
 'datasets/housing/my_train_05.csv',
 'datasets/housing/my_train_06.csv',
 'datasets/housing/my_train_07.csv',
 'datasets/housing/my_train_08.csv',
 'datasets/housing/my_train_09.csv',
 'datasets/housing/my_train_10.csv',
 'datasets/housing/my_train_11.csv',
 'datasets/housing/my_train_12.csv',
 'datasets/housing/my_train_13.csv',
 'datasets/housing/my_train_14.csv',
 'datasets/housing/my_train_15.csv',
 'datasets/housing/my_train_16.csv',
 'datasets/housing/my_train_17.csv',
 'datasets/housing/my_train_18.csv',
 'datasets/housing/my_train_19.csv']

In [None]:
# we've split the file to now create an input pipeline that will simulate how we handle a huge input

filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42
                                          )
print(isinstance(filepath_dataset, tf.data.Dataset))
for filepath in filepath_dataset:
  print(filepath)

True
tf.Tensor(b'datasets/housing/my_train_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_16.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_17.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_14.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_10.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_02.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_12.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_19.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_09.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_13.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_15.csv', shape=(), dtype=string)
t

In [None]:
n_readers = 5

dataset = filepath_dataset.interleave(
    lambda file: tf.data.TextLineDataset(file).skip(1),
    cycle_length = n_readers
)
#co tu się wydarzyło?
#stworzyliśmy dataset ze wszystkich ścieżek filepath_dataset

# mało tego, użyliśmy interleave
# interleave będzie brało po 5 plików naraz i zczytywało po linijce do tymczasowego datasetu
# filepath_dataset.interleave()
# interleave w ogóle bierze po linijce i je przeplatuje


# oprócz tego tworzymy wewnątrz interleave 5 kolejnych
# datasetów o nazwie file (nazwa może być dowolna)
# tworzone są one na podstaie brania 5 plików w interleave
# wewnątrz wzywamy funkcję dla każdego z tych plików (tu lambdę)
# i tworzymy dataset omijający pierwszą linię


for line in dataset.take(5):
  print(line)
  # still in tensors
  print(line.numpy())
  # numpy-friendly version

tf.Tensor(b'1.1728,27.0,3.9139240506329114,1.0506329113924051,1704.0,4.313924050632911,36.48,-119.44,0.417', shape=(), dtype=string)
b'1.1728,27.0,3.9139240506329114,1.0506329113924051,1704.0,4.313924050632911,36.48,-119.44,0.417'
tf.Tensor(b'2.2721,45.0,4.253112033195021,1.0428769017980637,1635.0,2.2614107883817427,38.31,-122.29,1.398', shape=(), dtype=string)
b'2.2721,45.0,4.253112033195021,1.0428769017980637,1635.0,2.2614107883817427,38.31,-122.29,1.398'
tf.Tensor(b'3.0375,42.0,5.652298850574713,1.0862068965517242,1459.0,4.192528735632184,34.03,-117.73,1.181', shape=(), dtype=string)
b'3.0375,42.0,5.652298850574713,1.0862068965517242,1459.0,4.192528735632184,34.03,-117.73,1.181'
tf.Tensor(b'2.5809,27.0,4.898601398601398,1.027972027972028,1306.0,4.566433566433567,34.96,-120.57,0.832', shape=(), dtype=string)
b'2.5809,27.0,4.898601398601398,1.027972027972028,1306.0,4.566433566433567,34.96,-120.57,0.832'
tf.Tensor(b'6.5253,26.0,7.297966401414677,1.0194518125552607,3460.0,3.059239610963

In [None]:
short = filepath_dataset.take(2)
new_dataset = short.map(lambda x: tf.data.TFRecordDataset(x).skip(1))
for x in new_dataset.take(1):
  print(x)

<_VariantDataset shapes: (), types: tf.string>


In [None]:
# Uwaga, pole 4 pozostało stringiem
record_defaults=[0, np.nan, tf.constant(np.nan, dtype=tf.float64), "Hello", tf.constant([])]
parsed_fields = tf.io.decode_csv('1,2,3,4,5', record_defaults) # przerabiamy linie 1..5 na tensory
# ponieważ to nie jest "prawdziwe csv" to mamy linijki dummy "1,2,3,4,5" (mini csv)
# jeśli nie ma wartości  to są zastępowane automatycznie wartościami domyślnymi
parsed_fields

[<tf.Tensor: shape=(), dtype=int32, numpy=1>,
 <tf.Tensor: shape=(), dtype=float32, numpy=2.0>,
 <tf.Tensor: shape=(), dtype=float64, numpy=3.0>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'4'>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.0>]

In [None]:
z = tf.data.TextLineDataset('datasets/housing/my_train_00.csv').skip(1)
for z in z:
  print(z)

tf.Tensor(b'2.5363,44.0,5.0641399416909625,1.0583090379008746,1184.0,3.4518950437317786,33.92,-118.25,0.959', shape=(), dtype=string)
tf.Tensor(b'3.2321,40.0,4.280821917808219,1.0502283105022832,869.0,1.9840182648401827,34.16,-118.3,2.436', shape=(), dtype=string)
tf.Tensor(b'4.7885,36.0,5.935275080906149,1.022653721682848,864.0,2.796116504854369,34.21,-118.34,3.022', shape=(), dtype=string)
tf.Tensor(b'2.4414,21.0,4.795148247978437,1.0754716981132075,1446.0,3.8975741239892185,36.86,-120.51,0.719', shape=(), dtype=string)
tf.Tensor(b'4.2192,14.0,5.816326530612245,1.054945054945055,3597.0,2.8233908948194664,32.74,-116.97,1.761', shape=(), dtype=string)
tf.Tensor(b'3.0903,42.0,3.4307692307692306,0.9138461538461539,1412.0,4.344615384615385,33.94,-118.22,1.535', shape=(), dtype=string)
tf.Tensor(b'2.817,19.0,6.108695652173913,1.1603260869565217,1207.0,3.279891304347826,33.72,-117.29,1.1', shape=(), dtype=string)
tf.Tensor(b'4.6327,34.0,5.552816901408451,0.9577464788732394,880.0,3.098591549

In [None]:
parsed_fields = tf.io.decode_csv(',,,,5', record_defaults)
parsed_fields
#nie zapewniliśmy wartości oprócz 5, zatem zastąpilniśmy dane wartościami domyślnymi


[<tf.Tensor: shape=(), dtype=int32, numpy=0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=nan>,
 <tf.Tensor: shape=(), dtype=float64, numpy=nan>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Hello'>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.0>]

In [None]:
# piąte pole jest obowiązkowe poprzez tf.constant([]), a tensory nie mogą być puste
try:
    parsed_fields = tf.io.decode_csv(',,,,', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Field 4 is required but missing in record 0! [Op:DecodeCSV]


In [None]:
# liczba pól musi pasowac idealnie z record_defaults
try:
    parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]


In [None]:
#preprocesowanie u nas to standaryzacja X
n_inputs = 8 # X_train.shape[-1]
@tf.function
def preprocess(line):
  defs = [0.] * 8 + [tf.constant([], dtype=tf.float32)]
  fields = tf.io.decode_csv(line, record_defaults = defs)
  #decode_csv stworzypo jednym tensorze na jedną komórkę
  x = tf.stack(fields[:-1]) # "wyudłużamy tensor z komórki do wiersza"
  y = tf.stack(fields[-1])
  return (x- X_mean) /X_std, y #standaryzowanie

In [None]:
defs = [0.] * 8 # 8 tabeli powtórzonych i spiętych
defs2 = [0.] * 8 + [tf.constant([], dtype=tf.float32)]

In [None]:
defs

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [None]:
defs2 # lista z dopiętym pustym tensorem w formie listy, w sumie wychodzi lista
#dokładamy pusty tensor na końcu, zeby zasugerować, że to obowiązkowe (cena mieszkań)


[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 <tf.Tensor: shape=(0,), dtype=float32, numpy=array([], dtype=float32)>]

In [None]:
# SPinamy wszystko w całość - oto helper, który ściągnie dane zwielu csv-ek

def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads= None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
  dataset = tf.data.Dataset.list_files(filepaths)
  dataset = dataset.interleave(
        lambda csv: tf.data.TextLineDataset(csv).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
  dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads) #zrobienie preprocesowania
  dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat) #ostatnie przemieszanie
  #jeśli mamy repeat(0), zbiór się powtarza w sposób nieograniczony, czyli przy każdym epochu 
  #PRZEMIESZA INACZEJ
  return dataset.batch(batch_size).prefetch(1) #opcjonalne batchowanie   

#prefetch for dummies
#prefetch tutaj chyba biezre cały batch na zapas
# miej w zanadrzu parę obiektów
# one będą już gotowe (czyli tu zapasowy batch) w preprocessingu
# kiedy główny batch jest mielony w treningu, drugi już czeka w kolejce
#profit w performensie



In [None]:
tf.random.set_seed(42)

train_set = csv_reader_dataset(train_filepaths, batch_size=3)
for X_batch, y_batch in train_set.take(3): #-1 bierze wszystkie
    print("X =", X_batch)
    print("y =", y_batch)
    print()

X = tf.Tensor(
[[-0.31286746  0.2609108   0.07322088  0.13157646 -0.56217456 -0.01650685
  -0.4937788   0.7337001 ]
 [ 2.061989    1.8417346   0.7736217  -0.2158263  -0.651396   -0.02649488
  -0.7514603   0.53430575]
 [-0.04741677  1.8417346   0.37591818  0.01447502 -0.38806286 -0.09007584
   1.3755722  -0.93123853]], shape=(3, 8), dtype=float32)
y = tf.Tensor([1.035   5.00001 1.645  ], shape=(3,), dtype=float32)

X = tf.Tensor(
[[ 0.45238286 -1.9522425   0.1143576  -0.1570392   1.7601808  -0.02326523
  -0.70929444  1.0028819 ]
 [-0.40013242 -0.21333635 -0.6893344  -0.0696196   0.6323529  -0.02521754
  -0.7186648   0.7187462 ]
 [-0.56078047 -1.1618307   0.2772512   0.5208741  -0.8488958  -0.03848417
  -1.00914     1.3368653 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor([1.787 2.141 1.375], shape=(3,), dtype=float32)

X = tf.Tensor(
[[-0.45195094  1.8417346  -0.2245777  -0.07893521 -0.66092443 -0.06706903
   1.0288763  -1.3399953 ]
 [-0.18416895 -1.0827894   0.34009367  0.11893617 -0.1

In [None]:
# tf.keras

In [None]:
train_set = csv_reader_dataset(train_filepaths, repeat=None) # infinite dataset, so it shuffles differently for each epoch
valid_set = csv_reader_dataset(valid_filepaths) # walidujemy za każdym razem  tak samo
test_set = csv_reader_dataset(test_filepaths)                #repeat none oznacza powtarzanie nieograniczone

keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
model = keras.models.Sequential([
                                 keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
                                 keras.layers.Dense(1)
])

model.compile(loss="mse", optimizer = keras.optimizers.SGD(lr = 1e-3))

batch_size = 32
model.fit(train_set, steps_per_epoch = len(X_train) // batch_size, epochs = 10, validation_data = valid_set)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f1f794f8b00>

In [None]:
"""train_set = csv_reader_dataset(train_filepaths, batch_size=3, repeat=None)
for X_batch, y_batch in train_set.take(-1): #-1 bierze wszystkie
    print("X =", X_batch)
    print("y =", y_batch)
    print()

    #looks like an infinite loop"""

'train_set = csv_reader_dataset(train_filepaths, batch_size=3, repeat=None)\nfor X_batch, y_batch in train_set.take(-1): #-1 bierze wszystkie\n    print("X =", X_batch)\n    print("y =", y_batch)\n    print()\n\n    #looks like an infinite loop'

In [None]:
model = keras.models.Sequential([])

In [None]:
model.evaluate(test_set, steps=len(X_test) // batch_size)



0.5400372743606567

In [None]:
new_set = test_set.map(lambda X, y: X) 
X_new = X_test
model.predict(new_set, steps = len(X_new) // batch_size)


array([[2.4011085 ],
       [2.251545  ],
       [1.5973581 ],
       ...,
       [0.80715495],
       [3.8291612 ],
       [1.0472566 ]], dtype=float32)

In [None]:
optimizer = keras.optimizers.Nadam(lr=0.01)
loss_fn = keras.losses.mean_squared_error
n_epochs = 5
batch_size =32
n_steps_per_epoch = len(X_train) // batch_size
total_steps = n_epochs * n_steps_per_epoch

global_step = 0

for X_batch, y_batch in train_set.take(total_steps):
  global_step+=1
  print("\rGlobal step {}/{}".format(global_step, total_steps), end="")
  with tf.GradientTape() as tape:
    y_pred = model(X_batch)
    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
    loss = tf.add_n([main_loss] + model.losses)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

Global step 1810/1810

In [None]:
for m in dir(tf.data.Dataset):
    if not (m.startswith("_") or m.endswith("_")):
        func = getattr(tf.data.Dataset, m)
        if hasattr(func, "__doc__"):
            print("● {:21s}{}".format(m + "()", func.__doc__.split("\n")[0]))

● apply()              Applies a transformation function to this dataset.
● as_numpy_iterator()  Returns an iterator which converts all elements of the dataset to numpy.
● batch()              Combines consecutive elements of this dataset into batches.
● cache()              Caches the elements in this dataset.
● cardinality()        Returns the cardinality of the dataset, if known.
● concatenate()        Creates a `Dataset` by concatenating the given dataset with this dataset.
● element_spec()       The type specification of an element of this dataset.
● enumerate()          Enumerates the elements of this dataset.
● filter()             Filters this dataset according to `predicate`.
● flat_map()           Maps `map_func` across this dataset and flattens the result.
● from_generator()     Creates a `Dataset` whose elements are generated by `generator`.
● from_tensor_slices() Creates a `Dataset` whose elements are slices of the given tensors.
● from_tensors()       Creates a `Dataset` 

In [None]:
optimizer = keras.optimizers.Nadam(lr= 0.01)
loss_fn = keras.losses.mean_squared_error
@tf.function
def train(model, n_epochs, batch_size=32, n_readers=5, n_read_threads=5, shuffle_buffer_size = 10000, n_parse_threads=5):
  train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,
                                 n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size, n_parse_threads=n_parse_threads,
                                 batch_size= batch_size)
  n_steps_per_epoch = len(X_train) // batch_size
  total_steps = n_epochs * n_steps_per_epoch
  global_step = 0
  for X_batch, y_batch in train_set.take(total_steps):
        global_step += 1
        if tf.equal(global_step % 100, 0):
            tf.print("\rGlobal step", global_step, "/", total_steps)
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

train(model, 5)

Global step 100 / 1810
Global step 200 / 1810
Global step 300 / 1810
Global step 400 / 1810
Global step 500 / 1810
Global step 600 / 1810
Global step 700 / 1810
Global step 800 / 1810
Global step 900 / 1810
Global step 1000 / 1810
Global step 1100 / 1810
Global step 1200 / 1810
Global step 1300 / 1810
Global step 1400 / 1810
Global step 1500 / 1810
Global step 1600 / 1810
Global step 1700 / 1810
Global step 1800 / 1810


In [None]:
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second record")

filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


In [None]:
filepaths = ["my_test_{}.tfrecord".format(i) for i in range(5)] #5 records
for i, filepath in enumerate(filepaths):
    with tf.io.TFRecordWriter(filepath) as f:
        for j in range(3):
            f.write("File {} record {}".format(i, j).encode("utf-8"))

dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=3)
for item in dataset:
    print(item)

#taking 3 at the time in parallel, interleave

tf.Tensor(b'File 0 record 0', shape=(), dtype=string)
tf.Tensor(b'File 1 record 0', shape=(), dtype=string)
tf.Tensor(b'File 2 record 0', shape=(), dtype=string)
tf.Tensor(b'File 0 record 1', shape=(), dtype=string)
tf.Tensor(b'File 1 record 1', shape=(), dtype=string)
tf.Tensor(b'File 2 record 1', shape=(), dtype=string)
tf.Tensor(b'File 0 record 2', shape=(), dtype=string)
tf.Tensor(b'File 1 record 2', shape=(), dtype=string)
tf.Tensor(b'File 2 record 2', shape=(), dtype=string)
tf.Tensor(b'File 3 record 0', shape=(), dtype=string)
tf.Tensor(b'File 4 record 0', shape=(), dtype=string)
tf.Tensor(b'File 3 record 1', shape=(), dtype=string)
tf.Tensor(b'File 4 record 1', shape=(), dtype=string)
tf.Tensor(b'File 3 record 2', shape=(), dtype=string)
tf.Tensor(b'File 4 record 2', shape=(), dtype=string)


In [None]:
#protobufs


%%writefile person.proto
syntax = "proto3";
message Person {
  string name = 1;
  int32 id = 2;
  repeated string email = 3;
}

!protoc person.proto --python_out=. --descriptor_set_out=person.desc --include_imports
!ls person*

Overwriting person.proto


In [None]:
from person_pb2 import Person

person = Person(name="Al", id=123, email=["a@b.com"])  # create a Person
print(person)  # display the Person

ModuleNotFoundError: ignored

In [None]:
#preprocessing layer directly in the model
#the alternative is to do it beforehand in nuumpy pandas or scikit learn



class Standarization(keras.layers.Layer):
  def adapt(self, data_sample):
    self.means_ = np.mean(data_sample, axis=0, keepdims=True)
    self.stds_ = np.std(data_sample,axis=0, keepdims=True)
  def call(self, inputs):
    return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())


STD = Standarization()
STD.adapt(X_train[:500])

model = keras.Sequential()
model.add(STD)
#...
#model.compile(...)
#model.fit(...)
#model.evaluate()

In [None]:
#encoding categorical features babe

#one-hot

import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
fetch_housing_data()
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
#first we need to hook up each category to index from 0-4

vocab= ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']
indices = tf.range(len(vocab), dtype = tf.int64)

In [None]:
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
# [1h ocean - 0, inland - 1..]
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
# [1h ocean - 0, inland - 1.., near-ocean - 4, UNKNOWN1 - 5, UNKNOWN2 - 6]

In [None]:
print(table_init)

<dtype: 'int64'>


In [None]:
housing_median_age = tf.feature_column.numeric_column("housing_median_age")

In [None]:
housing_median_age[2]

In [None]:
X_mean

array([ 3.86549233e+00,  2.86990525e+01,  5.42064930e+00,  1.09544048e+00,
        1.42299199e+03,  3.10735866e+00,  3.56339371e+01, -1.19571858e+02])

In [None]:
X_train[:5, 1:2]

NameError: ignored

In [None]:
#okej
#na spokojnie
#
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
X_mean

array([ 3.89175860e+00,  2.86245478e+01,  5.45593655e+00,  1.09963474e+00,
        1.42428122e+03,  2.95886657e+00,  3.56464315e+01, -1.19584363e+02])

In [None]:
#mikro przykład

#mamy X_std i X_mean ze starego datasetu
#Z user guide scikitlearn wiemy, że zakodowano median age w kolumnie 1

#teraz mamy API modyfikowania kolumn
#kolumna kategoryzacyjna na podstawie listy słów 
ocean_prox_vocab = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']
ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list(
    "ocean_proximity", ocean_prox_vocab)

ocean_proximity_embed = tf.feature_column.embedding_column(ocean_proximity,
                                                           dimension=2) # zaembedowana zmienna kategoryzacyjna
median_income = tf.feature_column.numeric_column("median_income") #oznaczamy sobie, że to będzie kolumna wiaderek
bucketized_income = tf.feature_column.bucketized_column(
    median_income, boundaries=[1.5, 3., 4.5, 6.])

In [None]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [None]:
msome_columns = [ocean_proximity_embed, bucketized_income] #zdefiniowane kolumny z transformacjami
dense_features = keras.layers.DenseFeatures(msome_columns) #zdefiniowanie warstwy, która zaaplikuje transformacje
dense_features({
    "ocean_proximity": [["NEAR OCEAN"], ["INLAND"], ["INLAND"]], # wszelkie transformacje
    "median_income": [[3.], [7.2], [1.]]
})

# wynikiem tego wszystkiego jest de facto tabelka numeryczna

<tf.Tensor: shape=(3, 7), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        -1.0993124 ,  0.9000791 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        -0.15603873,  0.6362506 ],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.15603873,  0.6362506 ]], dtype=float32)>

In [None]:
#przerobione wszystkie kolumny po kolei
# tworzymy kolumnę z id housing_median_age (w cudysłowie)
age_mean, age_std = X_mean[1], X_std[1]  # The median age is column  1
housing_median_age = tf.feature_column.numeric_column(
    "housing_median_age", normalizer_fn=lambda x: (x -   age_mean) / age_std) #tworzymy kolumnę, która normalizuje swoje dane
# Just an example, it's not used later on
city_hash = tf.feature_column.categorical_column_with_hash_bucket(
    "city", hash_bucket_size=1000) 
###
#hashe
# Sandomierz -> hashfn -> hash = 42043 -> wrzucony jest do wiadra numer 42043 % 1000 = 43
# w wiadrze numer 43 mogą być inne miasta, ale skraca to czas wyszukiwania

bucketized_age = tf.feature_column.bucketized_column(
    housing_median_age, boundaries=[-1., -0.5, 0., 0.5, 1.]) # age was scaled
age_and_ocean_proximity = tf.feature_column.crossed_column(
    [bucketized_age, ocean_proximity], hash_bucket_size=100) # po prostu łączona kolumna, która dzieli "łączną informację" na 100 wiader



latitude = tf.feature_column.numeric_column("latitude")
longitude = tf.feature_column.numeric_column("longitude")
bucketized_latitude = tf.feature_column.bucketized_column(
    latitude, boundaries=list(np.linspace(32., 42., 20 - 1)))
bucketized_longitude = tf.feature_column.bucketized_column(
    longitude, boundaries=list(np.linspace(-125., -114., 20 - 1)))
location = tf.feature_column.crossed_column(
    [bucketized_latitude, bucketized_longitude], hash_bucket_size=1000)

median_house_value = tf.feature_column.numeric_column("median_house_value") #cena

columns = [housing_median_age, median_house_value]
feature_descriptions = tf.feature_column.make_parse_example_spec(columns) # wiadomo, czego się spodziewać w środku kolumny
feature_descriptions

{'housing_median_age': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None),
 'median_house_value': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None)}

In [None]:
BytesList = tf.train.BytesList
FloatList = tf.train.FloatList
Int64List = tf.train.Int64List
Feature = tf.train.Feature
Features = tf.train.Features
Example = tf.train.Example



with tf.io.TFRecordWriter("my_data_with_features.tfrecords") as f: #"przepisuję" na tfrecord
    for x, y in zip(X_train[:, 1:2], y_train): # mamy kolumnę 1 i igreki
        example = Example(features=Features(feature={
            "housing_median_age": Feature(float_list=FloatList(value=[x])), # still raw numbers  
            "median_house_value": Feature(float_list=FloatList(value=[y])) 
        }))
        # tworzymy po jednej zmiennej example dla każdej pary, z cechami housing median age i median house value
        f.write(example.SerializeToString())

In [None]:
"""
Stworzenie osoby , która jest instancją protobufa Example (który jest standardowy)
person_example = Example(
    features=Features(
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id": Feature(int64_list=Int64List(value=[123])),
            "emails": Feature(bytes_list=BytesList(value=[b"a@b.com", b"c@d.com"]))
        }))

with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    f.write(person_example.SerializeToString())

"""

def parse_examples(serialized_examples):
    examples = tf.io.parse_example(serialized_examples, feature_descriptions) # Zczytuj batch by batch
    # każdy przykład to swój tensor z jedną zmienną
    print(examples)
    targets = examples.pop("median_house_value") # separate the targets
    return examples, targets

batch_size = 32
dataset = tf.data.TFRecordDataset(["my_data_with_features.tfrecords"])
dataset = dataset.repeat().shuffle(10000).batch(batch_size).map(parse_examples)
for data in dataset.take(1):
  print(data) # ceny są niepodpisane, ale są

{'housing_median_age': <tf.Tensor 'ParseExample/ParseExampleV2:0' shape=(None, 1) dtype=float32>, 'median_house_value': <tf.Tensor 'ParseExample/ParseExampleV2:1' shape=(None, 1) dtype=float32>}
({'housing_median_age': <tf.Tensor: shape=(32, 1), dtype=float32, numpy=
array([[49.],
       [46.],
       [23.],
       [13.],
       [11.],
       [16.],
       [31.],
       [27.],
       [30.],
       [43.],
       [30.],
       [20.],
       [33.],
       [24.],
       [48.],
       [29.],
       [15.],
       [45.],
       [52.],
       [10.],
       [40.],
       [52.],
       [37.],
       [42.],
       [16.],
       [33.],
       [37.],
       [29.],
       [35.],
       [30.],
       [33.],
       [27.]], dtype=float32)>}, <tf.Tensor: shape=(32, 1), dtype=float32, numpy=
array([[2.679  ],
       [3.882  ],
       [2.417  ],
       [2.979  ],
       [2.059  ],
       [2.488  ],
       [1.969  ],
       [0.968  ],
       [1.821  ],
       [2.977  ],
       [5.00001],
       [1.655  ],


In [None]:
i = 0
for serialized_example in tf.data.TFRecordDataset(["my_data_with_features.tfrecords"]): # długie stringi tensorowe
 #print(serialized_example) # printss a lot of long byte string tensors like
  #
  #tf.Tensor(b'\n@\n\x1e\n\x12housing_median_age\x12\x08\x12\x06\n\x04\x00\x00\x08B\n\x1e\n\x12median_house_value\x12\x08\x12\x06\n\x04P\x8d\x87?', shape=(), dtype=string)
  parsed_example = tf.io.parse_single_example(serialized_example, feature_descriptions) # one by one
  #print(parsed_example)
  #ładny tensor typu
  #{'housing_median_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([39.], dtype=float32)>, 'median_house_value': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.292], dtype=float32)>}
  # już nie dtype string


[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
{'housing_median_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([39.], dtype=float32)>, 'median_house_value': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.292], dtype=float32)>}
{'housing_median_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([33.], dtype=float32)>, 'median_house_value': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.929], dtype=float32)>}
{'housing_median_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([38.], dtype=float32)>, 'median_house_value': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.827], dtype=float32)>}
{'housing_median_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([24.], dtype=float32)>, 'median_house_value': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.153], dtype=float32)>}
{'housing_median_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([52.], dtype=float32)>, 'median_house_value': <tf.Tensor: s

In [None]:
columns_without_target = columns[:-1]
model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns=columns_without_target),#tu dopiero standaryzacja???
    keras.layers.Dense(1)
])
model.compile(loss="mse",
              optimizer=keras.optimizers.SGD(lr=1e-3),
              metrics=["accuracy"])
model.fit(dataset, steps_per_epoch=len(X_train) // batch_size, epochs=5)

Epoch 1/5
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f16b0c9ad68>

In [None]:
#zadanie 10

#to-do

#załadować fash mnist
#podzielić
#pomieszać training set za pomocą shuffle
#zapisać training/val/test do wielu TFRecordów


import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

In [None]:

print(tfds.builder("fashion_mnist").info)

tfds.core.DatasetInfo(
    name='fashion_mnist',
    version=3.0.0,
    description='Fashion-MNIST is a dataset of Zalando's article images consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes.',
    homepage='https://github.com/zalandoresearch/fashion-mnist',
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=tf.uint8),
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=10),
    }),
    total_num_examples=70000,
    splits={
        'test': 10000,
        'train': 60000,
    },
    supervised_keys=('image', 'label'),
    citation="""@article{DBLP:journals/corr/abs-1708-07747,
      author    = {Han Xiao and
                   Kashif Rasul and
                   Roland Vollgraf},
      title     = {Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning
                   Algorithms},
      journal   = {CoRR},
      volume

In [None]:
#zapisać te wszystkie datasety do wielu tfrecordów
#każdy tfrecord poinien być zserializowanym protobufem example z dwoma featurami , serializowane zdjęcie i label
import tensorflow as tf
BytesList = tf.train.BytesList
FloatList = tf.train.FloatList
Int64List = tf.train.Int64List
Feature = tf.train.Feature
Features = tf.train.Features
Example = tf.train.Example

#stworzyć record dla train
with tf.io.TFRecordWriter("my_train_001.tfrecord") as f:
  for image, label in X_train, y_train:
    im = tf.io.serialize_tensor(image)
    
    ex = Example(
        features = Features(
            feature={
                "image": Feature(bytes_list=BytesList(value=[im.numpy()])),
                "label": Feature(int64_list=Int64List(value=[label]))
            }
        )
    )
    serialized_ex = ex.SerializeToString()
    f.write(serialized_ex)

ValueError: ignored

In [None]:
for ex in train.take(1):
  print(ex["label"].numpy()[0])

0


In [None]:
for (image, label) in (X_train, y_train):
  print(image)

ValueError: ignored

In [None]:
for image in X_train:
  print(image)

[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
    0   1   0  18 107  89  95  56  29   7]
 [ 38  41  41  23  66 107  73  69 124  84   0  96 168 122  87   0   0   0
    0   0   0  18  87  64  76  49  27   6]
 [ 52  38  36  36  35  49  67  62  58  95  84 113  73 130  32   0   1   1
    0   1   0  18  69  46  62  44  33   3]
 [ 33  18  12   7  12  23  10  46  58  47  62  75  66 121   0   0   0   0
    0   1   0  24  79  55  61  39  33   0]
 [ 26  19  12  12  18  13  27  12   4  21  15  38  82  99   0   0   0   0
    0   1   0  16  75  55  61  39  35   0]
 [ 39  72  26  10   3   0  10  16  24  26  38  23 136  33   0   1   0   0
    1   0   0   3  32  21   7   0  12   0]
 [  0  29  73  81  76  56  44  26   6  13  23  75 122   0   0   0   0   0
    0   1   0  19 119  96  79  86  79   0]
 [  0   0   0   0   6  36  50  64  84  79  96 113  18   0   1   0   0   0
    0   1   0  21 135 115 110 102  72   0]]
[[  0   0   0   0   0   0   0   0   0   0   0   0 150 153 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
   28  15  22   7  10   2   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]
[[  0   0   0   0   0   0   0   0   0  18 134 123 126 124 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
  205 214  73   0   2   0   0   0   0   0]
 [  0   0   0   0   0   2   0   0 216 211 208 212 211 210 210 210 211 213
  207 214 159   0   0   0   0   0   0   0]
 [  0   0   0   0   0   4   0  14 201 208 210 212 211 211 212 211 210 211
  211 210 201   0   0   0   0   0   0   0]
 [  0   0   0   0   0   3   0  46 215 207 210 212 215 217 217 216 213 210
  210 211 215   0   0   1   0   0   0   0]
 [  0   0   0   0   0   3   0  79 220 208 217 216 218 218 217 218 217 215
  214 211 194   0   0   2   0   0   0   0]
 [  0   0   0   0   0   2   0 105 220 207 213 212 213 212 212 214 215 215
  213 210 200   0   0   2   0   0   0   0]
 [  0   0   0   0   0   1   0 156 227 215 235 234 234 234 234 234 233 232
  233 218 214  32   0   1   0   0   0   0]
 [  0   0   0   0   0   0   0 113 182 160 166 163 164 162 161 161 162 161
  162 169 194  33   0   1   0   0   0   0]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
   79  91  38 120 113  90  53   0   0   0]
 [  0   0   0  29 109  90 112  84 112  82  85  84  86  86  83  87  97  83
   72  91  45 110 117  87  60   0   0   0]
 [  0   0   0  36 103  93 102  86 116  81  87  89  93  89  80  81  97  90
   73  92  42  86 128  83  65   0   0   0]
 [  0   0   0  50 104 104  94  79 115  85  88  85  87  93  91  79  93  88
   74  90  24  77 136  86  68   0   0   0]
 [  0   0   0  63  99 108  84  36 142 106 103  87  87 102 105  89  98 112
  104 130  11  94 145  85  68   0   0   0]
 [  0   0   0  85  90 101  87  23 165 106 123 129 131 111 127 121 119 105
  108 133  19  91 119  90  82   0   0   0]
 [  0   0   0  63 139 134  69   0  41  38  37  42  44  40  40  44  39  30
   25  21   0  76 133 117  55   0   0   0]
 [  0   0   0  18  84  82  37   0   0   0   0   0   0   0   0   0   0   0
    0   0   0  41 115  96  12   0   0   0]]
[[  0   0   0   0   0   2   1   2   0   0  54 231 174 161 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
  137 139 146 130 239 143 172 160   0   0]
 [  0   0   0  20 113 180 205 110 103 101  99 103  93  96  94  94  96  93
   89  96  98  82 110 215 236  56   0   0]
 [  0   0   0  10 143 136 180 237 194 205 222 205 199 205 208 210 213 215
  223 227 229 230 201 189 141  82   0   0]
 [  0   0   0  89 156 170 187 206 230 251 242 236 220 215 220 220 227 230
  222 220 223 227 217 186 111  93   0   0]
 [  0   0   0 110 139 148 182 213 213 218 170 201 227 184 174 148 149 149
  167 167 170 174 174 170 144  84   0   0]
 [  0   0   0 124 111  98  94 124 151 179 156 210 248 229 218 205 196 191
  191 186 184 186 184 189 196 201   0   0]
 [  0   0   0  98 132 139 139 168 172 194 179 205 222 208 201 196 192 184
  172 172 175 180 187 153 125  41   0   0]
 [  0   0   0   0   0  22  46  91  89 106  60   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]
[[  0   0   0   0   1   1   0   0   0  55 151 120  89  96 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
  162 138 193 188 155 191  55   0   0   0]
 [  0   0   0   0  52 172 146 182 169 147 156 149 160 157 149 175 165 157
  161 152 182 187 157 193  60   0   0   0]
 [  0   0   0   0  65 170 147 180 164 149 165 167 168 164 155 180 172 165
  177 154 171 187 161 193  68   0   0   0]
 [  0   0   0   0  72 173 151 186 162 142 152 154 164 177 171 187 162 164
  167 153 174 195 160 194  79   0   0   0]
 [  0   0   0   0  76 171 162 165 157 173 153 154 156 160 150 164 157 156
  149 176 151 176 175 175  82   0   0   0]
 [  0   0   0   0  81 152 159 143  32 215 197 200 196 191 192 195 196 196
  193 204   0 151 176 170  78   0   0   0]
 [  0   0   0   0 133 174 176 184   0   0  73 143 159 170 170 160 150 153
   91   0   0 185 183 180 129   0   0   0]
 [  0   0   0   0  43 113 120  72   0   0   0   0   0   0   0   0   0   0
    0   0   0  82 158 151  61   0   0   0]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
  223 219 223 180 109 192  14   0   0   0]
 [  0   0   0   0   0 222 190 183 180 179 189 210 217 222 224 223 226 226
  226 221 219 195 135 189   0   0   0   0]
 [  0   0   0   0   0 200 192 172 182 191 210 214 216 223 221 219 213 217
  218 218 223 211 168 134   0   0   0   0]
 [  0   0   0   0   0 153 209 173 190 193 193 209 216 218 220 223 226 229
  228 226 226 206 158  82   0   0   0   0]
 [  0   0   0   0   0  63 212 184 185 203 207 201 216 227 229 231 234 236
  233 230 228 220 197   3   0   0   0   0]
 [  0   0   0   0   0   0 183 195 200 229 227 227 216 219 227 224 224 229
  228 231 229 226 167   0   0   0   0   0]
 [  0   0   0   0   0   0  48 208 213 224 233 250 255 249 241 243 243 244
  244 247 231 250 158   0   1   0   0   0]
 [  0   0   0   0   0   0   0 188 207 201 206 180 142 154 189 192 192 189
  187 191 174 162  43   0   0   0   0   0]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
   12  79 109 123  74  70 102  34   7   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]
[[  0   0   0   0   0   0   0   0   0   3   0   0 191 159 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
  164 152 125 133 167 149 167 147 147  76]
 [ 67 138 127 121 115  99  92  82  72  86  98  98  79 121 145 102 115 129
  145 185 182 136 112 112 121  92 104  61]
 [  0  27  87 130 150 150 138 132 138 139 133 136 121  89  87  95  90  76
   73  90  95  86  87  87  81  76  79  59]
 [  0   0   0   0  36  93 141 170 172 164 158 144 122 121 121 115 116 122
  127 121 121 127 129 129 125 116 119  79]
 [  0   1   0   0   0   0   0   7  30  50  76  90  87  81  78  75  75  75
   75  79  75  66  59  56  47  36  26   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]
[[  0   0   0   0   0   0   0   0   0 130 120   0   0   0 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
  160 221 226 227 225 235 219 214 255  35]
 [  0  63 165 141 143 115 114 139 150 149 133 139 144 163 163 186 161 179
  255 145  32 244 220 221 218 215 254  26]
 [  0 134 134 126 121 118 115 122 132 134 145 144 138 163 163 169 203 221
   46   0   5 255 214 219 214 212 248  21]
 [ 85 187 167 167 151 130 124 122 122 121 133 129 162 179 197 199 142   0
    0   0  45 223 208 212 208 209 235  16]
 [ 83 206 224 232 230 218 233 227 226 230 231 219 226 219 170   0   0   0
    4   0  78 232 210 231 224 213 255  21]
 [  0   0  36  59 107 125 147 178 178 175 180 139  70  37   0   0   0   0
    3   0  61 168 136 151 152 153 169   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0 

KeyboardInterrupt: ignored