In [None]:
# Importujeme si opět knihovny jako minule
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf
import keras

print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

from keras import layers
%matplotlib ipympl

In [None]:
# Dataset reference:
# https://keras.io/api/datasets/california_housing/
from keras.datasets import california_housing

# Labels for the eight feature columns; used below when the arrays are
# wrapped in a DataFrame.
feature_names = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms",
    "Population", "AveOccup", "Latitude", "Longitude",
]
target_names = ["MedianHouseValue"]

# Load the "small" variant of the dataset, returned as a train and a test pair.
(train_data, train_target), (test_data, test_target) = california_housing.load_data(
    version="small"
)

# Display the shapes of both feature arrays.
(train_data.shape, test_data.shape)

In [None]:
# Divide the targets by 1000 so they are expressed in thousands.
# Note: re-running this cell divides the already rescaled targets again.
train_target, test_target = train_target / 1000, test_target / 1000

# Summary statistics of the rescaled test targets.
"mean: {}, min: {}, max: {}".format(
    np.mean(test_target), np.min(test_target), np.max(test_target)
)

In [None]:
# Show the first ten training rows with named feature columns.
pd.DataFrame(data=train_data, columns=feature_names).head(n=10)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the training data for validation. A fixed random_state
# makes the split itself reproducible after a kernel restart; without it a
# different subset is held out on every run.
x_train, x_validate, y_train, y_validate = train_test_split(
    train_data, train_target, test_size=0.2, random_state=42
)

x_train.shape, x_validate.shape

In [9]:
def normalize(data, mean=None, std=None):
    """Standardize ``data`` column-wise: ``(data - mean) / std``.

    Args:
        data: Array-like with samples along axis 0 (must provide ``.mean`` and
            ``.std`` accepting ``axis=0`` when the statistics are not given).
        mean: Optional per-column mean to subtract. When omitted it is
            computed from ``data`` itself.
        std: Optional per-column standard deviation to divide by. When omitted
            it is computed from ``data`` itself.

    Returns:
        The standardized data. Pass the ``mean``/``std`` of the training set
        to scale validation or test data the same way as the training data.
    """
    if mean is None:
        mean = data.mean(axis=0)
    if std is None:
        std = data.std(axis=0)
    # A constant column has zero spread; divide by 1 instead so the column
    # becomes 0 rather than NaN/inf.
    std = np.where(std == 0, 1.0, std)
    return (data - mean) / std

# Held-out data must be scaled with the statistics of the data the model is
# trained on. Standardizing each split with its own mean/std would feed the
# model differently scaled inputs at validation/test time.
split_mean, split_std = x_train.mean(axis=0), x_train.std(axis=0)
x_train = (x_train - split_mean) / split_std
x_validate = (x_validate - split_mean) / split_std

# The test set is scaled with the statistics of the full training set, which
# is what the final model further below is fit on.
full_mean, full_std = train_data.mean(axis=0), train_data.std(axis=0)
train_data = (train_data - full_mean) / full_std
test_data = (test_data - full_mean) / full_std

In [None]:
from sklearn.model_selection import KFold

def build_model():
    """Build and compile a small fully connected regression network.

    The input width is read from the notebook-level ``train_data`` array
    (its second dimension).

    Returns:
        A ``keras.Sequential`` model with two 64-unit ReLU layers and a single
        linear output unit, compiled with the Adam optimizer, MSE loss and
        MAE as the reported metric.
    """
    model = keras.Sequential([
        # Declare the input with an explicit Input object; Keras 3 warns when
        # `input_shape` is passed to a layer inside a Sequential model.
        keras.Input(shape=(train_data.shape[1],)),
        layers.Dense(64, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(1)
    ])

    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model

kf = KFold(n_splits=10)
fold_scores = []  # one {"loss": ..., "mae": ...} dict per fold
for train_idx, valid_idx in kf.split(train_data, train_target):
    # Scale both parts of the fold with the statistics of the training part
    # only, so the validation part is transformed exactly like the data the
    # model was fit on.
    fold_mean = train_data[train_idx].mean(axis=0)
    fold_std = train_data[train_idx].std(axis=0)
    kf_x_train = (train_data[train_idx] - fold_mean) / fold_std
    kf_x_validate = (train_data[valid_idx] - fold_mean) / fold_std

    model = build_model()
    model.fit(
        kf_x_train, train_target[train_idx],
        validation_data=(kf_x_validate, train_target[valid_idx]),
        epochs=10,
        batch_size=8,
        verbose=0
    )
    # Keep the metrics of every fold so they can be compared afterwards
    # instead of being lost after the progress bar is printed.
    fold_scores.append(
        model.evaluate(
            kf_x_validate, train_target[valid_idx], batch_size=8, return_dict=True
        )
    )

# Validation metrics per fold, one row per fold.
pd.DataFrame(fold_scores)

In [11]:
# Baseline network: two hidden layers of 128 ReLU units and one linear output.
model = keras.Sequential([
    # Explicit Input object instead of `input_shape=` on the first Dense
    # layer, which Keras 3 warns against in Sequential models.
    keras.Input(shape=(x_train.shape[1],)),
    layers.Dense(128, activation="relu"),
    layers.Dense(128, activation="relu"),
    layers.Dense(1)
])

model.compile(
    optimizer="adam",
    loss="mse",  # mean squared error – the loss being minimized
    # Mean absolute error – regression metric:
    #   sum(abs(y_true - y_pred)) / n
    # The targets were divided by 1000 earlier, so an MAE of 0.5 means the
    # predictions are off by 500 dollars on average (in either direction).
    metrics=["mae"],
)

# Train silently; the per-epoch metrics are kept in `trained.history`.
trained = model.fit(
    x_train, y_train,
    validation_data=(x_validate, y_validate),
    batch_size=4,
    epochs=300,
    verbose=False,
)

In [None]:
# Collect the per-epoch metrics recorded by fit() into a table.
history = pd.DataFrame(data=trained.history)

# Training vs. validation loss, leaving out the first ten epochs.
history[["loss", "val_loss"]].loc[10:].plot()
# The same plot for MAE: history[["mae", "val_mae"]].loc[10:].plot()

# Metrics of the last epoch.
history.iloc[-1]

# We can see that after roughly 100 epochs the model starts to overfit.

In [None]:
# Evaluate the model on the test data.
results = model.evaluate(x=test_data, y=test_target, batch_size=32)

In [None]:
# Same architecture as before, with a Dropout layer (rate 0.2) placed before
# the output layer.
model = keras.Sequential([
    # Explicit Input object instead of `input_shape=` on the first Dense
    # layer, which Keras 3 warns against in Sequential models.
    keras.Input(shape=(x_train.shape[1],)),
    layers.Dense(128, activation="relu"),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(1)
])

model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mae"],
)

# Train silently; the per-epoch metrics are kept in `trained.history`.
trained = model.fit(
    x_train, y_train,
    validation_data=(x_validate, y_validate),
    batch_size=16,
    epochs=500,
    verbose=False,
)

In [None]:
# Collect the per-epoch metrics recorded by fit() into a table.
history = pd.DataFrame(data=trained.history)

# Loss and MAE curves (training vs. validation), leaving out the first ten epochs.
history[["loss", "val_loss"]].loc[10:].plot()
history[["mae", "val_mae"]].loc[10:].plot()

# Metrics of the last epoch.
history.iloc[-1]

In [16]:
# Once the architecture and the model configuration are settled, the model can
# be trained on the whole training set.
trained = model.fit(
    x=train_data,
    y=train_target,
    batch_size=16,
    epochs=500,
    validation_data=(test_data, test_target),
    verbose=False,
)

In [None]:
# Collect the per-epoch metrics recorded by fit() into a table.
history = pd.DataFrame(data=trained.history)

# Training vs. validation loss, leaving out the first ten epochs.
history[["loss", "val_loss"]].loc[10:].plot()
# The same plot for MAE: history[["mae", "val_mae"]].loc[10:].plot()

# Metrics of the last epoch.
history.iloc[-1]

# What is the mistake here?

In [None]:
# Evaluate the model on the test data.
results = model.evaluate(x=test_data, y=test_target, batch_size=32)

In [None]:
# Sami si vyzkoušejte ladění hyperparametrů a architektury modelu.