# Python Deep Learning

Notebook内容全部来源于《Python深度学习》，涵盖书中全部内容，以练习为主，理论知识较少，该书作者：Francois Chollet，即Keras之父，该书译者：张亮；

In [None]:
import os,sys,random,gc
import numpy as np
import pandas as pd
import keras
from keras import Input,models,layers,optimizers,preprocessing
from keras.datasets import mnist,imdb,reuters,boston_housing
from keras.utils import to_categorical
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import VGG16
from keras.models import load_model
from keras import backend as K
import matplotlib.pyplot as plt
from scipy.stats import norm

%matplotlib inline

In [None]:
# Load MNIST, which ships pre-split into training and test sets.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Report the shape of each of the four arrays.
for loaded in (train_images, train_labels, test_images, test_labels):
    print(loaded.shape)

## 神经网络数学基础

In [None]:
# Flatten every image, rank-3 tensor (60000, 28, 28) -> rank-2 tensor (60000, 784),
# and rescale the pixel values from 0~255 to 0~1 in the same expression.
train_x = train_images.reshape((60000, 28 * 28)).astype("float32") / 255
test_x = test_images.reshape((10000, 28 * 28)).astype("float32") / 255

# One-hot encode the integer labels.
train_y, test_y = to_categorical(train_labels), to_categorical(test_labels)

In [None]:
from keras import models,layers

# Network architecture, declared as a list of layers.
network = models.Sequential([
    # Densely (fully) connected hidden layer; each input is a rank-1 tensor of length 28*28.
    layers.Dense(512, activation="relu", input_shape=(28 * 28,)),
    # Output layer: 10 labels, softmax activation as is usual for multi-class problems.
    layers.Dense(10, activation="softmax"),
])

# Compilation step: optimizer, loss function and the metric to monitor.
network.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train for 5 epochs with mini-batches of 128 samples.
network.fit(train_x, train_y, epochs=5, batch_size=128)

# Evaluate on the held-out test set.
test_loss, test_acc = network.evaluate(test_x, test_y)

print("test loss:", test_loss)
print("test accuracy:", test_acc)
gc.collect()

In [None]:
# Print the rank (number of axes), shape and dtype of the training image tensor.
print("张量阶：",train_images.ndim)
print("张量形状：",train_images.shape)
print("张量数据类型：",train_images.dtype)

In [None]:
import matplotlib.pyplot as plt
# Display the training image at index 4 using the binary colormap.
plt.imshow(train_images[4],cmap=plt.cm.binary)

## 电影评论情感分类

- 数据源：IMDB；
- 特点：评论向量长度不一，这个长度对应的是评论文本的长度；

整数序列需要处理后再送入网络：
- 方法1：填充序列，使其具有相等的长度，然后送入Embedding层；
- 方法2：one-hot处理，每个序列转为长度为10000（num_words）的向量，然后送入Dense层处理；

In [None]:
(train_data,train_labels),(test_data,test_labels) = imdb.load_data(num_words=10000) # keep only the 10,000 most frequently occurring words

# Report the shape of each of the four arrays.
print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
print(test_labels.shape)

In [None]:
def review(vec,dict_):
    """Decode a sequence of word indices into a space-separated string.

    Args:
        vec: iterable of word indices (anything ``int()`` accepts).
        dict_: mapping of word -> index; it is inverted locally and the
            caller's mapping is left untouched.

    Returns:
        The decoded words joined by single spaces. Each index is shifted down
        by 3 before the lookup (presumably because the Keras text datasets
        reserve the lowest indices for special tokens -- confirm against the
        dataset docs); indices with no matching word are rendered as "?".
    """
    index_to_word = {}
    for word, index in dict_.items():
        index_to_word[index] = word

    words = []
    for token in vec:
        words.append(index_to_word.get(int(token) - 3, "?"))
    return " ".join(words)

In [None]:
# Word -> index mapping that ships with the IMDB dataset.
dict_ = imdb.get_word_index()

# Show the first training review as raw indices, as decoded text, and its label.
print(train_data[0])
print(review(train_data[0],dict_))
print(train_labels[0])
gc.collect()

In [None]:
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode a collection of integer index sequences.

    Args:
        sequences: sized iterable whose items are the column indices to
            switch on for the corresponding row.
        dimension: width of every output row.

    Returns:
        A float array of shape ``(len(sequences), dimension)`` in which row
        ``i`` holds 1 at every index listed in ``sequences[i]`` and 0 elsewhere.
    """
    encoded = np.zeros(shape=(len(sequences), dimension))
    for row_idx, word_indices in enumerate(sequences):
        # Fancy indexing sets all listed columns of this row at once.
        encoded[row_idx, word_indices] = 1
    return encoded

# Multi-hot encode the training and test reviews.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# Inspect the resulting shapes and the first encoded sample of each set.
print(x_train.shape)
print(x_train[0])
print(x_test.shape)
print(x_test[0])

In [None]:
network = models.Sequential([
    # ReLU (rectified linear) activations make the representation space non-linear.
    layers.Dense(16, activation="relu", input_shape=(10000,)),
    layers.Dense(16, activation="relu"),
    # Sigmoid squashes the output into 0~1 so it can be read as the class probability.
    layers.Dense(1, activation="sigmoid"),
])

network.compile(
    loss="binary_crossentropy",  # suited to binary classifiers that output a probability
    optimizer="rmsprop",  # a variant of SGD
    metrics=["accuracy"],
)

# Hold out the first 10,000 samples for validation and train on the rest.
x_val, x_train_partial = x_train[:10000], x_train[10000:]
y_val, y_train_partial = train_labels[:10000], train_labels[10000:]

# Number of weight updates is roughly (15000/512) per epoch, times 20 epochs.
history = network.fit(
    x_train_partial,
    y_train_partial,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

history.history

gc.collect()

In [None]:
# Turn the per-epoch training history into a DataFrame and plot training vs. validation loss.
history_df = pd.DataFrame(history.history)
history_df[["loss","val_loss"]].plot()

In [None]:
# Plot training vs. validation accuracy per epoch.
history_df[["accuracy","val_accuracy"]].plot()

In [None]:
gc.collect()
network = models.Sequential([
    # ReLU (rectified linear) activations make the representation space non-linear.
    layers.Dense(16, activation="relu", input_shape=(10000,)),
    layers.Dense(16, activation="relu"),
    # Sigmoid squashes the output into 0~1 so it can be read as the class probability.
    layers.Dense(1, activation="sigmoid"),
])

network.compile(
    loss="binary_crossentropy",  # suited to binary classifiers that output a probability
    optimizer="rmsprop",  # a variant of SGD
    metrics=["accuracy"],
)

# Number of weight updates is roughly (25000/512) per epoch, times 4 epochs.
network.fit(x_train, train_labels, epochs=4, batch_size=512)

result = network.evaluate(x_test, test_labels)

result

In [None]:
gc.collect()
# Variant with a single hidden layer.
network = models.Sequential([
    # ReLU (rectified linear) activation makes the representation space non-linear.
    layers.Dense(16, activation="relu", input_shape=(10000,)),
    # Sigmoid squashes the output into 0~1 so it can be read as the class probability.
    layers.Dense(1, activation="sigmoid"),
])

network.compile(
    loss="binary_crossentropy",  # suited to binary classifiers that output a probability
    optimizer="rmsprop",  # a variant of SGD
    metrics=["accuracy"],
)

# Number of weight updates is roughly (25000/512) per epoch, times 4 epochs.
network.fit(x_train, train_labels, epochs=4, batch_size=512)

result = network.evaluate(x_test, test_labels)

result

In [None]:
gc.collect()
# Variant trained with a smaller batch size (128).
network = models.Sequential([
    # ReLU (rectified linear) activations make the representation space non-linear.
    layers.Dense(16, activation="relu", input_shape=(10000,)),
    layers.Dense(16, activation="relu"),
    # Sigmoid squashes the output into 0~1 so it can be read as the class probability.
    layers.Dense(1, activation="sigmoid"),
])

network.compile(
    loss="binary_crossentropy",  # suited to binary classifiers that output a probability
    optimizer="rmsprop",  # a variant of SGD
    metrics=["accuracy"],
)

# Number of weight updates is roughly (25000/128) per epoch, times 4 epochs.
network.fit(x_train, train_labels, epochs=4, batch_size=128)

result = network.evaluate(x_test, test_labels)

result

In [None]:
gc.collect()
# Variant with three hidden layers.
network = models.Sequential([
    # ReLU (rectified linear) activations make the representation space non-linear.
    layers.Dense(16, activation="relu", input_shape=(10000,)),
    layers.Dense(16, activation="relu"),
    layers.Dense(16, activation="relu"),
    # Sigmoid squashes the output into 0~1 so it can be read as the class probability.
    layers.Dense(1, activation="sigmoid"),
])

network.compile(
    loss="binary_crossentropy",  # suited to binary classifiers that output a probability
    optimizer="rmsprop",  # a variant of SGD
    metrics=["accuracy"],
)

# Number of weight updates is roughly (25000/512) per epoch, times 4 epochs.
network.fit(x_train, train_labels, epochs=4, batch_size=512)

result = network.evaluate(x_test, test_labels)

result

In [None]:
gc.collect()
# Variant that swaps the loss function for mean squared error.
network = models.Sequential([
    # ReLU (rectified linear) activations make the representation space non-linear.
    layers.Dense(16, activation="relu", input_shape=(10000,)),
    layers.Dense(16, activation="relu"),
    # Sigmoid squashes the output into 0~1 so it can be read as the class probability.
    layers.Dense(1, activation="sigmoid"),
])

network.compile(
    loss="mse",  # mean squared error, used here in place of binary cross-entropy
    optimizer="rmsprop",  # a variant of SGD
    metrics=["accuracy"],
)

# Number of weight updates is roughly (25000/512) per epoch, times 4 epochs.
network.fit(x_train, train_labels, epochs=4, batch_size=512)

result = network.evaluate(x_test, test_labels)

result

In [None]:
gc.collect()
# Variant that uses tanh instead of relu in the hidden layers.
network = models.Sequential([
    layers.Dense(16, activation="tanh", input_shape=(10000,)),
    layers.Dense(16, activation="tanh"),
    # Sigmoid squashes the output into 0~1 so it can be read as the class probability.
    layers.Dense(1, activation="sigmoid"),
])

network.compile(
    loss="binary_crossentropy",  # suited to binary classifiers that output a probability
    optimizer="rmsprop",  # a variant of SGD
    metrics=["accuracy"],
)

# Number of weight updates is roughly (25000/512) per epoch, times 4 epochs.
network.fit(x_train, train_labels, epochs=4, batch_size=512)

result = network.evaluate(x_test, test_labels)

result

In [None]:
gc.collect()
# Variant with wider hidden layers (32 and 64 units).
network = models.Sequential([
    # ReLU (rectified linear) activations make the representation space non-linear.
    layers.Dense(32, activation="relu", input_shape=(10000,)),
    layers.Dense(64, activation="relu"),
    # Sigmoid squashes the output into 0~1 so it can be read as the class probability.
    layers.Dense(1, activation="sigmoid"),
])

network.compile(
    loss="binary_crossentropy",  # suited to binary classifiers that output a probability
    optimizer="rmsprop",  # a variant of SGD
    metrics=["accuracy"],
)

# Number of weight updates is roughly (25000/512) per epoch, times 4 epochs.
network.fit(x_train, train_labels, epochs=4, batch_size=512)

result = network.evaluate(x_test, test_labels)

result

## 新闻多分类

单标签、多分类问题；

In [None]:
# Load the Reuters newswire dataset with num_words=10000.
(train_data,train_labels),(test_data,test_labels) = reuters.load_data(num_words=10000)

# Report the shape of each of the four arrays.
print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
print(test_labels.shape)

# Decode the first training and test newswires back to text with the Reuters word index.
dict_ = reuters.get_word_index()
print(review(train_data[0],dict_))
print(review(test_data[0],dict_))

In [None]:
# Multi-hot encode the newswires.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

In [None]:
# Functionally equivalent to to_categorical
def to_one_hot(seqs,dimension=46):
    """One-hot encode a collection of class labels.

    Args:
        seqs: sized iterable of label indices.
        dimension: number of classes, i.e. the width of every output row.

    Returns:
        A float array of shape ``(len(seqs), dimension)`` where row ``i`` has
        a 1 in column ``seqs[i]`` and 0 elsewhere.
    """
    encoded = np.zeros(shape=(len(seqs), dimension))
    for row_idx, label in enumerate(seqs):
        encoded[row_idx, label] = 1
    return encoded

# One-hot encode the topic labels.
y_train = to_one_hot(train_labels)
y_test = to_one_hot(test_labels)

# Show the first encoded label of each set.
print(y_train[0])
print(y_test[0])

In [None]:
gc.collect()
network = models.Sequential([
    # 64 hidden units: a richer representation space for telling 46 classes apart.
    layers.Dense(64, activation="relu", input_shape=(10000,)),
    layers.Dense(64, activation="relu"),
    # Softmax for multi-class output: 46 class probabilities that sum to 1.
    layers.Dense(46, activation="softmax"),
])

network.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Hold out the first 1,000 samples for validation and train on the rest.
x_val, x_train_part = x_train[:1000], x_train[1000:]
y_val, y_train_part = y_train[:1000], y_train[1000:]

history = network.fit(
    x_train_part,
    y_train_part,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

In [None]:
# Turn the per-epoch training history into a DataFrame.
history_df = pd.DataFrame(history.history)

# Plot training vs. validation loss per epoch.
history_df[["loss","val_loss"]].plot()

In [None]:
# Plot training vs. validation accuracy per epoch.
history_df[["accuracy","val_accuracy"]].plot()

In [None]:
gc.collect()
network = models.Sequential([
    # 64 hidden units: a richer representation space for telling 46 classes apart.
    layers.Dense(64, activation="relu", input_shape=(10000,)),
    layers.Dense(64, activation="relu"),
    # Softmax for multi-class output: 46 class probabilities that sum to 1.
    layers.Dense(46, activation="softmax"),
])

network.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the whole training set for 7 epochs, then score on the test set.
network.fit(x_train, y_train, epochs=7, batch_size=512)

result = network.evaluate(x_test, y_test)

print(result)

In [None]:
gc.collect()
# Variant with a 4-unit second hidden layer, far narrower than the 46 outputs.
network = models.Sequential([
    layers.Dense(64, activation="relu", input_shape=(10000,)),
    layers.Dense(4, activation="relu"),
    # Softmax for multi-class output: 46 class probabilities that sum to 1.
    layers.Dense(46, activation="softmax"),
])

network.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Hold out the first 1,000 samples for validation and train on the rest.
x_val, x_train_part = x_train[:1000], x_train[1000:]
y_val, y_train_part = y_train[:1000], y_train[1000:]

history = network.fit(
    x_train_part,
    y_train_part,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

In [None]:
gc.collect()
# Variant with a wider (128-unit) second hidden layer.
network = models.Sequential([
    layers.Dense(64, activation="relu", input_shape=(10000,)),
    layers.Dense(128, activation="relu"),
    # Softmax for multi-class output: 46 class probabilities that sum to 1.
    layers.Dense(46, activation="softmax"),
])

network.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Hold out the first 1,000 samples for validation and train on the rest.
x_val, x_train_part = x_train[:1000], x_train[1000:]
y_val, y_train_part = y_train[:1000], y_train[1000:]

history = network.fit(
    x_train_part,
    y_train_part,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

In [None]:
gc.collect()
# Variant with four 64-unit hidden layers.
network = models.Sequential([
    layers.Dense(64, activation="relu", input_shape=(10000,)),
    layers.Dense(64, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(64, activation="relu"),
    # Softmax for multi-class output: 46 class probabilities that sum to 1.
    layers.Dense(46, activation="softmax"),
])

network.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Hold out the first 1,000 samples for validation and train on the rest.
x_val, x_train_part = x_train[:1000], x_train[1000:]
y_val, y_train_part = y_train[:1000], y_train[1000:]

history = network.fit(
    x_train_part,
    y_train_part,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

In [None]:
gc.collect()
# Variant with four narrow (16-unit) hidden layers, trained for 50 epochs.
network = models.Sequential([
    layers.Dense(16, activation="relu", input_shape=(10000,)),
    layers.Dense(16, activation="relu"),
    layers.Dense(16, activation="relu"),
    layers.Dense(16, activation="relu"),
    # Softmax for multi-class output: 46 class probabilities that sum to 1.
    layers.Dense(46, activation="softmax"),
])

network.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Hold out the first 1,000 samples for validation and train on the rest.
x_val, x_train_part = x_train[:1000], x_train[1000:]
y_val, y_train_part = y_train[:1000], y_train[1000:]

history = network.fit(
    x_train_part,
    y_train_part,
    epochs=50,
    batch_size=512,
    validation_data=(x_val, y_val),
)

## 波士顿房价回归预测

In [None]:
# Load the Boston housing data; the second element of each pair holds the regression targets.
(train_data,train_targets),(test_data,test_targets) = boston_housing.load_data()

# Inspect shapes and the first sample of each split.
print(train_data.shape)
print(train_data[0])
print(train_targets.shape)
print(test_data.shape)
print(test_data[0])
print(test_targets.shape)

In [None]:
# Per-feature standardisation. The statistics come from the training data only.
mean_ = train_data.mean(axis=0)
std_ = train_data.std(axis=0)
# In-place updates: the arrays are modified directly, so running this cell twice normalises twice.
train_data -= mean_
train_data /= std_
# The test set is scaled with the training mean/std, never with its own statistics.
test_data -= mean_
test_data /= std_

In [None]:
def build_model():
    """Build and compile a fresh regression network.

    The input width is read from the module-level ``train_data`` array.
    Two 64-unit ReLU layers feed a single linear output unit; the model is
    compiled with rmsprop, MSE loss and MAE as the monitored metric.

    Returns:
        The compiled, untrained model.
    """
    n_features = train_data.shape[1]
    regressor = models.Sequential([
        layers.Dense(64, activation="relu", input_shape=(n_features,)),
        layers.Dense(64, activation="relu"),
        # No activation: a plain linear unit for scalar regression.
        layers.Dense(1),
    ])
    regressor.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
    return regressor

In [None]:
gc.collect()
# K-fold cross-validation: each fold in turn is the validation set, the rest is training data.
k = 5
fold_size = train_data.shape[0] // k
all_scores = []
for i in range(k):
    start, stop = i * fold_size, (i + 1) * fold_size

    # Validation data: the i-th fold.
    x_val = train_data[start:stop]
    y_val = train_targets[start:stop]

    # Training data: everything before and after that fold.
    x_train = np.concatenate([train_data[:start], train_data[stop:]], axis=0)
    y_train = np.concatenate([train_targets[:start], train_targets[stop:]], axis=0)

    # A fresh, untrained model for every fold.
    model = build_model()
    model.fit(x_train, y_train, epochs=100, batch_size=1, verbose=0)
    val_mse, val_mae = model.evaluate(x_val, y_val, verbose=0)
    all_scores.append(val_mae)

print(np.mean(all_scores), all_scores)

In [None]:
gc.collect()
# Same K-fold loop, but keeping the per-epoch validation MAE of every fold.
k = 5
fold_size = train_data.shape[0] // k
all_scores = []
for i in range(k):
    start, stop = i * fold_size, (i + 1) * fold_size

    # Validation data: the i-th fold.
    x_val = train_data[start:stop]
    y_val = train_targets[start:stop]

    # Training data: everything before and after that fold.
    x_train = np.concatenate([train_data[:start], train_data[stop:]], axis=0)
    y_train = np.concatenate([train_targets[:start], train_targets[stop:]], axis=0)

    model = build_model()
    history = model.fit(
        x_train,
        y_train,
        epochs=500,
        batch_size=1,
        verbose=0,
        validation_data=(x_val, y_val),
    )
    all_scores.append(history.history["val_mae"])

# Average the folds epoch by epoch: one mean validation MAE per epoch (500 in total).
all_scores = [np.mean(epoch_scores) for epoch_scores in zip(*all_scores)]

In [None]:
# Plot the fold-averaged validation MAE for every epoch.
pd.DataFrame({"VAL-MAE":all_scores}).plot()

In [None]:
# Zoom in on epochs 10-99, skipping the first ten.
pd.DataFrame({"VAL-MAE":all_scores[10:100]}).plot()

In [None]:
# Smooth the curve: blend each score (weight 0.9) with its successor (weight 0.1), from epoch 10 on.
pd.DataFrame({"VAL-MAE":np.array(all_scores[10:-1])*.9+np.array(all_scores[11:])*.1}).plot()

In [None]:
gc.collect()
# Train the final model from scratch on the full training set. Reusing `model`,
# `x_train` and `y_train` from the cross-validation loop would keep training the
# last fold's already-fitted network on only part of the data.
model = build_model()
model.fit(train_data,train_targets,epochs=65,batch_size=1,verbose=0)
# Evaluate on the untouched test set; result is [mse, mae].
result = model.evaluate(test_data,test_targets)
print(result)

In [None]:
gc.collect()
# Same final training run with a batch size of 16. A fresh model is built and fitted
# on the full training set, so this result does not depend on any earlier training of
# `model` or on the fold-specific `x_train`/`y_train` left over from cross-validation.
model = build_model()
model.fit(train_data,train_targets,epochs=65,batch_size=16,verbose=0)
# Evaluate on the untouched test set; result is [mse, mae].
result = model.evaluate(test_data,test_targets)
print(result)

## 评估模型

1. 数据代表性：不平衡类别的按比例划分训练集和验证集；
2. 时间箭头：时序数据按时间前后划分；
3. 数据冗余：避免训练集与验证集中出现重复数据，这个结果类似数据泄露；

## 神经网络的数据预处理

1. 向量化
2. 标准化
3. 缺失、异常处理

## 神经网络的特征工程

1. 提高模型运行速度；
2. 减少模型依赖的数据量；

因此对于DL，特征工程依然是有用的；

## 过欠拟合

模型对于数据中的表示的学习是不足还是过多；

正则化：限制模型存储的信息量，或对其加以约束；

手段：
1. 减少网络大小，也就减少了网络中的参数个数，也就减少了存储的信息量，限制了表示空间；
2. 添加权重正则化；
3. 增加dropout正则化：训练时，每层输出中有一定比例的特征被随机丢弃（即置为0），丢弃比例一般在0.2到0.5之间；测试时不进行丢弃，此时被激活的单元比训练时多，因此需要将该层输出按dropout比例缩小以保持平衡（PS：也可以把这两步都放在训练阶段完成，即训练时在丢弃后按比例放大输出，这样测试时就无需再做缩小处理）；

Dropout思想：在层的输出中人工引入噪声，以打破那些不显著的、偶然发现的表示/模式，以此实现降低过拟合，重要的表示理应是有更强的鲁棒性的；

## DL用于计算机视觉

针对不同的业务场景、不同类型（不同阶的张量）的数据，需要使用不同的网络拓扑架构和神经层来构建模型，例如普遍用于CV领域的卷积神经网络；

In [None]:
gc.collect()
model = models.Sequential([
    # Output shape is (26, 26, 32): 26x26 follows from the (3, 3) window size,
    # the 32 channels from the first argument.
    layers.Conv2D(32, (3, 3), activation="relu", input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation="relu"),
    # The three layers below produce the multi-class probability output; they are
    # not part of the core convolutional topology.
    layers.Flatten(),
    layers.Dense(64, activation="relu"),
    layers.Dense(10, activation="softmax"),
])
model.summary()

In [None]:
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Add a trailing channel axis and rescale the pixel values from 0~255 to 0~1.
train_x = train_images.reshape((60000, 28, 28, 1)).astype("float32") / 255
test_x = test_images.reshape((10000, 28, 28, 1)).astype("float32") / 255

# One-hot encode the labels.
train_y, test_y = to_categorical(train_labels), to_categorical(test_labels)

model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

model.fit(train_x, train_y, epochs=5, batch_size=128)

test_loss, test_acc = model.evaluate(test_x, test_y)

print("test loss:", test_loss)
print("test accuracy:", test_acc)

In [None]:
gc.collect()
# The same convnet with the max-pooling layers removed.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation="relu", input_shape=(28, 28, 1)),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.Flatten(),
    layers.Dense(64, activation="relu"),
    layers.Dense(10, activation="softmax"),
])

model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

model.fit(train_x, train_y, epochs=5, batch_size=128)

test_loss, test_acc = model.evaluate(test_x, test_y)

print("test loss:", test_loss)
print("test accuracy:", test_acc)

## 卷积神经网络 - 猫狗分类

- 小型卷积神经网络：0.74；
- 数据增强+dropout：0.8；
- 预训练网络提取特征：0.9；
- 微调预训练网络：0.9；

In [None]:
import os
import shutil

src_dir = "../input/dogs-vs-cats/train/train"
base_dir = "../working"

# Create the directory tree in pure Python. exist_ok=True makes the cell safe to
# re-run (a plain `mkdir` fails once the directories exist).
for split in ("train", "validation", "test"):
    for label in ("dogs", "cats"):
        os.makedirs(os.path.join(base_dir, split, label), exist_ok=True)
os.makedirs(os.path.join(base_dir, "models"), exist_ok=True)

# Collect up to 2000 jpg file names per class. The listing is sorted because
# os.listdir returns entries in arbitrary order, which would make the split
# differ from run to run.
cats = []
dogs = []
for f in sorted(os.listdir(src_dir)):
    if f.startswith("cat") and f.endswith("jpg") and len(cats)<2000:
        cats.append(f)
    elif f.startswith("dog") and f.endswith("jpg") and len(dogs)<2000:
        dogs.append(f)

# Per class: first 1000 files -> train, next 500 -> validation, the rest -> test.
splits = (
    ("train", slice(None, 1000)),
    ("validation", slice(1000, 1500)),
    ("test", slice(1500, None)),
)
for label, names in (("cats", cats), ("dogs", dogs)):
    for split, part in splits:
        target_dir = os.path.join(base_dir, split, label)
        for name in names[part]:
            # shutil.copy avoids building a shell command from file names.
            shutil.copy(os.path.join(src_dir, name), target_dir)

train_dir = "../working/train"
test_dir = "../working/test"
validation_dir = "../working/validation"
# Each of these counts the class sub-directories of a split, not the images in it.
print(len(os.listdir(train_dir)))
print(len(os.listdir(validation_dir)))
print(len(os.listdir(test_dir)))

In [None]:
gc.collect()
# Small convnet for 150x150 RGB inputs: four conv/pool stages, then a dense
# classifier ending in a single sigmoid unit.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation="relu", input_shape=(150, 150, 3)),
    layers.MaxPool2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.MaxPool2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation="relu"),
    layers.MaxPool2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation="relu"),
    layers.MaxPool2D((2, 2)),
    layers.Flatten(),
    layers.Dense(512, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])
model.summary()

In [None]:
# Binary classification setup: rmsprop optimizer, binary cross-entropy loss, accuracy tracked as "acc".
model.compile(optimizer="rmsprop",loss="binary_crossentropy",metrics=["acc"])

In [None]:
gc.collect()
# Both generators only rescale the pixel values by 1/255.
train_generator = ImageDataGenerator(rescale=1./255)
test_generator = ImageDataGenerator(rescale=1./255)

# Settings shared by both directory iterators: 150x150 images, batches of 20, binary labels.
flow_kwargs = dict(target_size=(150, 150), batch_size=20, class_mode="binary")
train_iter = train_generator.flow_from_directory(train_dir, **flow_kwargs)
validation_iter = test_generator.flow_from_directory(validation_dir, **flow_kwargs)

history = model.fit_generator(
    train_iter,
    steps_per_epoch=100,  # batches drawn per epoch: (1000+1000)/20
    epochs=30,
    validation_data=validation_iter,
    validation_steps=50,  # same idea for validation: (500+500)/20
)

model.save("../working/models/cats_dogs_1.h5")

In [None]:
# Keep the first experiment's history in its own DataFrame and plot training vs. validation loss.
history_1 = pd.DataFrame(history.history)
history_1[["loss","val_loss"]].plot()

In [None]:
# Plot training vs. validation accuracy of the first experiment.
history_1[["acc","val_acc"]].plot()

In [None]:
gc.collect()
# Generator configured with random rotations, shifts, shear, zoom and horizontal flips.
data_gen = ImageDataGenerator(rotation_range=40,width_shift_range=.2,height_shift_range=.2,
                              shear_range=.2,zoom_range=.2,horizontal_flip=True,fill_mode="nearest")
img = image.load_img("../working/train/dogs/"+os.listdir("../working/train/dogs")[0], target_size=(150,150))
img_arr = image.img_to_array(img)
img_arr = img_arr.reshape((1,)+img_arr.shape) # reshape to (1, 150, 150, 3)

# Show the untouched original on its own figure. Passing the PIL image avoids the
# float 0-255 array (imshow clips float RGB data to 0-1), and a separate figure
# number keeps the first augmented image from being drawn over it.
plt.figure(0)
plt.imshow(img)

# flow() yields batches forever, so stop after three augmented samples.
i=0
for batch in data_gen.flow(img_arr,batch_size=1):
    plt.figure(i+1)
    plt.imshow(image.array_to_img(batch[0]))
    i+=1
    if i%3==0:
        break
plt.show()

In [None]:
gc.collect()
# Training images are augmented; validation images are only rescaled.
train_generator = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=.2,
    height_shift_range=.2,
    shear_range=.2,
    zoom_range=.2,
    horizontal_flip=True,
)
test_generator = ImageDataGenerator(rescale=1./255)

# Settings shared by both directory iterators: 150x150 images, batches of 20, binary labels.
flow_kwargs = dict(target_size=(150, 150), batch_size=20, class_mode="binary")
train_iter = train_generator.flow_from_directory(train_dir, **flow_kwargs)
validation_iter = test_generator.flow_from_directory(validation_dir, **flow_kwargs)

model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation="relu", input_shape=(150, 150, 3)),
    layers.MaxPool2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.MaxPool2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation="relu"),
    layers.MaxPool2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation="relu"),
    layers.MaxPool2D((2, 2)),
    layers.Flatten(),
    layers.Dropout(.5),  # dropout rate; added to fight overfitting
    layers.Dense(512, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])

model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["acc"])

history = model.fit_generator(
    train_iter,
    steps_per_epoch=100,  # batches drawn per epoch: (1000+1000)/20
    epochs=30,
    validation_data=validation_iter,
    validation_steps=50,  # same idea for validation: (500+500)/20
)

model.save("../working/models/cats_dogs_2.h5")

In [None]:
# Keep the second experiment's history in its own DataFrame and plot training vs. validation loss.
history_2 = pd.DataFrame(history.history)
history_2[["loss","val_loss"]].plot()

In [None]:
# Plot training vs. validation accuracy of the second experiment.
history_2[["acc","val_acc"]].plot()

In [None]:
# VGG16 with ImageNet weights, built without its top classifier (include_top=False) for 150x150 RGB inputs.
conv_base = VGG16(weights="imagenet",include_top=False,input_shape=(150,150,3))
conv_base.summary()

In [None]:
gc.collect()
# Use VGG16 purely as a feature extractor, i.e. without data augmentation.
data_gen = ImageDataGenerator(rescale=1./255)
# Batch size used by extract_feature when reading images from disk.
batch_size = 20
def extract_feature(path,sample_count):
    """Run the images under ``path`` through ``conv_base`` and collect its outputs.

    Relies on the module-level ``data_gen``, ``batch_size`` and ``conv_base``.

    Args:
        path: directory with one sub-directory per class.
        sample_count: number of samples to extract.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(sample_count, 4, 4, 512)`` and ``labels`` has shape ``(sample_count,)``.
    """
    features = np.zeros(shape=(sample_count,4,4,512))
    labels = np.zeros(shape=(sample_count))
    generator = data_gen.flow_from_directory(
        path,target_size=(150,150),batch_size=batch_size,class_mode="binary"
    )
    filled = 0
    for input_batch,label_batch in generator:
        # The generator loops forever, and a batch may be shorter than batch_size
        # when the image count is not a multiple of it. Copy only as many rows as
        # still fit, so the slice assignment can neither mismatch nor overshoot.
        take = min(len(input_batch),sample_count-filled)
        features_batch = conv_base.predict(input_batch)
        features[filled:filled+take]=features_batch[:take]
        labels[filled:filled+take]=label_batch[:take]
        filled += take
        if filled >= sample_count:
            break
    return features,labels

# Extract convolutional-base features for each split.
train_features, train_labels = extract_feature(train_dir, 2000)
test_features, test_labels = extract_feature(test_dir, 1000)
validation_features, validation_labels = extract_feature(validation_dir, 1000)

# Flatten by hand: (n, 4, 4, 512) -> (n, 8192).
flat_dim = 4 * 4 * 512
train_features = np.reshape(train_features, (2000, flat_dim))
test_features = np.reshape(test_features, (1000, flat_dim))
validation_features = np.reshape(validation_features, (1000, flat_dim))

# The topology shrinks from a full convnet to a plain stack of dense layers:
# the convolutional base has already taken the place of the conv layers.
model = models.Sequential([
    layers.Dense(256, activation="relu", input_shape=(flat_dim,)),
    layers.Dropout(.5),
    layers.Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer=optimizers.RMSprop(lr=2e-5),
    loss="binary_crossentropy",
    metrics=["acc"],
)

history = model.fit(
    train_features,
    train_labels,
    epochs=30,
    batch_size=20,
    validation_data=(validation_features, validation_labels),
)

In [None]:
# Keep the third experiment's history in its own DataFrame and plot training vs. validation loss.
history_3 = pd.DataFrame(history.history)
history_3[["loss","val_loss"]].plot()

In [None]:
# Plot training vs. validation accuracy of the third experiment.
history_3[["acc","val_acc"]].plot()

In [None]:
gc.collect()
# Add the convolutional base to the topology as a layer, and apply data augmentation.
train_generator = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=.1,
    height_shift_range=.1,
    shear_range=.2,
    zoom_range=.2,
    horizontal_flip=True,
)
test_generator = ImageDataGenerator(rescale=1./255)

# Settings shared by both directory iterators: 150x150 images, batches of 20, binary labels.
flow_kwargs = dict(target_size=(150, 150), batch_size=20, class_mode="binary")
train_iter = train_generator.flow_from_directory(train_dir, **flow_kwargs)
validation_iter = test_generator.flow_from_directory(validation_dir, **flow_kwargs)

conv_base = VGG16(weights="imagenet", include_top=False, input_shape=(150, 150, 3))
# Mark the whole base as non-trainable before it is added to the model.
conv_base.trainable = False
model = models.Sequential([
    conv_base,
    layers.Flatten(),
    layers.Dense(256, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])
model.summary()

In [None]:
gc.collect()
model.compile(
    optimizer=optimizers.RMSprop(lr=2e-5),
    loss="binary_crossentropy",
    metrics=["acc"],
)

history = model.fit_generator(
    train_iter,
    steps_per_epoch=100,  # batches drawn per epoch: (1000+1000)/20
    epochs=30,
    validation_data=validation_iter,
    validation_steps=50,  # same idea for validation: (500+500)/20
)

In [None]:
# Keep the fourth experiment's history in its own DataFrame and plot training vs. validation loss.
history_4 = pd.DataFrame(history.history)
history_4[["loss","val_loss"]].plot()

In [None]:
# Plot training vs. validation accuracy of the fourth experiment.
history_4[["acc","val_acc"]].plot()

In [None]:
gc.collect()
# Fine tuning: unfreeze the convolutional base from block5_conv1 onwards.
# The base was frozen as a whole when the model was built; it has to be marked
# trainable again, otherwise the per-layer flags below have no effect.
conv_base.trainable = True
refreeze = False
for layer in conv_base.layers:
    # Compare the layer's *name*: a layer object never equals a string, so
    # comparing the layer itself would leave every layer frozen.
    if layer.name == "block5_conv1":
        refreeze = True
    # Layers before block5_conv1 stay frozen; it and every later layer are trainable.
    layer.trainable = refreeze

# Use a smaller learning rate so that updates to the pretrained representations stay small.
# Recompiling is what makes the changed trainable flags take effect.
model.compile(optimizer=optimizers.RMSprop(lr=3e-6),loss="binary_crossentropy",metrics=["acc"])

# With part of the base unfrozen, train it jointly with the already-trained classifier.
history = model.fit_generator(
    train_iter,steps_per_epoch=100, # batches drawn per epoch: (1000+1000)/20
    epochs=50,
    validation_data=validation_iter,validation_steps=50 # same idea for validation: (500+500)/20
)

In [None]:
# Keep the fine-tuning history in its own DataFrame and plot training vs. validation loss.
history_5 = pd.DataFrame(history.history)
history_5[["loss","val_loss"]].plot()

In [None]:
# Plot training vs. validation accuracy of the fine-tuning run.
history_5[["acc","val_acc"]].plot()

## 深度神经网络可视化

对于视觉世界任务，相关可视化方法的效果甚至要好于人工特征工程的可视化效果，这与视觉世界的平移不变性、空间层次结构等特性和卷积神经网络提取表示特征的方式相符合有很大关系；同样的结构用于其他任务时，可视化效果可能就没这么理想；

- 卷积核输出结果可视化（深度为N个过滤器）；


In [None]:
# Reload the model saved as cats_dogs_2.h5 and print its layer summary.
model = load_model("../working/models/cats_dogs_2.h5")
model.summary()

In [None]:
gc.collect()
# Pick a random dog picture from the test split and turn it into a (1, 150, 150, 3) batch scaled to 0~1.
img_path = "../working/test/dogs/"+random.choice(os.listdir("../working/test/dogs"))
img = image.load_img(img_path,target_size=(150,150))
img_tensor = image.img_to_array(img)
print(img_tensor.shape)
img_tensor = np.expand_dims(img_tensor,axis=0)
img_tensor /= 255.
print(img_tensor.shape)

# Intermediate outputs: only the first 8 layers, i.e. the conv and max-pooling layers.
layer_outputs = [layer.output for layer in model.layers[:8]]
activation_model = models.Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(img_tensor)
for activation in activations:
    print(activation.shape) # shape of each intermediate output

# Input image across the top row. Below it, one column per conv/pool pair showing a
# randomly chosen channel: conv output (normalised, raw), then pool output (normalised, raw).
plt.figure(figsize=(15,20))
plt.subplot(5,1,1)
plt.imshow(img_tensor[0])

for pair in range(4):
    conv_act = activations[2*pair]
    pool_act = activations[2*pair+1]
    # One random channel per pair, shared by the conv layer and the pooling layer after it.
    idx = random.choice(list(range(conv_act.shape[3])))
    for row_offset, act in ((0, conv_act), (8, pool_act)):
        vec = act[0,:,:,idx]
        # Normalised view: standardise, then map to 0~255 around 128.
        plt.subplot(5,4,5+pair+row_offset)
        plt.imshow(np.clip(((vec-vec.mean())/vec.std())*64+128,0,255).astype("uint8"),cmap="viridis")
        # Raw activation values.
        plt.subplot(5,4,9+pair+row_offset)
        plt.imshow(vec,cmap="viridis")

In [None]:
gc.collect()
# Draw every channel of every captured activation as one tiled image per layer.
layer_names = [layer.name for layer in model.layers]
images_per_row = 16
# zip() stops at the shorter input, so only layers with an entry in `activations` are drawn.
for layer_name, layer_activation in zip(layer_names,activations):
    n_features = layer_activation.shape[-1]  # number of channels in this layer's output
    size = layer_activation.shape[1]  # side length of one feature map
    n_cols = n_features // images_per_row
    display_grid = np.zeros((size*n_cols,images_per_row*size))
    for col in range(n_cols):
        for row in range(images_per_row):
            channel_image = layer_activation[0,:,:,col*images_per_row+row]
            channel_std = channel_image.std()
            if channel_std == 0:
                # A constant channel would give 0/0 = NaN when standardised.
                # Leave its tile at the zero it was initialised with.
                continue
            # Standardise, then map to 0~255 around 128 for display.
            channel_image = ((channel_image - channel_image.mean())/channel_std)*64+128
            channel_image = np.clip(channel_image,0,255).astype("uint8")
            display_grid[col*size:(col+1)*size,row*size:(row+1)*size] = channel_image
    scale = 1./size
    plt.figure(figsize=(scale*display_grid.shape[1],scale*display_grid.shape[0]))
    plt.title(layer_name)
    plt.grid(False)
    plt.imshow(display_grid,aspect="auto",cmap="viridis")

In [None]:
gc.collect()
# Pick a random cat picture from the test split and turn it into a (1, 150, 150, 3) batch scaled to 0~1.
img_path = "../working/test/cats/"+random.choice(os.listdir("../working/test/cats"))
img = image.load_img(img_path,target_size=(150,150))
img_tensor = image.img_to_array(img)
print(img_tensor.shape)
img_tensor = np.expand_dims(img_tensor,axis=0)
img_tensor /= 255.
print(img_tensor.shape)

# Intermediate outputs: only the first 8 layers, i.e. the conv and max-pooling layers.
layer_outputs = [layer.output for layer in model.layers[:8]]
activation_model = models.Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(img_tensor)
for activation in activations:
    print(activation.shape) # shape of each intermediate output

# Input image across the top row. Below it, one column per conv/pool pair showing a
# randomly chosen channel: conv output (normalised, raw), then pool output (normalised, raw).
plt.figure(figsize=(15,20))
plt.subplot(5,1,1)
plt.imshow(img_tensor[0])

for pair in range(4):
    conv_act = activations[2*pair]
    pool_act = activations[2*pair+1]
    # One random channel per pair, shared by the conv layer and the pooling layer after it.
    idx = random.choice(list(range(conv_act.shape[3])))
    for row_offset, act in ((0, conv_act), (8, pool_act)):
        vec = act[0,:,:,idx]
        # Normalised view: standardise, then map to 0~255 around 128.
        plt.subplot(5,4,5+pair+row_offset)
        plt.imshow(np.clip(((vec-vec.mean())/vec.std())*64+128,0,255).astype("uint8"),cmap="viridis")
        # Raw activation values.
        plt.subplot(5,4,9+pair+row_offset)
        plt.imshow(vec,cmap="viridis")

In [None]:
gc.collect()
# Draw, for every tapped layer, a mosaic containing all of its channels.
# zip() stops at the shorter sequence, so only layers that have an entry in
# `activations` are drawn.
layer_names = [layer.name for layer in model.layers]
images_per_row = 16
for layer_name, layer_activation in zip(layer_names,activations):
    n_features = layer_activation.shape[-1]  # number of channels in this activation
    size = layer_activation.shape[1]  # side length of one channel tile (axis 1 of the activation)
    n_cols = n_features // images_per_row  # number of tile rows in the mosaic
    display_grid = np.zeros((size*n_cols,images_per_row*size))
    for col in range(n_cols):
        for row in range(images_per_row):
            channel_image = layer_activation[0,:,:,col*images_per_row+row]
            # Contrast-normalize for display: center on the mean, map one std
            # to 64 grey levels around 128. Constant channels are left as they
            # are, since dividing by a zero std would produce NaNs.
            channel_std = channel_image.std()
            if channel_std > 0:
                channel_image = ((channel_image - channel_image.mean())/channel_std)*64+128
            channel_image = np.clip(channel_image,0,255).astype("uint8")
            display_grid[col*size:(col+1)*size,row*size:(row+1)*size] = channel_image
    # Size the figure so that every tile takes one inch.
    scale = 1./size
    plt.figure(figsize=(scale*display_grid.shape[1],scale*display_grid.shape[0]))
    plt.title(layer_name)
    plt.grid(False)
    plt.imshow(display_grid,aspect="auto",cmap="viridis")

In [None]:
# cat_output = model.output[:,0]
# last_conv_layer = model.get_layer("conv2d_3")
# grads = K.gradients(cat_output,last_conv_layer.output)[0]
# pooled_grads = K.mean(grads,axis=(0,1,2))
# iterate = K.function([model.input],[pooled_grads,last_conv_layer.output[0]])
# pooled_grads_value,conv_layer_output_value = iterate(img_tensor)
# for i in range(512):
#     conv_layer_output_value[:,:,i]*=pooled_grads_value[i]
# heatmap = np.mean(conv_layer_output,axis=-1)
# heatmap = np.maximum(heatmap,0)
# heatmap /= np.max(heatmap)
# plt.matshow(heatmap)

## DL应用于序列问题

- 文本序列；
- 时间序列；
- 其他序列；

序列问题特点：序列元素前后关系、上下文、历史依赖性；

对于文本序列组成单元：
- 字；
- 词；
- n-grams，词组对；
以上这些单元统称为token；分词（tokenization）更一般地是指将文本转换为token集合的过程，只不过通常指的是切分为**词**的集合；

文本数据向量化：
- one-hot；
- token嵌入（一般指词嵌入）；

In [None]:
gc.collect()
# Keep only the 10000 most frequent words of the IMDB vocabulary.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)
# Bring every review to a fixed length of 100 tokens.
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=100)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=100)
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

In [None]:
# Learn a word-embedding table with an Embedding layer.
# Feeding the flattened embeddings straight into a Dense classifier has a
# weakness: the model scores tokens one by one and cannot fully account for
# the relations between tokens, their context or word order, so it cannot
# tell "you love i" from "i love you".
model = models.Sequential([
    # 10000 = largest word index above + 1; 8 = length of each token vector.
    layers.Embedding(10000, 8, input_length=100),
    layers.Flatten(),
    layers.Dense(1, activation="sigmoid"),
])

model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["acc"])

model.summary()

In [None]:
gc.collect()
# Train for 10 epochs, holding out the last 20% of the training data for validation.
history = model.fit(x_train,y_train,epochs=10,batch_size=32,validation_split=.2)
# One row per epoch, one column per recorded metric.
history_df = pd.DataFrame(history.history)
# Training vs. validation loss.
history_df[["loss","val_loss"]].plot()

In [None]:
# Training vs. validation accuracy per epoch.
history_df[["acc","val_acc"]].plot()

## 循环神经网络 - RNN

循环的目的在于一步一步的计算时间步内结果并作为状态向后传递，循环作用于单个输入序列上（类似人眼扫过一大段话时，会一小段一小段的看，同时大脑中的内容模型会根据已经看过的内容实时更新，最后看完整句话，并理解了它的意思），在两个不同的序列之间，状态会被重置；

- SimpleRNN：最简单的循环神经网络。SimpleRNN的问题在于：当处于时刻t时，理论上说，此时模型应该记住之前所有时间步见过的信息，但是实际上它不可能学习到这种**长期依赖**，原因在于**梯度消失问题**；
- LSTM：长短期记忆，是SimpleRNN的一种变体，简单地说它增加了一种可以跨越多个时间步传递信息的机制，以解决SimpleRNN的长期依赖捕获不到的问题；
- GRU：门控循环单元，工作原理与LSTM相同，但做了一些简化，因此计算代价更低（表示能力可能略逊于LSTM）；

RNN高级用法：
- 循环dropout：
- 堆叠循环层：
- 双向循环：

In [None]:
gc.collect()
# Minimal NumPy forward pass of a simple RNN over one random input sequence.
timesteps, input_features, output_features = 10, 8, 16

input_seq = np.random.random((timesteps, input_features))

# Random parameters: W acts on the input, U on the previous state, B is the bias.
W = np.random.random((output_features, input_features))
U = np.random.random((output_features, output_features))
B = np.random.random((output_features,))

# The state starts at zero and is replaced by each step's output.
state_t = np.zeros((output_features,))
output_seq = []
for input_t in input_seq:
    pre_activation = W.dot(input_t) + U.dot(state_t) + B
    output_t = np.tanh(pre_activation)
    state_t = output_t
    output_seq.append(output_t)

print(input_seq)
print(output_seq)

In [None]:
gc.collect()
model = models.Sequential([
    layers.Embedding(10000, 32),
    # Per the original note: for each sequence only the result of the last timestep is returned.
    layers.SimpleRNN(32),
])
model.summary()

In [None]:
gc.collect()
model = models.Sequential([
    layers.Embedding(10000, 32),
    # return_sequences=True: the results of all timesteps are returned.
    layers.SimpleRNN(32, return_sequences=True),
])
model.summary()

In [None]:
gc.collect()
# Stacked recurrent layers: the three inner layers return their full output
# sequence for the next recurrent layer; the last one does not set return_sequences.
model = models.Sequential(
    [layers.Embedding(10000, 32)]
    + [layers.SimpleRNN(32, return_sequences=True) for _ in range(3)]
    + [layers.SimpleRNN(32)]
)
model.summary()

In [None]:
gc.collect()
# Vocabulary size, fixed review length (in tokens) and training batch size.
max_features = 10000
maxlen = 500
batch_size = 32
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
# Bring every review to exactly `maxlen` tokens.
input_train = preprocessing.sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = preprocessing.sequence.pad_sequences(input_test, maxlen=maxlen)
for split in (input_train, y_train, input_test, y_test):
    print(split.shape)

In [None]:
gc.collect()
# Embedding -> SimpleRNN -> single sigmoid unit for binary classification.
model = models.Sequential([
    layers.Embedding(max_features, 32),
    layers.SimpleRNN(32),
    layers.Dense(1, activation="sigmoid"),
])

model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["acc"])

# validation_split=.2 holds out 20% of the training data for validation.
history = model.fit(
    input_train,
    y_train,
    epochs=10,
    batch_size=batch_size,
    validation_split=.2,
)

In [None]:
# One row per epoch, one column per recorded metric.
history_df = pd.DataFrame(history.history)
# Training vs. validation loss.
history_df[["loss","val_loss"]].plot()

In [None]:
# Training vs. validation accuracy per epoch.
history_df[["acc","val_acc"]].plot()

In [None]:
gc.collect()
model = models.Sequential([
    layers.Embedding(max_features, 32),
    # Per the original note: LSTM mitigates SimpleRNN's inability to capture
    # long-term dependencies caused by vanishing gradients.
    layers.LSTM(32),
    layers.Dense(1, activation="sigmoid"),
])

model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["acc"])

# validation_split=.2 holds out 20% of the training data for validation.
history = model.fit(
    input_train,
    y_train,
    epochs=10,
    batch_size=batch_size,
    validation_split=.2,
)

In [None]:
# One row per epoch, one column per recorded metric.
history_df = pd.DataFrame(history.history)
# Training vs. validation loss.
history_df[["loss","val_loss"]].plot()

In [None]:
# Training vs. validation accuracy per epoch.
history_df[["acc","val_acc"]].plot()

## 循环神经网络应用于时序序列

- 循环dropout；
- 堆叠循环层；
- 双向循环；

时序序列问题抽象：
1. 设timestep=10min，每个step采集一次数据；
2. 给定lookback个step之内的数据；
3. 能够预测delay个step之后的某个target；

In [None]:
# Load the Jena climate CSV and show its first rows.
jc_df = pd.read_csv("../input/jena-climate-2009-2016/jena_climate_2009_2016.csv")
jc_df.head()

In [None]:
# Plot the "T (degC)" column over the whole dataset.
jc_df["T (degC)"].plot()

In [None]:
jc_df.iloc[:1440]["T (degC)"].plot() # 144 points per day, so this is the temperature over the first 10 days

In [None]:
# Standardize every column to zero mean and unit variance.
# Both statistics are computed on the first 200000 rows only, the range later
# used for training, so no information from the validation/test rows leaks in.
float_data = jc_df.drop("Date Time",axis=1).values
mean = float_data[:200000].mean(axis=0)
float_data -= mean
# Fix: std previously used the first 20000 rows, inconsistent with the mean.
std = float_data[:200000].std(axis=0)
float_data /= std
float_data

In [None]:
# Build a generator of time-series samples and their targets using `yield`.
def generator(data,lookback,delay,min_index,max_index,shuffle=False,batch_size=128,step=6):
    '''
    Endlessly yield (samples, targets) batches cut from a time series.

    data: 2-D array; rows are timesteps, the last axis holds the features.
    lookback: number of rows before the anchor row that each sample looks back over.
    delay: the target is taken `delay` rows after the anchor row.
    min_index,max_index: bounds on `data` for the anchor rows; when max_index
        is None it becomes len(data)-delay-1.
    shuffle: if true, anchor rows are drawn at random (with replacement);
        otherwise they are read in order, restarting from min_index+lookback
        once a full batch would reach max_index.
    batch_size: number of samples per batch (a sequential batch is cut short at max_index).
    step: stride between the rows kept inside the lookback window.

    Yields a tuple (samples, targets): samples has shape
    (batch, lookback//step, n_features); targets has shape (batch,) and holds
    column 1 of the target row (presumably the temperature column - confirm
    against the CSV column order).
    '''
    if max_index is None:
        max_index = len(data) - delay - 1
    first_row = min_index + lookback
    cursor = first_row
    while True:
        if shuffle:
            rows = np.random.randint(first_row, max_index, size=batch_size)
        else:
            # Wrap around before a batch would reach the end of the range.
            if cursor + batch_size >= max_index:
                cursor = first_row
            rows = np.arange(cursor, min(cursor + batch_size, max_index))
            cursor += len(rows)
        n_rows = len(rows)
        samples = np.zeros((n_rows, lookback // step, data.shape[-1]))
        targets = np.zeros((n_rows,))
        for j, anchor in enumerate(rows):
            window = range(anchor - lookback, anchor, step)
            samples[j] = data[window]
            targets[j] = data[anchor + delay][1]
        yield samples, targets

In [None]:
# lookback: rows of history per sample; step: stride inside that window;
# delay: how many rows ahead the target lies. Assuming one row every 10
# minutes, this is 10 days of history sampled hourly, predicting 24 hours ahead.
lookback,step,delay,batch_size = 1440,6,144,128
# Training batches are drawn at random from the first 200000 rows.
train_gen = generator(data=float_data,lookback=lookback,delay=delay,min_index=0,max_index=200000,shuffle=True,step=step,batch_size=batch_size)
# Validation and test batches are read in chronological order (shuffle=False):
# with random draws, val_steps/test_steps batches would sample the range with
# replacement instead of sweeping it, making the metrics noisy and unrepeatable.
val_gen = generator(data=float_data,lookback=lookback,delay=delay,min_index=200001,max_index=300000,shuffle=False,step=step,batch_size=batch_size)
test_gen = generator(data=float_data,lookback=lookback,delay=delay,min_index=300001,max_index=None,shuffle=False,step=step,batch_size=batch_size)

# Number of batches needed for roughly one pass over each evaluation range.
val_steps = (300000-200001-lookback) // batch_size
test_steps = (len(float_data)-300001-lookback) // batch_size

In [None]:
# Naive baseline: predict that the target equals the most recent observed value.
def evaluate_naive_method():
    """Print the mean absolute error of the naive baseline on the validation generator.

    For each of `val_steps` batches drawn from `val_gen`, the prediction is
    feature 1 of the last timestep of every sample (the same column the
    generator uses for its targets). The per-batch MAEs are averaged and printed.
    """
    batch_maes = []
    for _ in range(val_steps):
        samples, targets = next(val_gen)
        last_observed = samples[:, -1, 1]
        batch_maes.append(np.mean(np.abs(last_observed - targets)))
    print("naive method mae=", np.mean(batch_maes))
evaluate_naive_method()

In [None]:
# Densely connected baseline: flatten the whole lookback window and regress one value.
model = models.Sequential([
    layers.Flatten(input_shape=(lookback // step, float_data.shape[-1])),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
])
model.compile(loss='mae', optimizer=optimizers.RMSprop())
history = model.fit_generator(
    train_gen,
    steps_per_epoch=500,
    epochs=20,
    validation_data=val_gen,
    validation_steps=val_steps,
)

In [None]:
# One row per epoch, one column per recorded metric.
history_df = pd.DataFrame(history.history)
# Training vs. validation loss (MAE).
history_df[['loss','val_loss']].plot()

In [None]:
# Recurrent baseline: a single GRU layer followed by a one-unit regression head.
model = models.Sequential([
    # None in input_shape leaves the number of timesteps unspecified.
    layers.GRU(32, input_shape=(None, float_data.shape[-1])),
    layers.Dense(1),
])
model.compile(loss='mae', optimizer=optimizers.RMSprop())
history = model.fit_generator(
    train_gen,
    steps_per_epoch=500,
    epochs=20,
    validation_data=val_gen,
    validation_steps=val_steps,
)

In [None]:
# One row per epoch, one column per recorded metric.
history_df = pd.DataFrame(history.history)
# Training vs. validation loss (MAE).
history_df[['loss','val_loss']].plot()

In [None]:
# GRU with dropout on both the inputs (dropout) and the recurrent state (recurrent_dropout).
model = models.Sequential([
    layers.GRU(
        32,
        dropout=.2,
        recurrent_dropout=.2,
        input_shape=(None, float_data.shape[-1]),
    ),
    layers.Dense(1),
])
model.compile(loss='mae', optimizer=optimizers.RMSprop())
history = model.fit_generator(
    train_gen,
    steps_per_epoch=500,
    epochs=20,
    validation_data=val_gen,
    validation_steps=val_steps,
)

In [None]:
# One row per epoch, one column per recorded metric.
history_df = pd.DataFrame(history.history)
# Training vs. validation loss (MAE).
history_df[['loss','val_loss']].plot()

In [None]:
# Stacked GRU layers with dropout.
model = models.Sequential()
# The first GRU must emit its output at every timestep so the second GRU
# receives a sequence. Fix: the keyword is `return_sequences`; the previous
# spelling `return_sequence` is not a GRU argument.
model.add(layers.GRU(32,dropout=.1,recurrent_dropout=.5,return_sequences=True,input_shape=(None,float_data.shape[-1])))
model.add(layers.GRU(64,activation='relu',dropout=.1,recurrent_dropout=.5))
model.add(layers.Dense(1))
model.compile(optimizer=optimizers.RMSprop(),loss='mae')
history = model.fit_generator(train_gen,steps_per_epoch=500,epochs=20,validation_data=val_gen,validation_steps=val_steps)

In [None]:
# One row per epoch, one column per recorded metric.
history_df = pd.DataFrame(history.history)
# Training vs. validation loss (MAE).
history_df[['loss','val_loss']].plot()

## 高级深度学习最佳实践

- Keras函数式API；
- Keras回调函数；
- TensorBoard可视化工具；
- 开发先进模型的重要最佳实践；
    - 高级架构模式：
        1. 批标准化；
        2. 深度可分离卷积；
        3. 残差连接；
    - 超参数优化；
    - 集成模型；

多模态数据输入：
- 构建多个模型，每个模型处理一种输入，最后结果加权融合 - 缺点：多个模型学习的内容可能是互相冗余的，数据分散降低了假设空间的可能性；
- 构建单个模型同时处理多个输入 - 优点：不需要额外处理结果，对数据的使用充分且不会冗余；

多任务输出：
- 构建多个模型 - 缺点：会有重复工作量，数据特征互相是相关的；
- 构建单个模型输出多个任务输出 - 优点：任务之间的相关性使得模型浅层表示会对各个任务都有用；

In [None]:
# Sequential only builds linear stacks of layers: simple and clear, single input, single output.
# The Keras functional API can build graph-like models and supports multiple inputs and outputs.
# Multi-modal input: text, structured data and images jointly serving one task.

input_tensor = Input(shape=(64,))
# The input shape comes from `input_tensor`, so the layer needs no `input_shape` argument.
x = layers.Dense(32,activation="relu")(input_tensor)
x = layers.Dense(32,activation="relu")(x)
output_tensor = layers.Dense(10,activation="softmax")(x)

# A Model is defined by its input and output tensors.
model = models.Model(input_tensor,output_tensor)
model.summary()

model.compile(optimizer="rmsprop",loss="categorical_crossentropy")

# Random data, only to show that the model can be trained and evaluated.
x_train = np.random.random((1000,64))
y_train = np.random.random((1000,10))
model.fit(x_train,y_train,epochs=10,batch_size=128)
model.evaluate(x_train,y_train)

## 生成式深度学习

顾名思义，不是用于被动性（目标识别）、反应性（驾驶汽车），而是创造性的任务；

In [None]:
def reweight_distribution(distribution, temperature=.5):
    """Reweight a probability distribution with a softmax temperature.

    The log of `distribution` is divided by `temperature`, exponentiated and
    renormalized so the result sums to 1. With temperature 1 a normalized
    input is returned unchanged; temperatures below 1 sharpen it.
    """
    scaled_log_probs = np.log(distribution) / temperature
    unnormalized = np.exp(scaled_log_probs)
    return unnormalized / np.sum(unnormalized)

In [None]:
# Print the same distribution reweighted at increasing temperatures.
arr = np.array([0.1,0.2,0.3,0.4])
print(arr)
for temperature in (.01, .1, .5, .8, 1.):
    print(reweight_distribution(arr, temperature))

## 变分自编码器VAE与生成式对抗网络GAN

In [None]:
# VAE encoder network.
img_shape = (28,28,1)
batch_size = 16
latent_dim = 2 # dimensionality of the latent space

input_img = Input(shape=img_shape)
# Four 3x3 "same" convolutions with ReLU; only the second one uses stride 2.
x = input_img
for filters, strides in ((32, (1,1)), (64, (2,2)), (64, (1,1)), (64, (1,1))):
    x = layers.Conv2D(filters, 3, padding="same", activation="relu", strides=strides)(x)
# Remember the feature-map shape so the decoder can restore it.
shape_before_flattening = K.int_shape(x)
x = layers.Flatten()(x)
x = layers.Dense(32, activation="relu")(x)
# The input image ends up encoded as the two parameters z_mean and z_log_var.
z_mean = layers.Dense(latent_dim)(x)
z_log_var = layers.Dense(latent_dim)(x)

In [None]:
# # VAE编码器网络
# img_shape = (500,1)
# batch_size = 16
# latent_dim = 2 # 潜在空间维度

# input_img = Input(shape=img_shape)
# x = layers.Conv1D(32,3,padding="same",activation="relu")(input_img)
# x = layers.Conv1D(64,3,padding="same",activation="relu",strides=2)(x)
# x = layers.Conv1D(64,3,padding="same",activation="relu")(x)
# x = layers.Conv1D(64,3,padding="same",activation="relu")(x)
# shape_before_flattening = K.int_shape(x)
# x = layers.Flatten()(x)
# x = layers.Dense(32,activation="relu")(x)
# z_mean = layers.Dense(latent_dim)(x) # 输入图像最终被编码器编码为z_mean和z_log_var两个参数
# z_log_var = layers.Dense(latent_dim)(x)

In [None]:
# Sampling function for the latent space:
# uses z_mean and z_log_var to generate one point of that space.
def sampling(args):
    """Draw a latent point as mean + exp(0.5 * log_var) * standard normal noise.

    args: pair (z_mean, z_log_var) of tensors. The noise has one row per row
    of the mean tensor and `latent_dim` columns.
    """
    mu, log_var = args
    noise = K.random_normal(shape=(K.shape(mu)[0], latent_dim), mean=0., stddev=1.)
    sigma = K.exp(.5 * log_var)
    return mu + sigma * noise
z = layers.Lambda(sampling)([z_mean,z_log_var])

In [None]:
# # 潜在空间的采样函数
# # 使用z_mean和z_log_var生成空间中的一个点
# def sampling(args):
#     z_mean_,z_log_var_ = args
#     epsilon = K.random_normal(shape=(K.shape(z_mean_)[0],latent_dim),mean=0.,stddev=1.)
#     return z_mean_ + K.exp(.5*z_log_var_) * epsilon
# z = layers.Lambda(sampling)([z_mean,z_log_var])

In [None]:
# VAE decoder network: maps a point of the latent space to an image.
decoder_input = Input(K.int_shape(z)[1:])
feature_map_shape = shape_before_flattening[1:]
# Expand the latent vector to as many units as the encoder's last feature map, then restore that shape.
x = layers.Dense(np.prod(feature_map_shape), activation="relu")(decoder_input)
x = layers.Reshape(feature_map_shape)(x)
# The stride-2 transposed convolution mirrors the encoder's stride-2 convolution.
x = layers.Conv2DTranspose(32, 3, padding="same", activation="relu", strides=(2,2))(x)
# Single-channel output with sigmoid activation.
x = layers.Conv2D(1, 3, padding="same", activation="sigmoid")(x)
decoder = models.Model(decoder_input, x)
# Chain the encoder and the decoder.
z_decoded = decoder(z)

In [None]:
# # VAE解码器网络，将潜在空间的点映射为图像
# decoder_input = Input(K.int_shape(z)[1:])
# x = layers.Dense(np.prod(shape_before_flattening[1:]),activation="relu")(decoder_input)
# x = layers.Reshape(shape_before_flattening[1:])(x)
# x = layers.Conv1DTranspose(32,3,padding="same",activation="relu",strides=2)(x)
# x = layers.Conv1D(1,3,padding="same",activation="sigmoid")(x)
# decoder = models.Model(decoder_input,x)
# z_decoded = decoder(z) # 编码器与解码器组合

In [None]:
# Custom layer that computes the loss.
class CustomVariationalLayer(keras.layers.Layer):
    """Layer whose only job is to attach the VAE loss to the model.

    It receives [input image, decoded image, z_mean, z_log_var], registers the
    loss through add_loss and passes the input image through unchanged.
    """
    def vae_loss(self,x,z_decoded,z_mean,z_log_var):
        """Return the mean of reconstruction cross-entropy plus the weighted KL term."""
        flat_input = K.flatten(x)
        flat_output = K.flatten(z_decoded)
        reconstruction_loss = keras.metrics.binary_crossentropy(flat_input, flat_output)
        # KL regularization term, weighted by 5e-4.
        kl_term = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = -5e-4 * K.mean(kl_term, axis=-1)
        return K.mean(reconstruction_loss + kl_loss)
    def call(self,inputs):
        """Register the loss for `inputs` and return the original image tensor."""
        x,z_decoded,z_mean,z_log_var = inputs
        self.add_loss(self.vae_loss(x,z_decoded,z_mean,z_log_var), inputs=inputs)
        # The returned value is not used for the loss; the layer just has to return something.
        return x

y = CustomVariationalLayer()([input_img,z_decoded,z_mean,z_log_var])

In [None]:
# # 自定义损失函数层
# class CustomVariationalLayer(keras.layers.Layer):
#     def vae_loss(self,x,z_decoded,z_mean,z_log_var):
#         x = K.flatten(x)
#         z_decoded = K.flatten(z_decoded)
#         xent_loss = keras.metrics.binary_crossentropy(x,z_decoded)
#         k1_loss = -5e-4 * K.mean(1+z_log_var-K.square(z_mean)-K.exp(z_log_var),axis=-1)
#         return K.mean(xent_loss+k1_loss)
#     def call(self,inputs):
#         x,z_decoded,z_mean,z_log_var = inputs
#         loss = self.vae_loss(x,z_decoded,z_mean,z_log_var)
#         self.add_loss(loss,inputs=inputs)
#         return x

# y = CustomVariationalLayer()([input_img,z_decoded,z_mean,z_log_var])

In [None]:
# Instantiate the model. No loss has to be given at compile time and no
# y_train is needed, because the loss is attached by the custom layer that produces `y`.
vae = models.Model(input_img,y)
vae.compile(optimizer="RMSprop",loss=None)
vae.summary()

In [None]:
gc.collect()
(x_train, _), (x_test, y_test) = mnist.load_data()
# Scale pixels to [0, 1] as float32 and append a trailing channel axis.
x_train = np.expand_dims(x_train.astype("float32") / 255., axis=-1)
x_test = np.expand_dims(x_test.astype("float32") / 255., axis=-1)

# No targets are passed (y=None); the test images serve as validation data.
vae.fit(
    x=x_train,
    y=None,
    shuffle=True,
    epochs=10,
    batch_size=batch_size,
    validation_data=(x_test, None),
)

In [None]:
# gc.collect()
# max_features,maxlen,batch_size = 10000,500,32
# (x_train,y_train),(x_test,y_test) = imdb.load_data(num_words=max_features)
# x_train = preprocessing.sequence.pad_sequences(input_train,maxlen=maxlen)
# x_test = preprocessing.sequence.pad_sequences(input_test,maxlen=maxlen)
# print(x_train.shape)
# print(y_train.shape)
# print(x_test.shape)
# print(y_test.shape)

# vae.fit(x=x_train,y=None,shuffle=True,epochs=10,batch_size=batch_size,validation_data=(x_test,None))

In [None]:
# n=15
# print(np.linspace(0.05,0.95,n)) # 获取a到b的n等分集合
# print(norm.ppf(np.linspace(0.05,0.95,n))) # ppf：累积分布函数的逆函数，返回传入x对应的正态分布的x轴坐标，0对应0.5，两侧依次类推

In [None]:
# Sample the latent space on a grid and decode every point into an image.
n = 15
digit_size = 28
figure = np.zeros((digit_size*n, digit_size*n))
# Grid coordinates: 5%..95% quantiles of the standard normal distribution.
grid_x = norm.ppf(np.linspace(0.05,0.95,n))
grid_y = norm.ppf(np.linspace(0.05,0.95,n))
for i,yi in enumerate(grid_x):
    for j,xi in enumerate(grid_y):
        # Repeat the point batch_size times to form a full batch of shape (batch_size, 2).
        z_sample = np.tile([xi, yi], (batch_size, 1))
        x_decoded = decoder.predict(z_sample, batch_size=batch_size)
        # All rows of the batch are identical, so only the first decoded image is kept.
        digit = x_decoded[0].reshape(digit_size, digit_size)
        top, left = i*digit_size, j*digit_size
        figure[top:top+digit_size, left:left+digit_size] = digit

plt.figure(figsize=(10,10))
plt.imshow(figure,cmap="Greys_r")

In [None]:
# # 从潜在空间采样，解码成文本
# n = 15
# digit_size = maxlen
# figure = np.zeros((digit_size*n, digit_size*n))
# grid_x = norm.ppf(np.linspace(0.05,0.95,n))
# grid_y = norm.ppf(np.linspace(0.05,0.95,n))
# for i,yi in enumerate(grid_x):
#     for j,xi in enumerate(grid_y):
#         z_sample = np.array([[xi,yi]])
#         z_sample = np.tile(z_sample,batch_size).reshape(batch_size,2)
#         x_decoded = decoder.predict(z_sample,batch_size=batch_size)
#         digit = x_decoded[0].reshape(digit_size,)
#         print(digit)
#         break
#     break

In [None]:
# Decode the single latent point at the 95% / 5% standard-normal quantiles.
xi, yi = norm.ppf([0.95, 0.05])
print(xi, yi)
# Repeat the point batch_size times to form a full batch of shape (batch_size, 2).
z_sample = np.tile([xi, yi], (batch_size, 1))
x_decoded = decoder.predict(z_sample, batch_size=batch_size)
digit = x_decoded[0].reshape(digit_size, digit_size)
plt.imshow(digit, cmap="Greys_r")

In [None]:
# Decode the single latent point at the 85% / 15% standard-normal quantiles.
xi, yi = norm.ppf([0.85, 0.15])
print(xi, yi)
# Repeat the point batch_size times to form a full batch of shape (batch_size, 2).
z_sample = np.tile([xi, yi], (batch_size, 1))
x_decoded = decoder.predict(z_sample, batch_size=batch_size)
digit = x_decoded[0].reshape(digit_size, digit_size)
plt.imshow(digit, cmap="Greys_r")

In [None]:
# Decode the single latent point at the 75% / 25% standard-normal quantiles.
xi, yi = norm.ppf([0.75, 0.25])
print(xi, yi)
# Repeat the point batch_size times to form a full batch of shape (batch_size, 2).
z_sample = np.tile([xi, yi], (batch_size, 1))
x_decoded = decoder.predict(z_sample, batch_size=batch_size)
digit = x_decoded[0].reshape(digit_size, digit_size)
plt.imshow(digit, cmap="Greys_r")

The end.
From SIBAT.