In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

## 微调神经网络参数

In [None]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

#### 加载数据

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing ()

### 尝试使用csv导入

In [None]:
from pandas import read_csv
#使用Pandas导入csv数据
filename = 'housing.csv'
housing = read_csv(filename)
print(housing.shape)

(20640, 10)


In [None]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [None]:
housing=housing.drop(['ocean_proximity'],axis=1)
housing.to_csv('housing_.csv')

In [None]:
housing_ = read_csv('housing_.csv')
housing_

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
...,...,...,...,...,...,...,...,...,...,...
20635,20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [None]:
import csv

In [None]:
def read_data(test_data='features.csv',n=1,label=1):
    '''
    加载数据的功能
    n:特征数据起始位
    label：是否是监督样本数据
    '''
    csv_reader=csv.reader(open(test_data))
    data_list=[]
    for one_line in csv_reader:
        data_list.append(one_line)
    x_list=[]
    y_list=[]
    for one_line in data_list[1:]:
        if label==1:
            y_list.append(int(one_line[-1]))   #标志位
            one_list=[float(o) for o in one_line[n:-1]]
            x_list.append(one_list)
        else:
            one_list=[float(o) for o in one_line[n:]]
            x_list.append(one_list)
    return x_list, y_list

In [None]:
def split_data(data_list, y_list, ratio=0.30):
    '''
    按照指定的比例，划分样本数据集
    ratio: 测试数据的比率
    '''
    X_train, X_test, y_train, y_test = train_test_split(data_list, y_list, 
                                                        test_size=ratio, random_state=50)
    print(len(X_train), len(y_train))
    print(len(X_test), len(y_test))
    return X_train, X_test, y_train, y_test

In [None]:
read_data('housing_.csv')

NameError: name 'double' is not defined

In [None]:
split_data(x_list,y_list)

### 分割数据集

In [None]:
X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, 
                                  housing.target, 
                                  random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, 
                             y_train_full,
                            random_state=42)
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)#求训练集的固有属性

X_valid = scaler.transform(X_valid)#归一化转换验证集
X_test = scaler.transform(X_test)#归一化转换测试集

In [None]:
def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=[8]):
    """在给定一组超参数的情况下构建并编译Keras模型（超参数有默认值）"""
    
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(input_shape=input_shape))
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu"))
    model.add(keras.layers.Dense(1))#单变量回归
    
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    #选择sgd优化器
    model.compile(loss="mse", optimizer=optimizer)
    #编译
    
    return model

#### 包装模型

KerasClassifier：将深度学习分类模型包装成Scikit-Learn中的分类模型，便于其使用Scikit-Learn中的方法和函数

KerasRegressor：将深度学习回归模型包装成Scikit-Learn中的回归模型，便于其使用Scikit-Learn中的方法和函数

KerasClassifier和KerasRegressor类使用参数build_fn，指定用来创建模型的函数的名称。因此必须定义一个函数，并通过函数来定义深度学习的模型，编译并返回函数。

In [None]:
#创建一个KerasRegressor对象：包装类（Wrapper）
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)

  


#### 训练模型

In [None]:
#进行训练，回调函数选择EarlyStopping避免过拟合
#传递给fit（）方法的任何其他参数都将传递给内部的Keras模型
keras_reg.fit(X_train, y_train,
              epochs=100,
              validation_data=(X_valid, y_valid),
              callbacks=[keras.callbacks.EarlyStopping(patience=10)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fa3eb88b850>

#### 评估模型

scikit_learn中的score()是依据决定系数R²得来的。对训练集来说取值范围为[0,1]，对测试集时其值可能为负。

一般R²越高，说明自变量x对因变量y的解释程度越高，自变量引起的变动占总变动的百分比高。观察点在回归直线附近越密集。

In [None]:
#Scikit-learn框架中，回归模型的性能评估的分数(Score)
mse_test = keras_reg.score(X_test, y_test)
mse_test



-0.34119606018066406

#### 预测模型

In [None]:
y_pred = keras_reg.predict(X_test)
y_pred



array([0.6514432, 1.6107497, 4.071351 , ..., 1.513548 , 2.5864756,
       4.094509 ], dtype=float32)

# 使用 随机搜索 探索使模型效果最佳的多个超参数

## RandomizedSearchCV使用k折验证法，不使用验证集（只用于提前停止）

xe-y：表示有y个0构成的0.000……000x

np.arange()：函数返回一个有终点和起点的固定步长的排列，参数：起点、终点、步长（默认为0）

np.tolist()：将矩阵转换成列表

scipy.stats.reciprocal()：获得倒数分布的连续随机变量

In [None]:
from scipy.stats import reciprocal
#scipy.stats生成指定分布
#倒数连续随机变量
from sklearn.model_selection import RandomizedSearchCV

param_distribs = {
    "n_hidden": [0, 1, 2, 3],
    "n_neurons": np.arange(1, 100).tolist(),
    "learning_rate": reciprocal(3e-4, 3e-2).rvs(1000).tolist(),
}
#1e-3：有3个0构成的0.001 #3e-4为0.0003，3e-2为0.03 #rvs（）：产生指定个数的服从指定分布的随机数
#学习率为1000个范围为（0.003,0.03）的随机数

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
[CV] END learning_rate=0.022174573948353458, n_hidden=1, n_neurons=4; total time=   9.4s
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
[CV] END learning_rate=0.022174573948353458, n_hidden=1, n_neurons=4; total time=  23.9s
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9

RandomizedSearchCV(cv=3,
                   estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7fa3eba69850>,
                   param_distributions={'learning_rate': [0.001683454924600351,
                                                          0.02390836445593178,
                                                          0.008731907739399206,
                                                          0.004725396149933917,
                                                          0.0006154014789262348,
                                                          0.0006153331256530192,
                                                          0.0003920021771415983,
                                                          0.01619845322936229,
                                                          0.004779156784872302,
                                                          0.007821074275112298,...
                                                          0.00502142573

In [None]:
rnd_search_cv = RandomizedSearchCV(keras_reg, param_distribs, 
                  n_iter=10, cv=3, verbose=2)
#每次抽取10个样本，使用3折交叉验证法，详细程度为2

In [None]:
rnd_search_cv.fit(X_train, y_train, epochs=100,
                  validation_data=(X_valid, y_valid),
                  callbacks=[keras.callbacks.EarlyStopping(patience=10)])

In [25]:
rnd_search_cv.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__build_fn', 'estimator', 'n_iter', 'n_jobs', 'param_distributions', 'pre_dispatch', 'random_state', 'refit', 'return_train_score', 'scoring', 'verbose'])

In [26]:
keras_reg.get_params().keys()

dict_keys(['build_fn'])

In [None]:
print(3e-4)

0.0003


In [None]:
reciprocal(3e-4,3e-2)

<scipy.stats._distn_infrastructure.rv_frozen at 0x7fa3e7719610>

In [None]:
print(reciprocal(3e-4, 3e-2))
#'rv_frozen' object:一个知道其形状参数(hypergeom:m、n、n)的对象(实际上是一个实例)

<scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa3e7658450>


In [None]:
#print(reciprocal(3e-4, 3e-2).rvs(1000))

#### 访问随机搜索找到的模型的最佳参数

In [None]:
rnd_search_cv.best_params_

{'n_neurons': 80, 'n_hidden': 3, 'learning_rate': 0.0059640580092043885}

In [None]:
#最佳模型（估计器）
rnd_search_cv.best_estimator_

<keras.wrappers.scikit_learn.KerasRegressor at 0x7fa3eb807cd0>

In [None]:
#随机搜索出的模型的最佳分数（经过交叉验证过的模型在训练集中的最佳分数）
#在 tuned_params 中指定的参数的单个组合的所有cv折叠的平均分数中最高的
rnd_search_cv.best_score_

-0.31572551528612774

In [None]:
#分割数据集中不断地对测试集进行预测得出的最佳分数
#估计器为keras_reg，使用其评估方法：绝对系数
rnd_search_cv.score(X_test, y_test)



-0.2822590470314026

#### 保存模型

In [None]:
model = rnd_search_cv.best_estimator_.model
model

<keras.engine.sequential.Sequential at 0x7fa3e7719850>

#### 评估模型

In [None]:
model.evaluate(X_test, y_test)



0.2822590470314026