In [1]:
# 导入所需要的包

%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Activation, Dropout
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import initializers as init

# 查看tf、keras版本
print(tf.__version__)
print(keras.__version__)

2.0.0
2.2.4-tf


In [2]:
# 使用pandas读取数据
path = 'E:/3_code/4_python/ML/Dive-into-DL-TensorFlow2.0/data/kaggle_house/'
train_data = pd.read_csv(path + 'train.csv')
test_data = pd.read_csv(path + 'test.csv')

print(train_data.shape)
print(test_data.shape)

(1460, 81)
(1459, 80)


In [3]:
# 将所有特征按列连接，为后面将非数值化特征变为数值化特征做准备
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

In [4]:
# 获取特征为数值的下标
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [5]:
# 将非数值特征转化为数值特征, 将缺失值也作为合法的特征值
all_features = pd.get_dummies(all_features, dummy_na=True)

In [6]:
# 分割数据集 并且将pandas转为numpy
n_train = train_data.shape[0]
train_features = np.array(all_features[:n_train].values, dtype=np.float)
test_features = np.array(all_features[n_train:].values, dtype=np.float)
train_labels = np.array(train_data.SalePrice.values.reshape(-1, 1), dtype=np.float)

In [7]:
def get_net():
    net = keras.models.Sequential()
    # net.add(Dense(256, activation='relu'))
    # net.add(Dropout(0.2))
    net.add(Dense(128, activation='relu'))
    net.add(Dropout(0.3))
    # net.add(Dense(64, activation='relu'))
    net.add(Dropout(0.4))
    # net.add(Dense(32, activation='relu'))
    net.add(Dropout(0.5))
    net.add(Dense(1, activation='relu'))
    return net

In [8]:
# k折交叉验证
def get_k_fold_data(k, i, X, y):
    assert k > 1
    # // 整数除法 返回不大于结果的一个最大整数
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        # slice切片函数 负责截取数据
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = tf.concat([X_train, X_part], axis=0)
            y_train = tf.concat([y_train, y_part], axis=0)
    return X_train, y_train, X_valid, y_valid

In [9]:
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        net.compile(
            loss=tf.keras.losses.mean_squared_logarithmic_error, 
            optimizer=keras.optimizers.Adam(learning_rate)            )
        # 当性能没有提升时，停止训练
        monitor = keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=1e-3,
            patience=5, verbose=1, mode='auto', 
            restore_best_weights=True)
        history = net.fit(
            data[0], data[1], 
            validation_data=(data[2], data[3]), 
            epochs=num_epochs, callbacks=[monitor],
            batch_size=batch_size, validation_freq=1, verbose=2)
        loss = history.history['loss']
        val_loss = history.history['val_loss']
        # print('fold %d, train rmse %f, valid rmse %f' % (i, loss[-1], val_loss[-1]))
    return loss, val_loss

In [10]:
def start_train(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    data = get_k_fold_data(k, 0, X_train, y_train)
    net = get_net()
    net.compile(
        loss=tf.keras.losses.mean_squared_logarithmic_error, 
        optimizer=keras.optimizers.Adam(learning_rate), metrics=['accuracy'])
    # 当性能没有提升时，停止训练
    monitor = keras.callbacks.EarlyStopping(
        monitor='val_loss', min_delta=1e-3,
        patience=5, verbose=1, mode='auto', 
        restore_best_weights=True)
    history = net.fit(
        data[0], data[1], 
        validation_data=(data[2], data[3]), 
        epochs=num_epochs, callbacks=[monitor],
        batch_size=batch_size, validation_freq=1, verbose=2)
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    # print('fold %d, train rmse %f, valid rmse %f' % (i, loss[-1], val_loss[-1]))
    return loss, val_loss, net

In [11]:
k, num_epochs, lr, weight_decay, batch_size = 5, 1000, 0.5, 0, 64
loss, val_loss, net= start_train(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)

# plt.subplot()函数用于直接指定划分方式和位置进行绘图。https://blog.csdn.net/missyougoon/article/details/90543210
# plt.subplot(1, 2, 2)
# plt.plot(loss, label='train')
# plt.plot(val_loss, label='valid')
# plt.legend(loc='uper right')
# plt.title('Training and Validation Loss')
# plt.show()

Train on 1168 samples, validate on 292 samples
Epoch 1/1000
1168/1168 - 1s - loss: 15.3266 - accuracy: 0.0000e+00 - val_loss: 1.9958 - val_accuracy: 0.0000e+00
Epoch 2/1000
1168/1168 - 0s - loss: 1.3338 - accuracy: 0.0000e+00 - val_loss: 0.5304 - val_accuracy: 0.0000e+00
Epoch 3/1000
1168/1168 - 0s - loss: 0.5820 - accuracy: 0.0000e+00 - val_loss: 0.1462 - val_accuracy: 0.0000e+00
Epoch 4/1000
1168/1168 - 0s - loss: 0.3821 - accuracy: 0.0000e+00 - val_loss: 0.0609 - val_accuracy: 0.0000e+00
Epoch 5/1000
1168/1168 - 0s - loss: 0.2723 - accuracy: 0.0000e+00 - val_loss: 0.0426 - val_accuracy: 0.0000e+00
Epoch 6/1000
1168/1168 - 0s - loss: 0.2182 - accuracy: 0.0000e+00 - val_loss: 0.0331 - val_accuracy: 0.0000e+00
Epoch 7/1000
1168/1168 - 0s - loss: 0.1939 - accuracy: 0.0000e+00 - val_loss: 0.0257 - val_accuracy: 0.0000e+00
Epoch 8/1000
1168/1168 - 0s - loss: 0.1818 - accuracy: 0.0000e+00 - val_loss: 0.0241 - val_accuracy: 0.0000e+00
Epoch 9/1000
1168/1168 - 0s - loss: 0.1695 - accuracy: 0

In [28]:
preds=np.array(net.predict(test_features))
# print(test_data.shape)
print(train_data.shape)
# test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
test_data['SalePrice'] = preds
print(test_data.shape)
# print(pred[0])
submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
print(submission.shape)
submission.to_csv('submission.csv', index=False)

(1460, 81)
(1459, 81)
(1459, 2)


In [12]:
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(    
#     train_features, train_labels, test_size=0.2, random_state=42)

In [13]:
# print(x_train.shape, x_test.shape)

In [14]:
# model = keras.models.Sequential()
# model.add(keras.layers.Dense(256, activation='relu'))
# model.add(keras.layers.Dense(128, activation='relu'))
# model.add(keras.layers.Dense(64, activation='relu'))
# model.add(keras.layers.Dense(32, activation='relu'))
# model.add(keras.layers.Dense(1, activation='relu'))

# model.compile(loss=tf.keras.losses.mean_squared_logarithmic_error,             optimizer=keras.optimizers.Adam(5), metrics=['accuracy'])
# # 当性能没有提升时，停止训练
# monitor = keras.callbacks.EarlyStopping(
#     monitor='val_loss', min_delta=1e-3,
#     patience=5, verbose=1, mode='auto', 
#     restore_best_weights=True)

In [15]:
# model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],batch_size=256, verbose=2,epochs=10000)
# model.fit(x_train,y_train,validation_data=(x_test,y_test),batch_size=256, verbose=2,epochs=10000)