In [1]:
# 导入所需要的包
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, initializers, optimizers, regularizers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import initializers as init
from sklearn.model_selection import train_test_split
from scipy.stats import zscore

# 查看tf、keras版本
# print(tf.__version__)
# print(keras.__version__)

In [2]:
# Read the training and test CSV files with pandas.
# `path` is a directory prefix (it ends with '/'), so file names are appended directly.
path = 'E:/3_code/4_python/ML/Dive-into-DL-TensorFlow2.0/data/kaggle_house/'
train_data, test_data = (
    pd.read_csv(path + file_name) for file_name in ('train.csv', 'test.csv')
)

In [3]:
# Stack the train and test features row-wise so that both sets go through the
# same preprocessing (standardization, one-hot encoding) below.
# Train keeps columns 1..-2 (first and last column dropped -- presumably Id and
# the SalePrice label); test keeps columns 1.. (first column dropped).
all_features = pd.concat(
    [train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]],
    axis=0,
)

In [4]:
# Drop a hand-picked set of columns, then standardize every numeric feature.
#
# The drop is done by reassignment with errors='ignore' instead of
# inplace=True: with the in-place version, executing this cell a second time
# raised KeyError because the columns were already gone.
all_features = all_features.drop(
    columns=['Alley', '3SsnPorch', 'FireplaceQu', 'PoolArea', 'PoolQC', 'Fence',
             'MiscFeature', 'MiscVal', 'BsmtHalfBath', 'LowQualFinSF', 'BsmtFinSF2',
             'LandContour', 'Utilities', 'LandSlope'],
    errors='ignore')

# Every column whose dtype is not `object` is treated as a numeric feature.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index

# Z-score each numeric column: subtract its mean and divide by its standard
# deviation. NaNs are left in place here; they are filled with the column
# mean in a later cell.
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))

In [5]:
# One-hot encode the non-numeric columns. `dummy_na` is left at its default,
# so a missing category value does not get an indicator column of its own.
all_features = pd.get_dummies(data=all_features)

In [6]:
# Impute every remaining NaN with the mean of its own column.
mean_cols = all_features.mean(axis=0)
all_features = all_features.fillna(value=mean_cols)

In [7]:
# Sanity check: total count of missing values left in the feature table.
print(all_features.isna().sum().sum())

0


In [8]:
# Split the combined feature table back into its train / test parts and
# convert everything from pandas to float64 numpy arrays.
n_train = len(train_data)
train_features = np.array(all_features.iloc[:n_train].values, dtype=np.float64)
test_features = np.array(all_features.iloc[n_train:].values, dtype=np.float64)
# Labels are reshaped into a single column, one row per training sample.
train_labels = np.array(train_data['SalePrice'].values.reshape(-1, 1), dtype=np.float64)

In [15]:
def get_k_fold_data(k, i, X, y):
    """Split ``X``/``y`` into ``k`` contiguous folds; fold ``i`` is the validation set.

    Folds are consecutive row ranges of ``X.shape[0] // k`` rows each. The
    last fold additionally receives the ``X.shape[0] % k`` leftover rows, so
    every sample ends up in exactly one fold (previously those trailing rows
    were silently dropped from both training and validation data).

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the fold used for validation, ``0 <= i < k``.
        X: 2-D feature array, indexed as ``X[rows, :]``.
        y: Label array with the same number of rows as ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``. The validation parts are
        slices of the inputs. The training parts are the remaining folds
        joined along axis 0 with ``tf.concat``; when there is only a single
        training fold (``k == 2``) that slice is returned as-is.

    Raises:
        AssertionError: If ``k <= 1``.
        ValueError: If ``i`` is not a valid fold index.
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError('fold index i must satisfy 0 <= i < k, got i=%r, k=%r' % (i, k))

    n_samples = X.shape[0]
    fold_size = n_samples // k
    X_valid, y_valid = None, None
    X_parts, y_parts = [], []
    for j in range(k):
        start = j * fold_size
        # The final fold runs to the end of the data so the remainder rows are kept.
        stop = n_samples if j == k - 1 else start + fold_size
        idx = slice(start, stop)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        else:
            X_parts.append(X_part)
            y_parts.append(y_part)

    if len(X_parts) == 1:
        # Nothing to join; hand the single training fold back unchanged.
        X_train, y_train = X_parts[0], y_parts[0]
    else:
        # One concat over all parts instead of re-concatenating inside the loop.
        X_train = tf.concat(X_parts, axis=0)
        y_train = tf.concat(y_parts, axis=0)
    return X_train, y_train, X_valid, y_valid


In [47]:
def get_net(wd=0):
    """Build the (uncompiled) regression network.

    Architecture: Dense(256, relu) -> Dropout(0.1) -> Dense(8, relu) -> Dense(1).

    Args:
        wd: L2 regularization factor. It is applied to the kernel of the
            output layer only; the hidden layers are not regularized.

    Returns:
        A ``keras.models.Sequential`` model. The input size is not fixed here.
    """
    hidden_stack = [
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(8, activation='relu'),
    ]
    output_layer = layers.Dense(1, kernel_regularizer=regularizers.l2(wd))
    return keras.models.Sequential(hidden_stack + [output_layer])

In [48]:
def start_train(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size, history_loss):
    """Train one fresh network per fold and return the first one that is good enough.

    For every fold index ``i`` in ``range(k)`` the data is split with
    ``get_k_fold_data``, a new network from ``get_net(weight_decay)`` is
    compiled with mean-squared-logarithmic-error loss and Adam, and fitted
    with early stopping on ``val_loss``. As soon as a fold's network reaches a
    validation loss below ``history_loss`` it is returned. If no fold does,
    the loss curves of the last fold are plotted and the last network is
    returned.

    Args:
        k: Number of folds (``get_k_fold_data`` requires ``k > 1``).
        X_train: Feature array.
        y_train: Label array.
        num_epochs: Maximum number of epochs per fold.
        learning_rate: Learning rate passed to Adam.
        weight_decay: L2 factor forwarded to ``get_net``.
        batch_size: Mini-batch size for fitting and evaluation.
        history_loss: Validation-loss threshold for returning early.

    Returns:
        The trained Keras model (first one under the threshold, else the last).

    Raises:
        ValueError: If ``k < 1`` (no model would be trained at all).
    """
    if k < 1:
        raise ValueError('k must be at least 1, got %r' % (k,))

    net, loss, val_loss = None, [], []
    for i in range(k):
        X_fold, y_fold, X_valid, y_valid = get_k_fold_data(k, i, X_train, y_train)
        net = get_net(weight_decay)
        net.compile(
            loss=tf.keras.losses.mean_squared_logarithmic_error,
            optimizer=keras.optimizers.Adam(learning_rate))
        # Stop once val_loss has not improved by at least min_delta for `patience` epochs.
        monitor = keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=1e-3,
            patience=10, verbose=1, mode='auto',
            restore_best_weights=True)
        history = net.fit(X_fold, y_fold, validation_data=(X_valid, y_valid),
            epochs=num_epochs, callbacks=[monitor],
            batch_size=batch_size, validation_freq=1)
        loss = history.history['loss']
        val_loss = history.history['val_loss']
        # restore_best_weights may roll the model back to an earlier epoch, so
        # val_loss[-1] need not describe the weights `net` holds now. Score the
        # current weights directly before comparing against the threshold.
        current_val_loss = net.evaluate(X_valid, y_valid, batch_size=batch_size, verbose=0)
        # The reported values are MSLE losses (train: last epoch; valid: current weights).
        print('fold %d, train msle %f, valid msle %f'
              % (i, loss[-1], current_val_loss))
        if current_val_loss < history_loss:
            return net

    # No fold met the threshold: show the loss curves of the last fold.
    fig, ax = plt.subplots()
    ax.plot(loss, label='train')
    ax.plot(val_loss, label='valid')
    ax.legend(loc='upper right')
    ax.set_title('Training and Validation Loss')
    plt.show()
    return net

In [49]:
def start_train1(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size, history_loss):
    """Retrain on one fixed hold-out split up to ``k`` times.

    The data is split once into 80% training / 20% validation with a fixed
    ``random_state``. Each attempt builds a new network with
    ``get_net(weight_decay)``, compiles it with mean-squared-logarithmic-error
    loss and Adam, and fits it with early stopping on ``val_loss``. The first
    network whose validation loss is below ``history_loss`` is returned; if
    none qualifies, the network from the last attempt is returned.

    Args:
        k: Maximum number of training attempts.
        X_train: Feature array.
        y_train: Label array.
        num_epochs: Maximum number of epochs per attempt.
        learning_rate: Learning rate passed to Adam.
        weight_decay: L2 factor forwarded to ``get_net``.
        batch_size: Mini-batch size for fitting and evaluation.
        history_loss: Validation-loss threshold for returning early.

    Returns:
        The trained Keras model.

    Raises:
        ValueError: If ``k < 1`` (no model would be trained at all).
    """
    if k < 1:
        raise ValueError('k must be at least 1, got %r' % (k,))

    # The split is seeded and therefore identical on every attempt, so it is
    # computed (and converted to float64 tensors) once, outside the loop.
    # train_test_split returns [X_train, X_valid, y_train, y_valid].
    X_fit, X_valid, y_fit, y_valid = (
        tf.convert_to_tensor(part, dtype=tf.float64)
        for part in train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    )

    net = None
    for _ in range(k):
        net = get_net(weight_decay)
        net.compile(
            loss=tf.keras.losses.mean_squared_logarithmic_error,
            optimizer=keras.optimizers.Adam(learning_rate))
        # Stop once val_loss has not improved by at least min_delta for `patience` epochs.
        monitor = keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=1e-3,
            patience=10, verbose=1, mode='auto',
            restore_best_weights=True)
        net.fit(X_fit, y_fit, validation_data=(X_valid, y_valid),
            epochs=num_epochs, callbacks=[monitor],
            batch_size=batch_size, validation_freq=1)
        # restore_best_weights may roll the model back to an earlier epoch, so
        # the last recorded val_loss need not describe the weights `net` holds
        # now. Score the current weights directly instead.
        current_val_loss = net.evaluate(X_valid, y_valid, batch_size=batch_size, verbose=0)
        if current_val_loss < history_loss:
            break
    return net

In [50]:
# Hyper-parameters for the training run.
k = 100               # number of loop iterations inside start_train1
num_epochs = 100000   # epoch cap per fit call; early stopping can end a fit sooner
lr = 0.5              # learning rate handed to Adam
weight_decay = 0      # L2 factor forwarded to get_net
batch_size = 256
history_loss = 0.010  # validation-loss threshold at which start_train1 returns

net = start_train1(
    k,
    train_features,
    train_labels,
    num_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
    history_loss=history_loss,
)

ch 66/100000
Epoch 67/100000
Epoch 68/100000
Epoch 69/100000
Epoch 70/100000
Epoch 71/100000
Epoch 72/100000
Epoch 73/100000
Epoch 74/100000
Epoch 75/100000
 256/1168 [=====>........................] - ETA: 0s - loss: 0.0253Restoring model weights from the end of the best epoch.
Epoch 00075: early stopping
Train on 1168 samples, validate on 292 samples
Epoch 1/100000
Epoch 2/100000
Epoch 3/100000
Epoch 4/100000
Epoch 5/100000
Epoch 6/100000
Epoch 7/100000
Epoch 8/100000
Epoch 9/100000
Epoch 10/100000
Epoch 11/100000
Epoch 12/100000
Epoch 13/100000
Epoch 14/100000
Epoch 15/100000
Epoch 16/100000
Epoch 17/100000
Epoch 18/100000
Epoch 19/100000
Epoch 20/100000
Epoch 21/100000
Epoch 22/100000
Epoch 23/100000
Epoch 24/100000
Epoch 25/100000
Epoch 26/100000
Epoch 27/100000
Epoch 28/100000
Epoch 29/100000
Epoch 30/100000
Epoch 31/100000
Epoch 32/100000
Epoch 33/100000
Epoch 34/100000
Epoch 35/100000
Epoch 36/100000
Epoch 37/100000
Epoch 38/100000
Epoch 39/100000
Epoch 40/100000
Epoch 41/10000

In [13]:
# data = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)
# # print(type(data))
# # data[0] = np.array(data[0], dtype=np.float64)
# # data[1] = np.array(data[0], dtype=np.float64)
# # data[2] = np.array(data[0], dtype=np.float64)
# # data[3] = np.array(data[0], dtype=np.float64)
# data[0] = tf.convert_to_tensor(data[0],dtype=tf.float64)
# data[1] = tf.convert_to_tensor(data[1],dtype=tf.float64)
# data[2] = tf.convert_to_tensor(data[2],dtype=tf.float64)
# data[3] = tf.convert_to_tensor(data[3],dtype=tf.float64)
# model = get_net()
# model.compile(
#         loss=tf.keras.losses.mean_squared_logarithmic_error, 
#         optimizer=keras.optimizers.Adam(0.5))
# # 当性能没有提升时，停止训练
# monitor = keras.callbacks.EarlyStopping(
#     monitor='val_loss', min_delta=1e-3,
#     patience=10, verbose=1, mode='auto', 
#     restore_best_weights=True)
# model.fit(data[0], data[2], validation_data=(data[1], data[3]), epochs=num_epochs, callbacks=[monitor],
#     batch_size=batch_size, validation_freq=1, verbose=2)

In [37]:
# Predict on the test features and write the submission file.
preds = np.array(net.predict(test_features))

# Build the submission as its own frame rather than adding a 'SalePrice'
# column to test_data: mutating test_data means that re-running the
# feature-building cell (which takes test_data.iloc[:, 1:]) would pull the
# predictions in as an extra input feature.
submission = pd.DataFrame({
    'Id': test_data['Id'].to_numpy(),
    'SalePrice': preds.reshape(-1),  # flatten the predictions to one value per row
})
submission.to_csv('submission.csv', index=False)