In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

# 0.载入数据集
以下是关于数据集的解释

Each record in the database describes a Boston suburb or town.
1. CRIM: per capita crime rate by town.人均城镇犯罪率
2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft. 占地面积超过25,000平方英尺的住宅用地比例
3. INDUS: proportion of non-retail business acres per town.非零售业务用地比例
4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)  在河边为1，否则为0
5. NOX: nitric oxides concentration (parts per 10 million) 一氧化氮浓度
6. RM: average number of rooms per dwelling 每个住宅的平均房间数
7. AGE: proportion of owner-occupied units built prior to 1940 1940年前的自有住房的比例 
8. DIS: weighted distances to five Boston employment centers 到5个波士顿就业中心的加权距离
9. RAD: index of accessibility to radial highways 径向公路的通行性指数
10. TAX: full-value property-tax rate per $10,000 每一万美元的全额财产税率
11. PTRATIO: pupil-teacher ratio by town 按照镇分的师生比例
12. B: $1000(B_k - 0.63)^2$, where $B_k$ is the proportion of blacks by town 其中 $B_k$ 为城镇中黑人所占比例
13. LSTAT: lower status of the population 人口中地位较低人群的百分数 
14. MEDV: Median value of owner-occupied homes in 1000s 房价中位数，单位为1000美元

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the Boston housing data file inside the Kaggle input directory.
filePath = '/kaggle/input/boston-house-prices/housing.csv'

In [None]:
# Column names for the 14 fields, in file order; the file has no header row.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
         'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' splits on runs of whitespace. It is the documented equivalent of
# delim_whitespace=True, which is deprecated as of pandas 2.2.
df = pd.read_csv(filePath, sep=r'\s+', header=None, names=names)
df.head(10)

查看数据集中是否有空值，如果有空值，则需要进行必要的处理

In [None]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

若想对数据的基本情况进行快速了解，可以用如下方式获得

In [None]:
# Summary statistics per column, transposed so that each feature is one row.
df.describe().transpose()

使用相关系数矩阵查看各个特征之间的关系

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
# Diverging palette so that the colour scale is split around center=0 below.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(
    corrmat,
    cmap=cmap,
    center=0,
    vmax=1,
    annot=True,
    square=True,
    ax=ax,  # the axes created above (also the current axes)
)
plt.show()

使用numpy读取csv文件

In [None]:
# Load the same file as a plain NumPy array.
numpy_data = np.genfromtxt(filePath)
# Columns 0-12 are the features; column 13 (the last one) is the target.
data, target = numpy_data[:, :13], numpy_data[:, 13]

In [None]:
# Report the feature-matrix shape and the target-vector shape, one per line.
print(data.shape, target.shape, sep='\n')

# 特征提取
## 3.1 字段CRIM分析

In [None]:
import matplotlib.pyplot as plt


def drawScatter(x, y='MEDV', data=None):
    """Draw a scatter plot of one column against another and show it.

    Args:
        x: Name of the column plotted on the x-axis.
        y: Name of the column plotted on the y-axis. Defaults to 'MEDV'.
        data: Table to read both columns from (indexed by column name).
            Defaults to the notebook-level ``df``.

    Note:
        The two rcParams assignments below change matplotlib's global
        configuration, so they persist after the call returns.
    """
    frame = df if data is None else data
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly
    plt.scatter(frame[x], frame[y])
    plt.xlabel(x)
    plt.ylabel(y)
    plt.grid()
    plt.show()

In [None]:
# Scatter plot of CRIM (x-axis) against MEDV (y-axis).
drawScatter(x='CRIM')

分析结论：
1. 高房价的房屋都集中在低犯罪率地区；
2. 城镇人均犯罪率超过20%的情况下，房价最高不高于20；
3. 城镇人均犯罪率处于(10, 20)区间的情况下，房价最高不高于30。

## 3.2字段ZN分析
ZN表示住宅用地所占比例，把它作为x轴的数值。

In [None]:
# Scatter plot of ZN (x-axis) against MEDV (y-axis).
drawScatter(x='ZN')

## 3.3字段INDUS分析
INDUS表示城镇中非零售商业用地的所占比例，把它作为x轴的数值。

In [None]:
# Scatter plot of INDUS (x-axis) against MEDV (y-axis).
drawScatter(x='INDUS')

## 3.4 字段CHAS分析
CHAS表示地产是否处于查尔斯河边，1表示在河边，0表示不在河边。

In [None]:
# Scatter plot of CHAS (x-axis) against MEDV (y-axis).
drawScatter(x='CHAS')

## 3.5 NOX分析
NOX表示一氧化氮浓度

In [None]:
# Scatter plot of NOX (x-axis) against MEDV (y-axis).
drawScatter(x='NOX')

## 3.6 字段RM分析
RM表示每栋住宅的房间数。

In [None]:
# Scatter plot of RM (x-axis) against MEDV (y-axis).
drawScatter(x='RM')

每栋房子的房间数与房价存在比较明显的线性关系

## 3.7 字段AGE分析
AGE表示1940年以前建成的业主自住单位的占比

In [None]:
# Scatter plot of AGE (x-axis) against MEDV (y-axis).
drawScatter(x='AGE')

## 3.8 DIS分析
DIS表示到5个波士顿就业中心的加权距离

In [None]:
# Scatter plot of DIS (x-axis) against MEDV (y-axis).
drawScatter(x='DIS')

## 3.9 字段RAD分析
RAD表示距离高速公路的便利指数

In [None]:
# Scatter plot of RAD (x-axis) against MEDV (y-axis).
drawScatter(x='RAD')

## 3.10 字段TAX分析
TAX表示每一万美元的全额不动产税率

In [None]:
# Scatter plot of TAX (x-axis) against MEDV (y-axis).
drawScatter(x='TAX')

## 3.11 字段PTRATIO分析
PTRATIO表示各城镇的师生比（学生数与教师数之比）

In [None]:
# Scatter plot of PTRATIO (x-axis) against MEDV (y-axis).
drawScatter(x='PTRATIO')

## 3.12 字段B分析
B由城镇中黑人的比例 $B_k$ 计算得到：$B = 1000(B_k - 0.63)^2$。

In [None]:
# Scatter plot of B (x-axis) against MEDV (y-axis).
drawScatter(x='B')

## 3.13 字段LSTAT分析
LSTAT表示人口中地位较低人群所占的百分比

In [None]:
# Scatter plot of LSTAT (x-axis) against MEDV (y-axis).
drawScatter(x='LSTAT')

# 使用GBDT来进行回归预测

In [None]:
from sklearn.model_selection import train_test_split

In [None]:

# Features are every column except the last; the target is the last column.
X = df[names[:-1]]
Y = df[names[-1]]

# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and therefore every metric reported below, reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
print(x_train.shape)
print(x_test.shape)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Squared-error loss is the default, so no `loss` argument is passed: the old
# alias 'ls' was deprecated in scikit-learn 1.0 and removed in 1.2.
# random_state pins the estimator's internal randomness for reproducible runs.
gdbt = GradientBoostingRegressor(random_state=42)
gdbt.fit(x_train, y_train)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print('score', gdbt.score(x_test, y_test))

gdbt_y_predict = gdbt.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 

In [None]:
# Regression metrics for the GBDT predictions on the held-out split.
gdbt_mse = mean_squared_error(y_test, gdbt_y_predict)
print('r2', r2_score(y_test, gdbt_y_predict))
print('mse', gdbt_mse)
print('mae', mean_absolute_error(y_test, gdbt_y_predict))
print('rmse', np.sqrt(gdbt_mse))  # RMSE is the square root of the MSE above

In [None]:
# Compare GBDT predictions with the actual prices, one point per test sample.
sample_index = range(1, len(x_test) + 1)  # 1-based position within the test split
plt.grid()
for values, series_label in ((gdbt_y_predict, 'predict'), (y_test, 'actual')):
    plt.scatter(sample_index, values, label=series_label)
plt.legend()
plt.xlabel('index')
plt.ylabel('price')

plt.show()

# 使用神经网络来进行回归预测
创建训练数据以及测试数据集

对特征数据进行标准化处理

In [None]:
# Standardize both splits with statistics computed from the training split only.
mean = x_train.mean()  # per-column mean (axis=0 is the DataFrame default)
std = x_train.std()  # per-column standard deviation
train_data_processed = x_train.sub(mean).div(std)

test_data_processed = x_test.sub(mean).div(std)

# Sanity check: print the per-column mean and standard deviation after scaling.
print(train_data_processed.mean())
print(train_data_processed.std())

In [None]:
from keras import models
from keras import layers

## 创建神经网络模型进行建模

In [None]:
def build_model():
    """Build and compile a small fully connected regression network.

    Two hidden Dense layers of 64 ReLU units feed a single output unit with no
    activation. The input width is read from the notebook-level
    ``train_data_processed``.

    Returns:
        The compiled Sequential model (rmsprop optimizer, MSE loss, MAE metric).
    """
    n_features = train_data_processed.shape[1]
    network = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(n_features,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),  # no activation: a single unbounded output value
    ])
    network.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return network

In [None]:
model = build_model()
# Train silently on the standardized training split; no validation data is passed.
history = model.fit(train_data_processed, y_train, epochs=80, batch_size=16, verbose=0)
# Unpacked as (loss, metric), i.e. MSE then MAE as configured in build_model().
test_mse_score, test_mae_score = model.evaluate(test_data_processed, y_test)

# fit() received no validation data, so history holds training metrics only:
# the curve below is the per-epoch training MAE, and is labelled as such.
train_mae = history.history['mae']
plt.plot(range(1, len(train_mae) + 1), train_mae)
plt.xlabel('Epochs')
plt.ylabel('Training MAE')
plt.show()

In [None]:
# Test-split MSE and MAE returned by model.evaluate, one per line.
print(test_mse_score, test_mae_score, sep='\n')

In [None]:
# Predict prices for the standardized test features with the trained network.
predict_result = model.predict(test_data_processed)

In [None]:
# Regression metrics for the neural-network predictions on the held-out split.
nn_mse = mean_squared_error(y_test, predict_result)
print('r2', r2_score(y_test, predict_result))
print('mse', nn_mse)
print('mae', mean_absolute_error(y_test, predict_result))
print('rmse', np.sqrt(nn_mse))  # RMSE is the square root of the MSE above

实际值与预测的偏差图

In [None]:
# Overlay neural-network predictions, actual prices and GBDT predictions,
# one point per test sample.
nn_index = range(1, predict_result.shape[0] + 1)  # 1-based, sized from the NN output
gbdt_index = range(1, len(x_test) + 1)  # 1-based, sized from the test split
plt.grid()
series = (
    (nn_index, predict_result, 'predict'),
    (nn_index, y_test, 'actual'),
    (gbdt_index, gdbt_y_predict, 'gbdt_predict'),
)
for positions, values, series_label in series:
    plt.scatter(positions, values, label=series_label)
plt.legend()
plt.xlabel('index')
plt.ylabel('price')

plt.show()