## 更多关于线性回归模型的讨论

In [1]:
import numpy as np
import matplotlib.pyplot as plt 
from sklearn import datasets

In [2]:
# Load the Boston house-price dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older releases — confirm the pinned version.
boston = datasets.load_boston()

# Multiple linear regression: use every feature column, not just one.
X, y = boston.data, boston.target

# Data cleaning: keep only the samples whose target is strictly below 50.0
# (presumably 50.0 is a cap on the recorded price — TODO confirm).
below_cap = y < 50.0
X, y = X[below_cap], y[below_cap]

In [3]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

In [4]:
# Fit a linear regression model on the training split.
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
# Kept as the cell's last expression so the fitted estimator's repr is displayed.
linear_reg.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [5]:
# Coefficients of the fitted model, one per feature column of X_train.
linear_reg.coef_

array([-1.20354261e-01,  3.64423279e-02, -3.61493155e-02,  5.12978140e-02,
       -1.15775825e+01,  3.42740062e+00, -2.32311760e-02, -1.19487594e+00,
        2.60101728e-01, -1.40219119e-02, -8.35430488e-01,  7.80472852e-03,
       -3.80923751e-01])

In [6]:
# 对于这些系数：
#   正数：在其他特征保持不变时，对应的特征和最终的房价正相关
#   负数：在其他特征保持不变时，对应的特征和最终的房价负相关
#   系数的绝对值大小：代表对应的特征每变化一个单位时，对最终房价的影响程度
#   注意：这里的特征没有做归一化，各特征量纲不同，因此不同特征的系数绝对值不能直接比较

In [7]:
# Indices that order the coefficients from smallest to largest; used next to
# look up which feature each coefficient belongs to.
linear_reg.coef_.argsort()

array([ 4,  7, 10, 12,  0,  2,  6,  9, 11,  1,  3,  8,  5])

In [8]:
# Feature names reordered by ascending coefficient value.
boston.feature_names[linear_reg.coef_.argsort()]

array(['NOX', 'DIS', 'PTRATIO', 'LSTAT', 'CRIM', 'INDUS', 'AGE', 'TAX',
       'B', 'ZN', 'CHAS', 'RAD', 'RM'], dtype='<U7')

In [10]:
# Print the dataset description to see what each feature name means.
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [11]:
# 其中最大的正相关 RM 代表每套住宅的平均房间数，其次的正相关 RAD 代表到达放射状高速公路的便利程度指数
# 再来看看最大的负相关 NOX 代表房子周围一氧化氮的浓度，其次的负相关 DIS 代表到波士顿 5 个就业中心的加权距离

### 所以哪怕你拟合出来的线性回归的结果不够好，只要能够发现这些特征和最终结果之间的相关性，也是有价值的

### 因此你拿到一组数据后，先用线性回归的方式试试看，总归是没有坏处的