### The Boston Housing Dataset
https://archive.ics.uci.edu/ml/datasets/Housing

In [1]:
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
import numpy as np

In [8]:
boston = load_boston()   # dict type형태의 데이터!!!
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [9]:
print(boston["DESCR"])   # 데이터 설명 볼 수 있음

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [7]:
boston["data"]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [10]:
x_data = boston.data
y_data = boston.target.reshape(boston.target.size, 1)   # 2-dim array로 변환

In [13]:
boston.target[:5]

array([24. , 21.6, 34.7, 33.4, 36.2])

In [14]:
y_data[:5]

array([[24. ],
       [21.6],
       [34.7],
       [33.4],
       [36.2]])

In [16]:
boston.target.shape, y_data.shape

((506,), (506, 1))

In [17]:
from sklearn import preprocessing

minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,5)).fit(x_data)
# standard_scale = preprocessing.StandardScaler().fit(x_data)
x_scaled_data = minmax_scale.transform(x_data)

x_scaled_data[:3]

array([[0.00000000e+00, 9.00000000e-01, 3.39076246e-01, 0.00000000e+00,
        1.57407407e+00, 2.88752635e+00, 3.20803296e+00, 1.34601570e+00,
        0.00000000e+00, 1.04007634e+00, 1.43617021e+00, 5.00000000e+00,
        4.48399558e-01],
       [1.17961270e-03, 0.00000000e+00, 1.21151026e+00, 0.00000000e+00,
        8.64197531e-01, 2.73998850e+00, 3.91349125e+00, 1.74480990e+00,
        2.17391304e-01, 5.24809160e-01, 2.76595745e+00, 5.00000000e+00,
        1.02235099e+00],
       [1.17848872e-03, 0.00000000e+00, 1.21151026e+00, 0.00000000e+00,
        8.64197531e-01, 3.47192949e+00, 2.99691040e+00, 1.74480990e+00,
        2.17391304e-01, 5.24809160e-01, 2.76595745e+00, 4.94868627e+00,
        3.17328918e-01]])

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x_scaled_data, y_data, test_size=0.2)

In [19]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((404, 13), (102, 13), (404, 1), (102, 1))

In [20]:
from sklearn import  linear_model

regr = linear_model.LinearRegression(fit_intercept=True, 
                                     normalize=False,    # normalize 옵션
                                     copy_X=True,        # 일반적으로 True로 함
                                     n_jobs=8)           # cpu 개수를 몇 개로 해줄지?
regr.fit(X_train, y_train)
regr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=8, normalize=False)

In [22]:
regr.coef_, regr.intercept_

(array([[-1.88779234,  1.06688997,  0.33163498,  0.89555879, -1.57880191,
          3.95794126, -0.1161108 , -3.35378911,  1.37047585, -1.31329819,
         -1.52808995,  0.75001805, -4.14126917]]), array([26.07779969]))

In [23]:
# The coefficients
print('Coefficients: ', regr.coef_)
print('intercept: ', regr.intercept_)   

Coefficients:  [[-1.88779234  1.06688997  0.33163498  0.89555879 -1.57880191  3.95794126
  -0.1161108  -3.35378911  1.37047585 -1.31329819 -1.52808995  0.75001805
  -4.14126917]]
intercept:  [26.07779969]


In [25]:
regr.predict(x_scaled_data[:5])

array([[29.53983801],
       [24.54987224],
       [30.43447868],
       [28.48472173],
       [27.65149189]])

In [26]:
x_scaled_data[:5].dot(regr.coef_.T) + regr.intercept_

array([[29.53983801],
       [24.54987224],
       [30.43447868],
       [28.48472173],
       [27.65149189]])

In [27]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [28]:
# test performance
y_true = y_test
y_hat = regr.predict(X_test)

r2_score(y_true, y_hat), mean_absolute_error(y_true, y_hat), mean_squared_error(y_true, y_hat)

(0.7270446848426855, 3.3744885013538486, 21.618658345361865)

In [29]:
# train performance
y_true = y_train
y_hat = regr.predict(X_train)

r2_score(y_true, y_hat), mean_absolute_error(y_true, y_hat), mean_squared_error(y_true, y_hat)

(0.7383892584093531, 3.3236872775236392, 22.41402863317568)