# Linearregression - sklearn

In [1]:
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
import numpy as np

In [2]:
boston = load_boston()

In [4]:
# 보스톤 데이터의 설명
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [7]:
print(boston["data"][:3])

[[6.3200e-03 1.8000e+01 2.3100e+00 0.0000e+00 5.3800e-01 6.5750e+00
  6.5200e+01 4.0900e+00 1.0000e+00 2.9600e+02 1.5300e+01 3.9690e+02
  4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 6.4210e+00
  7.8900e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9690e+02
  9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 7.1850e+00
  6.1100e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9283e+02
  4.0300e+00]]


In [9]:
print(boston["target"][:10])

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9]


In [13]:
print(boston.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


In [17]:
x_data = boston.data
y_data = boston.target.reshape(boston.target.size, 1)

y_data[:10]

array([[24. ],
       [21.6],
       [34.7],
       [33.4],
       [36.2],
       [28.7],
       [22.9],
       [27.1],
       [16.5],
       [18.9]])

In [18]:
from sklearn import preprocessing

# 데이터 scale 
minmax_scale = preprocessing.MinMaxScaler().fit(x_data)
x_scaled_data = minmax_scale.transform(x_data)

x_scaled_data[:3]

array([[0.00000000e+00, 1.80000000e-01, 6.78152493e-02, 0.00000000e+00,
        3.14814815e-01, 5.77505269e-01, 6.41606591e-01, 2.69203139e-01,
        0.00000000e+00, 2.08015267e-01, 2.87234043e-01, 1.00000000e+00,
        8.96799117e-02],
       [2.35922539e-04, 0.00000000e+00, 2.42302053e-01, 0.00000000e+00,
        1.72839506e-01, 5.47997701e-01, 7.82698249e-01, 3.48961980e-01,
        4.34782609e-02, 1.04961832e-01, 5.53191489e-01, 1.00000000e+00,
        2.04470199e-01],
       [2.35697744e-04, 0.00000000e+00, 2.42302053e-01, 0.00000000e+00,
        1.72839506e-01, 6.94385898e-01, 5.99382080e-01, 3.48961980e-01,
        4.34782609e-02, 1.04961832e-01, 5.53191489e-01, 9.89737254e-01,
        6.34657837e-02]])

In [31]:
from sklearn.model_selection import train_test_split

# training data, test data 구분
X_train, X_test, y_train, y_test = train_test_split(x_scaled_data, y_data, test_size = 0.2)

In [32]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

404
102
404
102


In [33]:
from sklearn import linear_model

regr = linear_model.LinearRegression(fit_intercept = True,
                                                                                normalize = False,
                                                                                copy_X = True,
                                                                                n_jobs = 8)
regr.fit(X_train, y_train)
regr

LinearRegression(n_jobs=8)

In [34]:
regr.coef_, regr.intercept_ # weights, bias(W_0)

(array([[ -9.42703628,   5.3805917 ,   1.6222118 ,   2.78161134,
          -8.19072172,  17.29154693,   0.96550524, -14.88436014,
           6.95544919,  -6.7458174 ,  -8.79315907,   3.7311918 ,
         -21.06284905]]),
 array([26.80992743]))

In [35]:
# The coefficients
print('Coefficients :', regr.coef_)
print()
print('intercept: ', regr.intercept_)

Coefficients : [[ -9.42703628   5.3805917    1.6222118    2.78161134  -8.19072172
   17.29154693   0.96550524 -14.88436014   6.95544919  -6.7458174
   -8.79315907   3.7311918  -21.06284905]]

intercept:  [26.80992743]


In [37]:
# 예측값
y_true = y_test
y_pred = regr.predict(X_test)

### RMSE로 측정

In [43]:
# 직접구현
RMSE_self = np.sqrt(((y_true - y_pred) ** 2).sum()/len(y_true))
print(RMSE_self)

# 모듈사용
from sklearn.metrics import mean_squared_error as mse
np.sqrt(mse(y_true,y_pred))

5.320345604234057


5.320345604234057