## Linear Regression

### H(x) = Wx + b에서 W와 b의 값을 찾기

## 01. Boston Housing Dataset 불러오기

In [1]:
from sklearn.datasets import load_boston
%matplotlib inline

In [2]:
# Load the Boston housing bunch and split it into model inputs and labels.
# NOTE(review): load_boston is reportedly removed in scikit-learn >= 1.2 --
# confirm the pinned scikit-learn version before re-running this cell.
boston = load_boston()

X, y = boston["data"], boston["target"]  # X: input data, y: target data

In [3]:
# Print the dataset description first, then the feature-name listing.
for key in ('DESCR', 'feature_names'):
    print(boston[key])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

## 02. pandas 데이터 셋으로 변경

In [15]:
import pandas as pd
import numpy as np

# Wrap the feature matrix in a DataFrame whose columns are labelled with the
# dataset's feature names, then append the target values as a PRICE column.
data = pd.DataFrame(X, columns=boston['feature_names'])
data['PRICE'] = y

# Preview the first rows. The earlier intermediate head() call was dropped:
# its result was discarded, so it had no visible effect.
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## 03. x의 값과 w의 값을 지정
### x1~x3, w1~w3 (13개 feature 중 CRIM, ZN, INDUS 세 개만 사용)

In [22]:
# Pull the three feature columns used by the model out as NumPy arrays.
x1, x2, x3 = (data[column].values for column in ("CRIM", "ZN", "INDUS"))

# Draw one starting weight per feature, each sampled uniformly from [0.0, 1.0).
# The generator is consumed left to right, so w1, w2, w3 are drawn in order.
w1, w2, w3 = (np.random.uniform(low=0.0, high=1.0) for _ in range(3))

In [23]:
# Show the randomly drawn starting weights before any training happens.
initial_weights = (w1, w2, w3)
print(*initial_weights)

0.5911635034998545 0.4717140671105343 0.5887115292916901


In [24]:
# Gradient-descent settings: number of passes and step size.
learning_rate = 0.000005
num_epoch = 10000

# One formatter shared by the periodic and the final progress lines.
report = "{0:5} error = {1:.5f}".format

for epoch in range(num_epoch):
    # Linear prediction from the three features; note there is no bias term.
    y_predict = x1 * w1 + x2 * w2 + x3 * w3
    residual = y_predict - y

    # Reported metric is the mean absolute error. The weight updates below
    # follow the gradient of half the mean squared error instead.
    err = np.abs(residual).mean()

    # All three updates use the residual computed before any weight changed,
    # so the step is simultaneous across w1, w2 and w3.
    w1 -= learning_rate * (residual * x1).mean()
    w2 -= learning_rate * (residual * x2).mean()
    w3 -= learning_rate * (residual * x3).mean()

    # Log progress every 1000 epochs.
    if not epoch % 1000:
        print(report(epoch, err))

print("----" * 10)
# Final line: the error measured at the start of the last epoch, i.e. before
# that epoch's weight update was applied.
print(report(epoch, err))

    0 error = 13.06213
 1000 error = 11.96344
 2000 error = 11.48906
 3000 error = 11.16285
 4000 error = 10.92769
 5000 error = 10.75571
 6000 error = 10.65491
 7000 error = 10.59984
 8000 error = 10.56735
 9000 error = 10.55052
----------------------------------------
 9999 error = 10.53768


In [27]:
# Recompute the model output with the trained weights.
y_predict = (x1 * w1
             + x2 * w2
             + x3 * w3)

# Work on a copy so the original `data` frame is left untouched.
all_df = data.copy()

# NOTE(review): despite its label, this column stores the model's predictions
# (y_predict), not the target -- consider renaming once the saved output below
# is regenerated.
all_df['PRICE(target)'] = y_predict
all_df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,PRICE(target)
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,10.49727
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,9.165586
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,9.165592
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,2.818302
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,2.806261
