## sklearn.preprocessing.PolynomialFeatures

* _class_ sklearn.preprocessing.PolynomialFeatures(_degree=2_,  _*_,  _interaction_only=False_,  _include_bias=True_,  _order='C'_)[[source]](https://github.com/scikit-learn/scikit-learn/blob/364c77e04/sklearn/preprocessing/_polynomial.py#L30)[](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures "Permalink to this definition")

In [1]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Build the degree-1 (monomial) input features as the 2x2 matrix [[0, 1], [2, 3]];
# this is the array that gets expanded into polynomial features below.
X = np.array([[0, 1], [2, 3]])

print('일차 단항식 계수 feature :\n', X)

일차 단항식 계수 feature :
 [[0 1]
 [2 3]]


In [3]:
# Fit a degree-2 PolynomialFeatures transformer on X (fit returns the
# transformer itself), then expand X into its degree-2 polynomial terms.
poly = PolynomialFeatures(degree=2).fit(X)
poly_ftr = poly.transform(X)

print('변환된 2차 다항식 계수 feature : \n', poly_ftr)

변환된 2차 다항식 계수 feature : 
 [[1. 0. 1. 0. 0. 1.]
 [1. 2. 3. 4. 6. 9.]]


In [4]:
def polynomial_func(X):
    """Evaluate the cubic 1 + 2*X + X**2 + X**3 using X's own arithmetic.

    With the NumPy array passed in this notebook the expression is applied
    element by element, so the result has the same shape as X.
    """
    constant, linear = 1, 2 * X
    quadratic, cubic = X ** 2, X ** 3
    # Terms are summed in the same left-to-right order as the formula above.
    return constant + linear + quadratic + cubic

# Prepare the 2x2 input and the cubic target values derived from it,
# then show both (input first, targets second).
X = np.arange(4).reshape(2, 2)
y = polynomial_func(X)

print('일차 단항식 계수 feature : \n' , X)
print('삼차 다항식 결정식 : \n', y)

일차 단항식 계수 feature : 
 [[0 1]
 [2 3]]
삼차 다항식 결정식 : 
 [[ 1  5]
 [17 43]]


In [6]:
# Expand X into degree-3 polynomial features
poly_ftr = PolynomialFeatures(degree=3).fit_transform(X)
print('3차 다항식 계수 feature : \n', poly_ftr)

# Train LinearRegression on the degree-3 features against the cubic target
# values and inspect the learned coefficients. y is 2-D here (same shape as X),
# so coef_ holds one row of coefficients per target column.
model = LinearRegression().fit(poly_ftr, y)

print('Polynomial 회귀 계수\n', np.round(model.coef_, 2))
print('Polynomial 회귀 Shape : ', model.coef_.shape)

3차 다항식 계수 feature : 
 [[ 1.  0.  1.  0.  0.  1.  0.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.  8. 12. 18. 27.]]
Polynomial 회귀 계수
 [[0.   0.02 0.02 0.05 0.07 0.1  0.1  0.14 0.22 0.31]
 [0.   0.06 0.06 0.11 0.17 0.23 0.23 0.34 0.51 0.74]]
Polynomial 회귀 Shape :  (2, 10)


## sklearn.pipeline.Pipeline

* _class_ sklearn.pipeline.Pipeline(_steps_,  _*_,  _memory=None_,  _verbose=False_)[[source]](https://github.com/scikit-learn/scikit-learn/blob/364c77e04/sklearn/pipeline.py#L52)[](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline "Permalink to this definition")

In [7]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np

In [9]:
def polynomial_func(X):
    """Return the cubic 1 + 2*X + X**2 + X**3 computed with X's operators.

    For the NumPy array used in this notebook the arithmetic is element-wise,
    so the returned array has the same shape as X.
    """
    squared = X ** 2
    cubed = X ** 3
    # Same left-to-right summation order as the written formula.
    return 1 + 2 * X + squared + cubed

# Toy data: the 2x2 input and its cubic target values
X = np.arange(4).reshape(2, 2)
y = polynomial_func(X)

# Chain the polynomial feature transform and the linear regression in a single
# Pipeline, so one fit() call runs both steps in order.
model = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('linear', LinearRegression()),
])
model.fit(X, y)  # fit() returns the pipeline itself, so no reassignment is needed

print('Polynomial 회귀 계수\n', np.round(model.named_steps['linear'].coef_, 2))

Polynomial 회귀 계수
 [[0.   0.02 0.02 0.05 0.07 0.1  0.1  0.14 0.22 0.31]
 [0.   0.06 0.06 0.11 0.17 0.23 0.23 0.34 0.51 0.74]]


# 보스턴 주택 가격 예측 실습

> 다항 회귀를 이용해서 보스턴 주택 가격을 예측한다.

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np

In [13]:
# Load the Boston housing dataset; the first CSV column is used as the row index.
# The location defaults to the original absolute path but can be overridden with
# the BOSTON_CSV environment variable, so the notebook can also run on machines
# where that path does not exist.
import os

BOSTON_CSV = os.environ.get("BOSTON_CSV", "C:/apps/ml_7/datasets/Boston.csv")
boston = pd.read_csv(BOSTON_CSV, index_col=0)

In [18]:
# Separate the frame: 'medv' (the house price) is the target,
# every remaining column is a feature.
X_data = boston.drop(columns='medv')
y_target = boston['medv']

print(boston.shape)
boston.head()

(506, 14)


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [19]:
# Hold out 30% of the rows as the test set; random_state pins the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X_data, y_target, test_size=0.3, random_state=156)

In [23]:
# Chain the degree-2 polynomial feature transform (without the bias column)
# and the linear regression in a single Pipeline.
p_model = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)), ('linear', LinearRegression())])

# Display the pipeline that was just built. It must be the last expression of
# the cell to be rendered; the previously pasted Pipeline(...) literal built a
# second, unused pipeline (with a differently named 'Linear' step) and has been removed.
p_model

In [24]:
# Train the pipeline on the training split and predict on the held-out split
p_model.fit(X_train, y_train)
y_preds = p_model.predict(X_test)

# Error metrics on the test split: MSE and its square root (RMSE)
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

print(f'MSE : {mse:.3f}, RMSE : {rmse:.3f}')
print(f'Variance score : {r2_score(y_test, y_preds):.3f}')

MSE : 15.556, RMSE : 3.944
Variance score : 0.782


In [26]:
# Chain the degree-3 polynomial feature transform (without the bias column)
# and the linear regression in a single Pipeline
p_model = Pipeline([('poly', PolynomialFeatures(degree=3, include_bias=False)),
                    ('linear', LinearRegression())])

p_model

In [28]:
# Train the degree-3 pipeline on the training split and predict on the held-out split
p_model.fit(X_train, y_train)
y_preds = p_model.predict(X_test)

# Error metrics on the test split: MSE and its square root (RMSE)
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

print(f'MSE : {mse:.3f}, RMSE : {rmse:.3f}')
print(f'Variance score : {r2_score(y_test, y_preds):.3f}')

MSE : 79625.592, RMSE : 282.180
Variance score : -1116.598


In [29]:
# Inspecting the features produced by the degree=2 polynomial transform shows that
# the 13 original features are combined and expanded into 104 features.
# Raising the polynomial degree can lead to overfitting.
X_train_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X_train, y_train)

print(X_train_poly.shape, X_train.shape)

(354, 104) (354, 13)
