# 다항 회귀 (Polynomial Regression)
- 연속형 레이블과 특성 간의 관계가 선형이 아닌 경우 1차식만으로는 표현할 수 없으므로, 특성을 2차 이상의 다항식으로 변환한 뒤 회귀 분석을 진행
- 기존의 특성에 다항 변환을 적용하여 새로운 특성을 추가
- scikit-learn의 경우 `PolynomialFeatures`를 사용하여 다항 변환 및 교차항(상호작용 항)을 쉽게 추가 가능

In [1]:
# Build a single-feature toy dataset to demonstrate the polynomial transform
import pandas as pd
X = pd.Series([1, 2, 3, 4, 5], name='X').to_frame()

In [2]:
# Apply a degree-2 polynomial transform to X
from sklearn.preprocessing import PolynomialFeatures
# Fit the degree-2 transformer on X first ...
P = PolynomialFeatures(degree=2).fit(X)
# ... then expand X and label the columns with the names the fitted transformer reports
X_POLY = pd.DataFrame(P.transform(X), columns=P.get_feature_names_out())
X_POLY

Unnamed: 0,1,X,X^2
0,1.0,1.0,1.0
1,1.0,2.0,4.0
2,1.0,3.0,9.0
3,1.0,4.0,16.0
4,1.0,5.0,25.0


In [3]:
# Build a two-feature toy dataset to demonstrate the polynomial transform
import pandas as pd
X = pd.DataFrame({'X0': [1, 2, 3, 4, 5]}).assign(X1=[6, 7, 8, 9, 10])
X

Unnamed: 0,X0,X1
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [4]:
# The transformed frame now contains squared terms; each one gets its own weight (w).
# w1*x1 + w2*x2 ...  ->  w1*x1^2 + w2*x2^2 ...  (the degree goes up)
# In other words, LinearRegression still just solves for weights, but because
# the features themselves are of higher degree the fitted equation is quadratic.
P = PolynomialFeatures(degree=2).fit(X)
X_POLY = pd.DataFrame(P.transform(X), columns=P.get_feature_names_out())
X_POLY

Unnamed: 0,1,X0,X1,X0^2,X0 X1,X1^2
0,1.0,1.0,6.0,1.0,6.0,36.0
1,1.0,2.0,7.0,4.0,14.0,49.0
2,1.0,3.0,8.0,9.0,24.0,64.0
3,1.0,4.0,9.0,16.0,36.0,81.0
4,1.0,5.0,10.0,25.0,50.0,100.0


# 다항회귀 적용

In [5]:
# 환경 설정
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [6]:
# Load the regression dataset (diabetes.csv) into a DataFrame
df = pd.read_csv('/mnt/elice/dataset/diabetes.csv')
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


In [7]:
# Separate features from the label:
# X takes every column except the last, Y takes the last column.
X, Y = df.iloc[:, :-1], df.iloc[:, -1]
X.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [9]:
# Apply the degree-2 polynomial transform; include_bias=False leaves out the constant column
P = PolynomialFeatures(degree=2, include_bias=False).fit(X)
X_Poly = P.transform(X)
# Compare shapes before and after: the transformed matrix has grown to 65 columns
X.shape, X_Poly.shape

((442, 10), (442, 65))

In [10]:
# Split the polynomial features and the label into training and evaluation sets
# (random_state is pinned to 0)
x_train, x_test, y_train, y_test = train_test_split(X_Poly, Y, random_state=0)

In [11]:
# Create the model: a LinearRegression with default settings
model = LinearRegression() 

In [12]:
# Train the model
model.fit(x_train, y_train) # use the training split only

In [13]:
# Evaluate the model (R squared) on the training split, then on the evaluation split
for label, features, target in (
    ('학습 데이터 성능 :', x_train, y_train),
    ('평가 데이터 성능 :', x_test, y_test),
):
    print(label, model.score(features, target))

학습 데이터 성능 : 0.646875342028455
평가 데이터 성능 : 0.24413674792367002


In [14]:
# Predict with the trained model
p_test = model.predict(x_test) # predictions for the evaluation split
p_test

array([244.31677323, 244.97875976, 149.08823824, 116.66764727,
       173.62488209, 249.51487076,  91.14020939, 224.5029242 ,
       168.32616594, 236.27576247, 215.43533314, 162.8234412 ,
        79.62039717, 105.16087386, 291.65552324,  87.53194636,
       145.91028234,  67.63426156, 141.18527438, 218.58855039,
       123.33569406, 154.57138326, 185.37719633, 105.0654444 ,
       209.02795358, 155.03714787, 145.66155775,  78.77480473,
       198.5411409 , 147.29182207, 215.75453135,  82.75615595,
       127.92832613, 206.54652026,  99.27480743, 183.79975701,
       174.90531198, 172.56805707, 111.45411905, 196.53739042,
        95.60174631, 154.48703038, 135.56191183, 207.51998574,
       151.14036712,  58.43285644, 144.30258091, 131.26026359,
       110.07953298, 235.95781011, 160.21446084,  75.86500057,
       166.75785082, 171.73679294, 248.71567055, 171.35904248,
       188.98404739,  88.01077194,  82.98439532, 203.85665111,
       224.67000533, 143.05709815, 111.06634709, 102.64

In [15]:
# Compute RMSE of the predictions on the evaluation split.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root of the MSE is taken
# explicitly here; this form works on every scikit-learn version.
import math
from sklearn.metrics import mean_squared_error
math.sqrt(mean_squared_error(y_test, p_test))
# Training score went up while evaluation score went down -> the gap is large: overfitting.
# At the same time the scores themselves are low -> signs of underfitting as well.

61.25698172827581

### 다항 변환 전 모델 성능
- 학습 데이터 성능 : 0.5554371489353019
- 평가 데이터 성능 : 0.35940090989715556


### 다항 변환 후 모델 성능
- 학습 데이터 성능 : 0.6468753420284549
- 평가 데이터 성능 : 0.24413674792366735

## 제출

제출을 위해 아래 `X` DataFrame에 2차 다항변환을 적용한 결과 DataFrame을 `result_df`에 저장하세요.
- 컬럼 이름은 다항 변환기가 생성해주는 이름, 즉 `get_feature_names_out`을 통해 지정합니다.

In [16]:
# Input frame for the submission: two features, five rows
X = pd.DataFrame({
    'X0': [1, 2, 3, 4, 5],
    'X1': [6, 7, 8, 9, 10],
})

from sklearn.preprocessing import PolynomialFeatures
# Degree-2 transformer, fitted on X
P = PolynomialFeatures(degree=2).fit(X)

# Apply the polynomial transform and name the columns via get_feature_names_out
result_df = pd.DataFrame(P.transform(X), columns=P.get_feature_names_out())
# X_POLY is bound to the very same DataFrame object as result_df
X_POLY = result_df

## 채점 수행

아래 코드는 채점 수행을 위한 코드입니다.

따라서 이를 수정했을시 **채점이 제대로 이루어지지 않습니다.**

**주의**: 채점 코드를 실행하기 전에 반드시 코드 파일을 한번 저장하시길 바랍니다.

In [17]:
import os
import pandas as pd

# Guard: stop here unless result_df is a pandas DataFrame.
assert isinstance(result_df, pd.DataFrame), \
    "result_df에 DataFrame을 제대로 저장했는지 확인하세요."

# Serialise the answer to result.json in the current working directory.
result_df.to_json("result.json")

# Run the `elice_grade` command on the JSON file and the notebook file.
os.system('elice_grade result.json cds_ai_exercise.ipynb')

send files ['result.json', 'cds_ai_exercise.ipynb'] for grade...
waiting result...
waiting result...
done!

Score: 100.000000
Duration: 1.466 seconds
=== Message ===
제출 완료되었습니다.


0