In [7]:
# PCR (Principal Components Regression)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# PCA 라이브러리 불러오기
from sklearn.decomposition import PCA
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
ad = pd.read_csv('./Advertising.csv', index_col = 0)
ad

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
...,...,...,...,...
196,38.2,3.7,13.8,7.6
197,94.2,4.9,8.1,9.7
198,177.0,9.3,6.4,12.8
199,283.6,42.0,66.2,25.5


In [3]:
# PCR with advertising data

# feature와 response 분리

ad.columns

Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')

In [5]:
x = ad[['TV', 'Radio', 'Newspaper']]
y = ad[['Sales']]

x

Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4
...,...,...,...
196,38.2,3.7,13.8
197,94.2,4.9,8.1
198,177.0,9.3,6.4
199,283.6,42.0,66.2


In [6]:
y

Unnamed: 0,Sales
1,22.1
2,10.4
3,9.3
4,18.5
5,12.9
...,...
196,7.6
197,9.7
198,12.8
199,25.5


In [8]:
# 차원 축소(2개 변수 사용 > 2차원 축소)

pca = PCA(n_components=2)

In [9]:
# 적합(fitting)
pca.fit(x)

PCA(n_components=2)

In [10]:
# 2개의 PC의 분산설명력
pca.explained_variance_

array([7373.2933766 ,  516.07662595])

In [11]:
# 분산설명력 비율
pca.explained_variance_ratio_
# PC1 : 91.41%
# PC2 : 6.39 % 
# >> 97.8% 설명

array([0.91415577, 0.06398422])

In [12]:
# PC1축과 PC2 축으로 변환

z = pca.fit_transform(x)
z

array([[ 8.37885459e+01,  3.97809922e+01],
       [-1.02138730e+02,  2.11666084e+01],
       [-1.28992484e+02,  4.66162737e+01],
       [ 5.07097169e+00,  3.24605596e+01],
       [ 3.40590034e+01,  2.09381554e+01],
       [-1.37372550e+02,  5.31645709e+01],
       [-8.95411919e+01, -1.56105218e+00],
       [-2.71687122e+01, -1.85233852e+01],
       [-1.39089396e+02, -3.26587125e+01],
       [ 5.23964374e+01, -1.70725851e+01],
       [-8.12021689e+01, -1.07115904e+01],
       [ 6.72413078e+01, -2.57554904e+01],
       [-1.22554515e+02,  3.94626040e+01],
       [-5.00532789e+01, -2.65102072e+01],
       [ 5.73839300e+01,  1.68358060e+01],
       [ 4.89406812e+01,  2.87251422e+01],
       [-7.78006550e+01,  8.41188799e+01],
       [ 1.34889902e+02,  2.69844146e+01],
       [-7.80470423e+01, -1.10237502e+01],
       [ 8.60991305e-02, -1.04730127e+01],
       [ 7.17442417e+01,  2.16291631e+01],
       [ 9.00507477e+01, -1.47109210e+01],
       [-1.33597890e+02,  1.75720041e+01],
       [ 8.

In [13]:
# PCA를 한 후, scaling된 값을 활용
# 훈련용 셋, 테스트 셋 구분

train_z = z[:-20]
test_z = z[-20 :]

train_y = y[:-20]
test_y = y[-20:]

In [14]:
# 선형회귀 객체 생성

regr = linear_model.LinearRegression()

# training data를 이용, 적합
regr.fit(train_z, train_y)

# training data를 이용, 예측
train_y_pred = regr.predict(train_z)

#(**) test data 이용, 예측
test_y_pred = regr.predict(test_z)

In [15]:
# 회귀 계수(coefficients)
print('coefficients \n', regr.coef_)

coefficients 
 [[0.04749783 0.06306131]]


In [17]:
# training MSE
print('Training_MSE \n', round(mean_squared_error(train_y, train_y_pred), 4))

Training_MSE 
 8.2043


In [18]:
# test MSE
print('Test_MSE \n', round(mean_squared_error(test_y, test_y_pred), 4))

Test_MSE 
 8.5583


In [19]:
# training data R2 score
print('Training R2 \n', round(r2_score(train_y, train_y_pred),4))

Training R2 
 0.6876


In [20]:
# teset data R2 score
print('Test R2 \n', round(r2_score(test_y, test_y_pred),4))

Test R2 
 0.7517
