## 6.Scikit learn을 이용한 회귀분석

In [20]:
# Boston 데이터 불러오기
import pandas as pd
house = pd.read_csv("BostonHousing.csv")
house.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
 'PTRATIO', 'LSTAT', 'MEDV']
house.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [21]:
# Scatter Plot 행렬 구하기
import plotly.express as px
fig = px.scatter_matrix(house, dimensions=['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV'],
title='Scatter Matrix')
fig.show()
fig = px.scatter_matrix(house, dimensions=['ZN', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'MEDV'],
title='Scatter Matrix')
fig.show()

In [22]:
# 상관계수 계산
import numpy as np

col1 = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
col2 = ['ZN', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'MEDV']

cm1 = np.corrcoef(house[col1].values.T)
cm2 = np.corrcoef(house[col2].values.T)
print(cm1)
print(cm2)

[[ 1.          0.60379972  0.59087892 -0.61380827 -0.73766273]
 [ 0.60379972  1.          0.76365145 -0.39167585 -0.48372516]
 [ 0.59087892  0.76365145  1.         -0.30218819 -0.42732077]
 [-0.61380827 -0.39167585 -0.30218819  1.          0.69535995]
 [-0.73766273 -0.48372516 -0.42732077  0.69535995  1.        ]]
[[ 1.         -0.56953734  0.66440822 -0.31456332 -0.39167855  0.36044534]
 [-0.56953734  1.         -0.74788054  0.50645559  0.26151501 -0.37695457]
 [ 0.66440822 -0.74788054  1.         -0.53443158 -0.23247054  0.24992873]
 [-0.31456332  0.50645559 -0.53443158  1.          0.46085304 -0.46853593]
 [-0.39167855  0.26151501 -0.23247054  0.46085304  1.         -0.50778669]
 [ 0.36044534 -0.37695457  0.24992873 -0.46853593 -0.50778669  1.        ]]


In [23]:
# 특성 변수 로그 변환
import numpy as np

house['LLSTAT'] = np.log(house['LSTAT'])
house['LINDUS'] = np.log(house['INDUS'])
house.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV,LLSTAT,LINDUS
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0,1.60543,0.837248
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6,2.21266,1.95586
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7,1.393766,1.95586
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4,1.07841,0.779325
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2,1.673351,0.779325


In [24]:
# 선형회귀분석 적용
y = house['MEDV'].values
house = house.drop(['LSTAT', 'INDUS', 'MEDV'], axis=1)
X = house.values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

from sklearn.linear_model import LinearRegression
mlr = LinearRegression()
mlr.fit(X_train, y_train)

print('Slope:', mlr.coef_)
print('Intercept:', mlr.intercept_)

Slope: [-1.49431476e-01  3.10090136e-02  2.58695952e+00 -1.76444484e+01
  1.16042961e+00  3.15997656e-02 -1.29747125e+00  2.55714669e-01
 -6.70108437e-03 -8.14141039e-01 -1.01287282e+01 -6.32176434e-01]
Intercept: 68.59687641998106


In [25]:
# 추정된 회귀모형을 train, test에 적용하여 잔차 구하기
y_train_pred = mlr.predict(X_train)
y_test_pred = mlr.predict(X_test)
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_train_pred, y=y_train_pred - y_train, mode = 'markers',
name = 'Training data'))
fig.add_trace(go.Scatter(x=y_test_pred, y=y_test_pred - y_test, mode = 'markers',
name = 'Test data'))
fig.update_layout(width=600, height=400, title_text='Residual Plots versus predicted values',
title_x=0.5)
fig.update_xaxes(title_text='predicted')
fig.update_yaxes(title_text='residuals')
fig.show()

In [26]:
# train, test로 MSE 계산
from sklearn.metrics import mean_squared_error
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
mean_squared_error(y_test, y_test_pred)))

MSE train: 18.281, test: 18.133


In [27]:
# train, test로 R^2 계산
from sklearn.metrics import r2_score
print('R^2 train: %.3f, test: %.3f' %(r2_score(y_train, y_train_pred), r2_score(y_test,
y_test_pred)))

R^2 train: 0.775, test: 0.802


In [29]:
# RANSAC 적용하기
from sklearn.linear_model import RANSACRegressor
rans = RANSACRegressor(max_trials=100, min_samples=45, loss='absolute_error', residual_threshold=5.0,
random_state=1)
rans.fit(X, y)
y_train_pred = rans.predict(X_train)
y_test_pred = rans.predict(X_test)

In [32]:
# 회귀계수 추정에 사용된 데이터 (일종의 서포트 벡터) 확인하기
inlier_mask = rans.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

print(inlier_mask)
print(outlier_mask)

[False  True  True  True False  True  True False  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True False
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True False False False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True False  True  True  True  True
  True  True False  True False  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True False  True  True  True False False False  T

In [33]:
# 선형 SVM 회귀와 비선형 커털 SVM 회귀 각각 적합
from sklearn.svm import SVR
svl = SVR(kernel='linear', C=1.0, epsilon=0.1)
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1, gamma='scale')
svl.fit(X_train, y_train)
svr.fit(X_train, y_train)
y_train_predsvl = svl.predict(X_train)
y_train_predsvr = svr.predict(X_train)
y_test_predsvl = svl.predict(X_test)
y_test_predsvr = svr.predict(X_test)

In [34]:
# 선형 SVM 회귀 vs. 비선형 커널 SVM 회귀 MSE 비교
from sklearn.metrics import mean_squared_error
mse_l_train = mean_squared_error(y_train, y_train_predsvl)
mse_l_test = mean_squared_error(y_test, y_test_predsvl)
mse_n_train = mean_squared_error(y_train, y_train_predsvr)
mse_n_test = mean_squared_error(y_test, y_test_predsvr)
print(mse_l_train, mse_l_test, mse_n_train, mse_n_test)

21.985686319030847 17.82436651889724 66.81630398602833 75.20399632666582


In [35]:
# 선형 SVM 회귀 vs. 비선형 커널 SVM 회귀 R^2 계산
from sklearn.metrics import r2_score
R2_l = r2_score(y_train, y_train_predsvl)
R2_n = r2_score(y_train, y_train_predsvr)
print(R2_l, R2_n)

0.7292038429875223 0.1770282681813704
