## 제5장. 머신러닝 프로세스
## 제4절. 머신러닝 분석과정 빠르게 맛보기 - 회귀분석

In [None]:
import numpy as np
import pandas as pd

try:
    # Present only in scikit-learn < 1.2.
    from sklearn.datasets import load_boston
except ImportError:
    # load_boston was removed in scikit-learn 1.2. Provide a drop-in
    # replacement that rebuilds the same arrays from the original StatLib
    # file, following the recipe given in scikit-learn's removal notice.
    from sklearn.utils import Bunch

    def load_boston():
        """Return the Boston housing data as a Bunch (data, target, feature_names).

        Downloads the raw StatLib file, so network access is required.
        """
        data_url = 'http://lib.stat.cmu.edu/datasets/boston'
        raw_df = pd.read_csv(data_url, sep=r'\s+', skiprows=22, header=None)
        raw = raw_df.values
        # Each record spans two physical lines: 11 values on the first,
        # then 2 more features and the target on the second.
        data = np.hstack([raw[::2, :], raw[1::2, :2]])
        target = raw[1::2, 2]
        feature_names = np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ])
        return Bunch(data=data, target=target, feature_names=feature_names)

boston = load_boston()  # load the Boston housing-price dataset
boston_dt = boston.data  # feature matrix only (numpy array)
price = boston.target  # target values (numpy array)

# Assemble one DataFrame: the feature columns plus the target as 'PRICE'.
df = pd.DataFrame(boston_dt, columns=boston.feature_names)
df['PRICE'] = price

In [None]:
# Preview the first rows of the DataFrame to sanity-check the load.
df.head()

In [None]:
# (number of rows, number of columns) of the DataFrame.
df.shape

In [None]:
# Print a per-column summary (dtype and non-null count).
df.info()

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Build a grid of 3 rows x 4 columns of axes, one per plotted feature.
fig, axs = plt.subplots(
    nrows=3,
    ncols=4,
    figsize=(16, 10),
    constrained_layout=True,
)

# Plot every column except the target (PRICE) and CHAS.
features = df.columns.difference(['PRICE', 'CHAS'])

# axs.flat walks the grid row by row, so the n-th feature lands in
# row n // 4, column n % 4; zip stops once the 12 axes are used up.
for ax, feature in zip(axs.flat, features):
    # seaborn's regplot draws the scatter plot together with a fitted
    # linear regression line.
    sns.regplot(x=feature, y=df['PRICE'], data=df, ax=ax)

In [None]:
from sklearn.model_selection import train_test_split
# Predictor columns fed to the model (CHAS is left out).
feature_cols = [
    'CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
x = df[feature_cols].values
y = df['PRICE'].values

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Compare the mean target value of the two splits.
train_price_mean = y_train.mean()
test_price_mean = y_test.mean()
print('학습데이터세트 PRICE 평균: ', train_price_mean)
print('평가데이터세트 PRICE 평균: ', test_price_mean)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max from the training split only, then rescale
# that same split with the fitted scaler.
scaler = MinMaxScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the scaled training features (fit returns
# the estimator itself, so it can be chained).
linear = LinearRegression().fit(x_train_scaled, y_train)
# Leave the fitted estimator as the cell's last expression so it is shown.
linear

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

# Predictions on the data the model was trained on.
pred_train = linear.predict(x_train_scaled)

# Regression metrics on the training split.
mae = mean_absolute_error(y_train, pred_train)
mse = mean_squared_error(y_train, pred_train)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE
r2 = r2_score(y_train, pred_train)

# Report each metric with five decimal places.
for template, score in (
    ('MAE: {0: .5f}', mae),
    ('MSE: {0: .5f}', mse),
    ('RMSE: {0: .5f}', rmse),
    ('R2: {0: .5f}', r2),
):
    print(template.format(score))

In [None]:
# Rescale the test features with the already-fitted scaler (transform only,
# no refit), then predict prices for the held-out rows.
x_test_scaled = scaler.transform(x_test)
pred = linear.predict(x_test_scaled)

In [None]:
# Regression metrics on the held-out test split.
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE
r2 = r2_score(y_test, pred)

# Report each metric with five decimal places.
report_lines = [
    'MAE: {0: .5f}'.format(mae),
    'MSE: {0: .5f}'.format(mse),
    'RMSE: {0: .5f}'.format(rmse),
    'R2: {0: .5f}'.format(r2),
]
for line in report_lines:
    print(line)

In [None]:
# Wrap the test-set predictions in a single-column DataFrame and preview it.
pred_df = pd.DataFrame({'pred Price': pred})
pred_df.head()

In [None]:
# Wrap the true test-set prices in a single-column DataFrame and preview it.
actual = pd.DataFrame({'actual Price': y_test})
actual.head()

In [None]:
# Put actual and predicted prices side by side, one row per test sample.
frames = [actual, pred_df]
reg_result = pd.concat(frames, axis='columns')

# Save without the index; 'utf-8-sig' writes a BOM at the start of the file.
reg_result.to_csv(
    'reg_result.csv',
    index=False,
    encoding='utf-8-sig',
)
reg_result.head()