<a href="https://colab.research.google.com/github/s2ul2/bitamin/blob/main/7week.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Boston housing dataset.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so when
# it is unavailable the same arrays are rebuilt from the original CMU StatLib
# file, following the replacement recipe in scikit-learn's deprecation notice.
from sklearn import datasets
from sklearn.utils import Bunch

try:
    housing = datasets.load_boston()
except (ImportError, AttributeError):
    # Each record in the raw file is wrapped over two physical lines:
    # 11 values on the first line and 3 on the second (the last being the target).
    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep = r"\s+", skiprows = 22, header = None)
    housing = Bunch(
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target = raw_df.values[1::2, 2],
        feature_names = np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                  'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    )

In [None]:
# List the fields available on the loaded dataset object.
housing.keys()

In [None]:
# Convert the raw arrays into pandas DataFrames:
# one frame for the features, one single-column frame for the target.
data = pd.DataFrame(data = housing['data'], columns = housing['feature_names'])
target = pd.DataFrame({'Target': housing['target']})

# Dataset sizes (features first, then target)
for frame in (data, target):
    print(frame.shape)

In [None]:
# Combine the feature frame and the target frame side by side (column-wise).
df = pd.concat((data, target), axis = 'columns')
df.head(n = 3)

In [None]:
# Print the per-column dtype / non-null summary.
# `info` is a method: without the call parentheses the cell only echoes the
# bound method object instead of producing the summary.
df.info()

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Correlation analysis: pairwise correlation matrix drawn as an annotated heatmap.
df_corr = df.corr()

plt.figure(figsize = (10, 10))
# The theme / font scale is set after the figure exists but before its axes are created.
sns.set(font_scale = 0.8)
heatmap_options = dict(annot = True, cbar = False)
sns.heatmap(df_corr, ax = plt.gca(), **heatmap_options)
plt.show()

In [None]:
# Rank the features by the absolute value of their correlation with the target,
# strongest first. The target's own entry is dropped by name rather than by
# slicing up to 'LSTAT', so the result does not depend on which feature column
# happens to be last.
corr_order = df.corr()['Target'].drop('Target').abs().sort_values(ascending = False)
corr_order

In [None]:
# Pick the target plus the features to inspect visually.
plot_cols = ['Target', 'LSTAT', 'RM', 'PTRATIO', 'INDUS']
plot_df = df[plot_cols]
plot_df.head()

In [None]:
# Draw a 2x2 grid of scatter plots with a fitted regression line (regplot),
# one panel per selected feature against the target.
reg_target, *reg_features = plot_cols  # first entry is the target, the rest are features

plt.figure(figsize = (10, 10))
for idx, col in enumerate(reg_features):
    ax1 = plt.subplot(2, 2, idx + 1)
    sns.regplot(data = plot_df, x = col, y = reg_target, ax = ax1)
plt.show()

In [None]:
# Feature scaling: min-max scale every column except the last one (the target).
# NOTE(review): the scaler is fit on all rows here, before the train/test split
# performed in a later cell, so test rows contribute to the fitted min/max.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
feature_block = df.iloc[:, :-1]  # everything but the trailing target column
df_scaled = scaler.fit_transform(feature_block)

# Write the scaled values back into the feature columns of the DataFrame.
df.iloc[:, :-1] = df_scaled
df.head()



In [None]:
# Split into training and test data (80% / 20%, shuffled, fixed seed).
from sklearn.model_selection import train_test_split

x_data = df[['LSTAT', 'RM']]
y_data = df['Target']

split_options = dict(test_size = 0.2, shuffle = True, random_state = 12)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, **split_options)

# Report the shapes of the training pair, then the test pair.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; `fit` returns the estimator itself.
lr = LinearRegression().fit(x_train, y_train)

# Printed labels read "regression coefficients (slope)" and "constant term (intercept)".
slope, intercept = lr.coef_, lr.intercept_
print("회귀계수(기울기) : ", np.round(slope, 1))
print("상수항(절편) : ", np.round(intercept, 1))

In [None]:
# Predict on the held-out test features.
y_test_pred = lr.predict(x_test)

# Distribution of predicted vs. actual values, both plotted against LSTAT.
pred_fig, pred_ax = plt.subplots(figsize = (10, 5))
lstat_test = x_test['LSTAT']
pred_ax.scatter(lstat_test, y_test, label = 'y_test')
pred_ax.scatter(lstat_test, y_test_pred, c = 'r', label = 'y_pred')
pred_ax.legend(loc = 'best')
plt.show()

In [None]:
# Performance evaluation using mean squared error (MSE).
from sklearn.metrics import mean_squared_error

# Predictions on the training split, needed for the training score.
y_train_pred = lr.predict(x_train)

# Compute both scores first, then report them (train, then test).
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print('Train MSE : %.4f' % train_mse)
print('Test MSE : %.4f' % test_mse)