# 보스턴 집값 예측 모델 
- 데이터셋 : boston.csv
- 학습방법 : 지도학습 -> 회귀
- 피쳐/독립 : 13개 
- 타겟/종속 : 1개 

In [1]:
# 모듈 로딩 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

In [2]:
# 데이터
DATA_FILE = '../DATA/boston.csv'

In [3]:
data_df = pd.read_csv(DATA_FILE)
data_df.head(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6


In [4]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


## [2] 전처리
### [2-1] 데이터 정제

In [5]:
# 결측치, 중복값, 이상치, 컬럼별 고유값 추출로 이상 데이터 체크

## [2-2] 표준화 & 정규화 
-> 진행 여부에 따라 성능의 변화는 경우에 따라 다름
- 정규분포 데이터셋을 기반으로 한 모델 -> StandardScaler, MinMaxScaler, Log 변환 
- 피쳐의 값의 범위 차이를 줄이기 -> 피쳐 스케일링, MinMaxScaler, RobustScaler....
- 범주형 피쳐 -> 수치화 인코딩 : OneHotEncoder, OrdinalEncoder
- 문자열 타겟 -> 정수 라벨 인코딩 : LabelEncoder

## [2-3] 피쳐와 타겟 분리

In [6]:
feature_df = data_df.iloc[:,:-1]
target_sr = data_df['MEDV']

In [7]:
print(f'feature_df : {feature_df.shape} target_sr : {target_sr.shape}')

feature_df : (506, 13) target_sr : (506,)


## [3] 학습 준비
### [3-1] 학습용 데이터셋과 테스트용 데이터셋 분리

In [8]:
x_train, x_test, y_train, y_test = train_test_split(feature_df, target_sr, random_state=10)

In [9]:
print(f'x_train : {x_train.shape}, y_train : {y_train.shape}')
print(f'x_test : {x_test.shape}, y_test : {y_test.shape}')

x_train : (379, 13), y_train : (379,)
x_test : (127, 13), y_test : (127,)


## [3-2] 학습용 데이터셋으로 스케일러 생성 

In [10]:
### - 수치 피쳐 값의 범위 차가 큼 -> scaling 진행 
ss_scaler = StandardScaler()
ss_scaler.fit(x_train)

In [11]:
x_train_scaled = ss_scaler.transform(x_train)
x_test_scaled = ss_scaler.transform(x_test)

## [4] 학습 진행 -> 교차검증으로 진행

### Ridge

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge

In [13]:
# 모델 인스턴스 생성 
ridge_model = Ridge(alpha=1.0) # 기본값 1.0

In [14]:
# 학습 진행 
# - cv : 3개
# - scoring : 'mean_squared_error','r2'ArithmeticError
# - return_train_score

result = cross_validate(ridge_model, x_train_scaled, y_train, cv=3, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
# neg : 
result

{'fit_time': array([0.        , 0.00074434, 0.0004921 ]),
 'score_time': array([0.0092628, 0.       , 0.       ]),
 'test_neg_mean_squared_error': array([-17.32029712, -22.58256569, -22.65758521]),
 'train_neg_mean_squared_error': array([-20.14363572, -18.21077226, -17.29366176]),
 'test_r2': array([0.74828253, 0.75629228, 0.68099104]),
 'train_r2': array([0.75566257, 0.74003888, 0.78609709])}

In [15]:
### 모델 성능을 좌우하는 Hyper-parameter 제어 즉, 튜닝
alpha_values = [0.,1.0,10,100]

for value in alpha_values:
    ridge_model = Ridge(alpha=value)
    # 모델 인스턴스 생성
    ridge_model = Ridge(alpha=1.0) # 기본값 1.0

    # 학습 진행
    # - cv : 3개
    # - scoring : 'mean_squared_error', 'r2'
    # return_train_score
    result = cross_validate(ridge_model, x_train_scaled, y_train, cv = 3, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
    
    result_df = pd.DataFrame(result)
    print(f'[Ridge(alpha={value})]')
    print(result_df, end='\n\n')

[Ridge(alpha=0.0)]
   fit_time  score_time  test_neg_mean_squared_error  \
0  0.000996    0.000998                   -17.320297   
1  0.000998    0.000997                   -22.582566   
2  0.001081    0.000921                   -22.657585   

   train_neg_mean_squared_error   test_r2  train_r2  
0                    -20.143636  0.748283  0.755663  
1                    -18.210772  0.756292  0.740039  
2                    -17.293662  0.680991  0.786097  

[Ridge(alpha=1.0)]
   fit_time  score_time  test_neg_mean_squared_error  \
0  0.000777         0.0                   -17.320297   
1  0.000000         0.0                   -22.582566   
2  0.000000         0.0                   -22.657585   

   train_neg_mean_squared_error   test_r2  train_r2  
0                    -20.143636  0.748283  0.755663  
1                    -18.210772  0.756292  0.740039  
2                    -17.293662  0.680991  0.786097  

[Ridge(alpha=10)]
   fit_time  score_time  test_neg_mean_squared_error  \
0  0

In [16]:
### 모델 성능을 좌우하는 Hyper-parameter 제어 즉, 튜닝
alpha_values = [0.,1.0,10,100]

for value in alpha_values:
    ridge_model = Ridge(alpha=value)
# 모델 인스턴스 생성
    ridge_model = Ridge(alpha=1.0) # 기본값 1.0

# 학습 진행
# - cv : 3개
# - scoring : 'mean_squared_error', 'r2'
# return_train_score
    result = cross_validate(ridge_model, x_train_scaled, y_train, cv = 3, scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True, return_estimator= True)
    
    result_df = pd.DataFrame(result)[['test_r2','train_r2']]
    print(f'[Ridge(alpha={value})]')
    print(result_df, end='\n\n')

[Ridge(alpha=0.0)]
    test_r2  train_r2
0  0.748283  0.755663
1  0.756292  0.740039
2  0.680991  0.786097

[Ridge(alpha=1.0)]
    test_r2  train_r2
0  0.748283  0.755663
1  0.756292  0.740039
2  0.680991  0.786097

[Ridge(alpha=10)]
    test_r2  train_r2
0  0.748283  0.755663
1  0.756292  0.740039
2  0.680991  0.786097

[Ridge(alpha=100)]
    test_r2  train_r2
0  0.748283  0.755663
1  0.756292  0.740039
2  0.680991  0.786097



In [17]:
# 둘의 차이가 작은 것이 좋으므로 차이 구함 

alpha_values = [0.,1.0,10,100]

for value in alpha_values:
    ridge_model = Ridge(alpha=value)
    ridge_model = Ridge(alpha=1.0) # 기본값 1.0

    result = cross_validate(ridge_model, x_train_scaled, y_train, cv = 3, scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True, return_estimator= True)
    
    result_df = pd.DataFrame(result)[['test_r2','train_r2']]

    result_df['diff'] = abs(result_df['test_r2']- result_df['train_r2']) # 차이만 보면 되므로 절대값 사용 
    best_idx = result_df['diff'].sort_values()[0] # 정렬했으므로 가장 작은 값 출력 - 정렬했으므로 0번이 가장 작음 
    print(best_idx)
    print(f'[Ridge(alpha={value})]')
    print(result_df, end='\n\n')

0.007380040333377247
[Ridge(alpha=0.0)]
    test_r2  train_r2      diff
0  0.748283  0.755663  0.007380
1  0.756292  0.740039  0.016253
2  0.680991  0.786097  0.105106

0.007380040333377247
[Ridge(alpha=1.0)]
    test_r2  train_r2      diff
0  0.748283  0.755663  0.007380
1  0.756292  0.740039  0.016253
2  0.680991  0.786097  0.105106

0.007380040333377247
[Ridge(alpha=10)]
    test_r2  train_r2      diff
0  0.748283  0.755663  0.007380
1  0.756292  0.740039  0.016253
2  0.680991  0.786097  0.105106

0.007380040333377247
[Ridge(alpha=100)]
    test_r2  train_r2      diff
0  0.748283  0.755663  0.007380
1  0.756292  0.740039  0.016253
2  0.680991  0.786097  0.105106



In [26]:
# 둘의 차이가 작은 것이 좋으므로 차이 구함 

alpha_values = [0.,1.0,10,100]

for value in alpha_values:
    ridge_model = Ridge(alpha=value)
    ridge_model = Ridge(alpha=1.0) # 기본값 1.0

    result = cross_validate(ridge_model, x_train_scaled, y_train, cv = 3, scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True, return_estimator= True)
    
    result_df = pd.DataFrame(result)[['test_r2','train_r2']]

    result_df['diff'] = abs(result_df['test_r2']- result_df['train_r2']) # 차이만 보면 되므로 절대값 사용 

    print(result['estimator'][0].coef_) # 그냥 확인용

    # print(f'[Ridge(alpha={value})]')
    # print(result_df, end='\n\n')

[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]


In [24]:
# 둘의 차이가 작은 것이 좋으므로 차이 구함 

alpha_values = [0.,1.0,10,100]

for value in alpha_values:
    ridge_model = Ridge(alpha=value)

    result = cross_validate(ridge_model, x_train_scaled, y_train, cv = 3, scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True, return_estimator= True)
    
    result_df = pd.DataFrame(result)[['test_r2','train_r2']]

    result_df['diff'] = abs(result_df['test_r2']- result_df['train_r2']) # 차이만 보면 되므로 절대값 사용 

    print(result['estimator'][0].coef_)

    # print(f'[Ridge(alpha={value})]')
    # print(result_df, end='\n\n')

[-1.41407793  1.56590993  0.15536906  0.65522098 -2.36200159  2.31948624
  0.1173831  -3.59071105  2.71475429 -2.33252925 -1.88390034  1.04036915
 -3.50250877]
[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
[-1.23221033  1.29302258 -0.12737786  0.70280521 -1.80949922  2.48028701
 -0.00860666 -2.99831755  1.75466332 -1.51704375 -1.73434856  1.00368486
 -3.30809117]
[-0.78141029  0.70910255 -0.46407849  0.72503917 -0.69294458  2.41757287
 -0.24148703 -1.21831206  0.28616643 -0.63423538 -1.31602563  0.78528977
 -2.39571659]


f1*w1 + f2+w2 + ........ + fnwn + b 로 식이 만들어질 때 alpha를 통해 가중치를 조절하여 나온 계수값들이 위의 출력값 (?)


#### Lasso

In [19]:
from sklearn.linear_model import Lasso

In [23]:
# 둘의 차이가 작은 것이 좋으므로 차이 구함 

alpha_values = [0.,1.0,10,100]

for value in alpha_values:
    ridge_model = Lasso(alpha=value)
    # ridge_model = Lasso(alpha=1.0) # 기본값 1.0

    result = cross_validate(ridge_model, x_train_scaled, y_train, cv = 3, scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True, return_estimator= True)
    
    result_df = pd.DataFrame(result)[['test_r2','train_r2']]

    result_df['diff'] = abs(result_df['test_r2']- result_df['train_r2']) # 차이만 보면 되므로 절대값 사용 

    print(result['estimator'][0].coef_)

    # print(f'[Ridge(alpha={value})]')
    # print(result_df, end='\n\n')

[-1.41407793  1.56590993  0.15536906  0.65522098 -2.36200159  2.31948624
  0.1173831  -3.59071105  2.71475429 -2.33252925 -1.88390034  1.04036915
 -3.50250877]
[-0.18119516  0.         -0.          0.         -0.          2.6706524
 -0.         -0.         -0.         -0.1542158  -1.17708874  0.36943757
 -3.33718723]
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [21]:
# 둘의 차이가 작은 것이 좋으므로 차이 구함 

alpha_values = [0.,1.0,10,100]

for value in alpha_values:
    ridge_model = Lasso(alpha=value, max_iter=3)

    result = cross_validate(ridge_model, x_train_scaled, y_train, cv = 3, scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True, return_estimator= True)
    
    result_df = pd.DataFrame(result)[['test_r2','train_r2']]

    result_df['diff'] = abs(result_df['test_r2']- result_df['train_r2']) # 차이만 보면 되므로 절대값 사용 

    print(result['estimator'][0].coef_)

    # print(f'[Ridge(alpha={value})]')
    # print(result_df, end='\n\n')

[-0.76918209  1.30798802 -1.3660128   0.70871821 -1.12810945  3.13078874
  0.20140226 -3.18951128  0.40006951 -1.02796444 -1.33246342  1.05170534
 -2.85931196]
[-0.12685525  0.         -0.68948499  0.         -0.35867851  3.50097227
 -0.         -0.         -0.02775436 -0.34045443 -1.07046702  0.47097032
 -2.11146537]
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


-> 데이터가 부족

### Hyper-Parameter 튜닝과 교차 검증을 동시에 진행

In [27]:
from sklearn.model_selection import GridSearchCV

In [30]:
# Ridge의 Hyper-parameter 값 설정 
params = {'alpha' : [0,0.1,0.5,1.0], 'max_iter' : [3,5]}

# -> 0., 3 => model
# -> 0., 5 => model 
# --> 총 조합하면 8개 Ridge 모델 생성 

In [32]:
# 인스턴스 생성 
r_model = Ridge()

search_cv = GridSearchCV(r_model, params, cv = 3, verbose= True, return_train_score=True) # default cv : 5

# verbose = True : 진행상황 

In [33]:
# 학습 진행 
search_cv.fit(x_train_scaled, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


- Fitting 3 folds for each of 8 candidates : 8개의 후보 모델 존재

In [34]:
# fit() 진행 후 모델 파라미터 확인 
search_cv.best_params_

{'alpha': 1.0, 'max_iter': 3}

In [35]:
search_cv.best_estimator_

In [36]:
search_cv.best_index_

6

In [37]:
search_cv.cv_results_

{'mean_fit_time': array([0.00130113, 0.00096281, 0.00135573, 0.00066257, 0.00099723,
        0.        , 0.00353797, 0.00066312]),
 'std_fit_time': array([5.75806959e-04, 7.81058303e-04, 4.52098223e-04, 4.68523063e-04,
        1.23630756e-06, 0.00000000e+00, 4.31193712e-03, 4.68899719e-04]),
 'mean_score_time': array([0.00042311, 0.00127999, 0.00066336, 0.00090289, 0.00013622,
        0.        , 0.00066463, 0.00033228]),
 'std_score_time': array([0.00030149, 0.00116807, 0.00046907, 0.00013794, 0.00019264,
        0.        , 0.00046999, 0.00046991]),
 'param_alpha': masked_array(data=[0, 0, 0.1, 0.1, 0.5, 0.5, 1.0, 1.0],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(data=[3, 5, 3, 5, 3, 5, 3, 5],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0, 'max_iter': 3},
  {'alph

In [38]:
# 위의 내용이 dict 라서 dict 에 넣음 

result_df = pd.DataFrame(search_cv.cv_results_)
result_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.001301,0.000576,0.000423,0.000301,0.0,3,"{'alpha': 0, 'max_iter': 3}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
1,0.000963,0.000781,0.00128,0.001168,0.0,5,"{'alpha': 0, 'max_iter': 5}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
2,0.001356,0.000452,0.000663,0.000469,0.1,3,"{'alpha': 0.1, 'max_iter': 3}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
3,0.000663,0.000469,0.000903,0.000138,0.1,5,"{'alpha': 0.1, 'max_iter': 5}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
4,0.000997,1e-06,0.000136,0.000193,0.5,3,"{'alpha': 0.5, 'max_iter': 3}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
5,0.0,0.0,0.0,0.0,0.5,5,"{'alpha': 0.5, 'max_iter': 5}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
6,0.003538,0.004312,0.000665,0.00047,1.0,3,"{'alpha': 1.0, 'max_iter': 3}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
7,0.000663,0.000469,0.000332,0.00047,1.0,5,"{'alpha': 1.0, 'max_iter': 5}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
