In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn')
sns.set(font_scale=1.5)

import warnings
warnings.filterwarnings('ignore')


import missingno as msno

from sklearn.datasets import load_boston


In [7]:
boston = load_boston()

boston_df = pd.DataFrame(boston.data, columns = boston.feature_names)

boston_df['PRICE'] = boston.target
print('Boston 데이터 세트 크기:', boston_df.shape)
boston_df.head()

y_target = boston_df['PRICE']
X_features = boston_df.drop('PRICE', axis=1, inplace=False)

Boston 데이터 세트 크기: (506, 14)


In [78]:
from sklearn.linear_model import Lasso, ElasticNet, Ridge
from sklearn.model_selection import cross_val_score

In [63]:
def get_linear_reg_eval(model_name, params=None, X_data_n=None, y_target_n=None, verbose=True):
    coeff_df = pd.DataFrame()
    if verbose : print('#####', model_name, '#####')
    for param in params:
        if model_name == 'Ridge':
            model = Ridge(alpha=param)
        elif model_name == 'Lasso' :
            model = Lasso(alpha=param)
        elif model_name == 'ElasticNet' :
            model = ElasticNet(alpha=param, l1_ratio=0.7)
        neg_mse_scores = cross_val_score(model, X_features, y_target, scoring='neg_mean_squared_error', cv= 5)
        avg_rmse = np.mean(np.sqrt(-1 * neg_mse_scores))
        print('alpha {}일 때 5폴드 세트의 평균 RMSE: {:.3f}'.format(param, avg_rmse))
        model.fit(X_features, y_target)
            
        coeff = pd.Series(data=model.coef_, index=X_features.columns)
        colname='alpha:'+str(param)
        coeff_df[colname] = coeff
    return coeff_df

In [64]:
lasso_alphas = [0.07, 0.1, 0.5, 1, 3]
coeff_lasso_df = get_linear_reg_eval('Lasso', params=lasso_alphas, X_data_n=X_features,
                                     y_target_n=y_target)

##### Lasso #####
alpha 0.07일 때 5폴드 세트의 평균 RMSE: 5.612
alpha 0.1일 때 5폴드 세트의 평균 RMSE: 5.615
alpha 0.5일 때 5폴드 세트의 평균 RMSE: 5.669
alpha 1일 때 5폴드 세트의 평균 RMSE: 5.776
alpha 3일 때 5폴드 세트의 평균 RMSE: 6.189


In [65]:
coeff_lasso_df

Unnamed: 0,alpha:0.07,alpha:0.1,alpha:0.5,alpha:1,alpha:3
CRIM,-0.098193,-0.097894,-0.083289,-0.063437,-0.0
ZN,0.049059,0.049211,0.049544,0.049165,0.037231
INDUS,-0.04212,-0.036619,-0.005253,-0.0,-0.0
CHAS,1.434343,0.95519,0.0,0.0,0.0
NOX,-0.0,-0.0,-0.0,-0.0,0.0
RM,3.789725,3.703202,2.498212,0.949811,0.0
AGE,-0.011706,-0.010037,0.003604,0.02091,0.042495
DIS,-1.176583,-1.160538,-0.936605,-0.66879,-0.0
RAD,0.270936,0.274707,0.277451,0.264206,0.061864
TAX,-0.01429,-0.01457,-0.015442,-0.015212,-0.008602


In [66]:
sort_column = 'alpha:'+str(lasso_alphas[0])
coeff_lasso_df.sort_values(by=sort_column, ascending=False)

Unnamed: 0,alpha:0.07,alpha:0.1,alpha:0.5,alpha:1,alpha:3
RM,3.789725,3.703202,2.498212,0.949811,0.0
CHAS,1.434343,0.95519,0.0,0.0,0.0
RAD,0.270936,0.274707,0.277451,0.264206,0.061864
ZN,0.049059,0.049211,0.049544,0.049165,0.037231
B,0.010248,0.010249,0.009469,0.008247,0.00651
NOX,-0.0,-0.0,-0.0,-0.0,0.0
AGE,-0.011706,-0.010037,0.003604,0.02091,0.042495
TAX,-0.01429,-0.01457,-0.015442,-0.015212,-0.008602
INDUS,-0.04212,-0.036619,-0.005253,-0.0,-0.0
CRIM,-0.098193,-0.097894,-0.083289,-0.063437,-0.0


In [67]:
lasso_alphas[0]

0.07

엘라스틱 넥

In [70]:
elastic_alphas = [ 0.07, 0.1, 0.5, 1, 3]
coeff_elastic_df = get_linear_reg_eval('ElasticNet', params=elastic_alphas, X_data_n=X_features, y_target_n=y_target)

##### ElasticNet #####
alpha 0.07일 때 5폴드 세트의 평균 RMSE: 5.542
alpha 0.1일 때 5폴드 세트의 평균 RMSE: 5.526
alpha 0.5일 때 5폴드 세트의 평균 RMSE: 5.467
alpha 1일 때 5폴드 세트의 평균 RMSE: 5.597
alpha 3일 때 5폴드 세트의 평균 RMSE: 6.068


In [72]:
sort_column = 'alpha:'+str(elastic_alphas[0])
coeff_elastic_df.sort_values(by=sort_column, ascending=False)

Unnamed: 0,alpha:0.07,alpha:0.1,alpha:0.5,alpha:1,alpha:3
RM,3.574162,3.414154,1.918419,0.938789,0.0
CHAS,1.330724,0.979706,0.0,0.0,0.0
RAD,0.27888,0.283443,0.300761,0.289299,0.146846
ZN,0.050107,0.050617,0.052878,0.052136,0.038268
B,0.010122,0.010067,0.009114,0.00832,0.00702
AGE,-0.010116,-0.008276,0.00776,0.020348,0.043446
TAX,-0.014522,-0.014814,-0.016046,-0.016218,-0.011417
INDUS,-0.044855,-0.042719,-0.023252,-0.0,-0.0
CRIM,-0.099468,-0.099213,-0.08907,-0.073577,-0.019058
NOX,-0.175072,-0.0,-0.0,-0.0,-0.0


In [95]:
def get_scaled_data(method='None', p_degree=None, input_data=None):
    if method == 'Standard':
        scaled_data = StandardScaler().fit_transform(input_data)
    elif method == 'MinMa':
        scaled_data = MinMaxScaler().fit_transform(input_data)
    elif method == 'Log':
        scaled_data = np.log1p(input_data)
    else:
        scaled_data = input_data
        
    if p_degree != None:
        scaled_data = PolynomialFeatures(degree=p_degree, include_bias=False).fit_transform(scaled_data)
        
    return scaled_data

In [96]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures

In [97]:
alphas = [0.1, 1, 10, 100]

scale_methods=[(None, None), ('Standeard', None), ('Standard', 2), ('MinMax', None), ('MinMax', 2), ('Log', None)]

for scale_method in scale_methods:
    X_features_scaled = get_scaled_data(method=scale_method[0], p_degree=scale_method[1], input_data=X_features)
    print('\n## 변환 유형:{0}, Polynomial Degee:{1}'.format(scale_method[0], scale_method[1]))
    get_linear_reg_eval('Ridge', params=alphas, X_data_n=X_features_scaled, y_target_n=y_target, verbose=False)


## 변환 유형:None, Polynomial Degee:None
alpha 0.1일 때 5폴드 세트의 평균 RMSE: 5.788
alpha 1일 때 5폴드 세트의 평균 RMSE: 5.653
alpha 10일 때 5폴드 세트의 평균 RMSE: 5.518
alpha 100일 때 5폴드 세트의 평균 RMSE: 5.330

## 변환 유형:Standeard, Polynomial Degee:None
alpha 0.1일 때 5폴드 세트의 평균 RMSE: 5.788
alpha 1일 때 5폴드 세트의 평균 RMSE: 5.653
alpha 10일 때 5폴드 세트의 평균 RMSE: 5.518
alpha 100일 때 5폴드 세트의 평균 RMSE: 5.330

## 변환 유형:Standard, Polynomial Degee:2
alpha 0.1일 때 5폴드 세트의 평균 RMSE: 5.788
alpha 1일 때 5폴드 세트의 평균 RMSE: 5.653
alpha 10일 때 5폴드 세트의 평균 RMSE: 5.518
alpha 100일 때 5폴드 세트의 평균 RMSE: 5.330

## 변환 유형:MinMax, Polynomial Degee:None
alpha 0.1일 때 5폴드 세트의 평균 RMSE: 5.788
alpha 1일 때 5폴드 세트의 평균 RMSE: 5.653
alpha 10일 때 5폴드 세트의 평균 RMSE: 5.518
alpha 100일 때 5폴드 세트의 평균 RMSE: 5.330

## 변환 유형:MinMax, Polynomial Degee:2
alpha 0.1일 때 5폴드 세트의 평균 RMSE: 5.788
alpha 1일 때 5폴드 세트의 평균 RMSE: 5.653
alpha 10일 때 5폴드 세트의 평균 RMSE: 5.518
alpha 100일 때 5폴드 세트의 평균 RMSE: 5.330

## 변환 유형:Log, Polynomial Degee:None
alpha 0.1일 때 5폴드 세트의 평균 RMSE: 5.788
alpha 1일 때 5폴드 세트의 평균 RMS