# 선형 회귀 모델을 위한 데이터 변환
- 선형 모델은 일반적으로 피처와 타깃값 간에 선형의 관계가 있다고 가정
- 정규 분포 형태 선호
- 무조건 예측 성능이 향상되는 것은 아니지만, 스케일링/정규화 해주는 것이 일반적

# Scaling
## StandardScaler, MinMaxScaler
## 다항 특성을 적용하여 변환
## log transformation
- 타겟값은 일반적으로 log변환

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

In [17]:
def get_scaled_data(method = 'None', p_degree = None, input_data = None) :
    
    if method == 'Standard' :
        scaled_data = StandardScaler().fit_transform(input_data)
    elif method == 'MinMax' :
        scaled_data = MinMaxScaler().fit_transform(input_data)
    elif method == 'Log' :
        scaled_data = np.log1p(input_data)
    else :
        scaled_data = input_data
        
    if p_degree :
        scaled_data = PolynomialFeatures(degree = p_degree, include_bias=False).fit_transform(scaled_data)
        
    return scaled_data

In [31]:
from sklearn.datasets import load_boston

boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns = boston.feature_names)
boston_df['price'] = boston.target

X_data = boston_df.drop('price', axis = 1)
y_target = boston_df.price

def get_linear_reg_eval(model_name, params = None, X_data_n = None, y_target_n = None, verbose = True) :
    coeff_df = pd.DataFrame()
    
    if verbose : print('#####', model_name,'#####')
        
    for param in params :
        if model_name == 'Ridge' : model = Ridge(alpha = param)
        elif model_name == 'Lasso' : model = Lasso(alpha = param)
        elif model_name == 'ElasticNet' : model = ElasticNet(alpha = param, l1_ratio = 0.7)
            
        neg_mse_scores = cross_val_score(model, X_data_n, y_target_n, scoring = 'neg_mean_squared_error', cv= 5)
        avg_rmse = np.mean(np.sqrt(-1*neg_mse_scores))
        
        print('alpha {0}일 때, 5fold average RMSE : {1:.3f}'.format(param, avg_rmse))
        
        model.fit(X_data_n, y_target_n)
        coeff = pd.Series(data = model.coef_, index = X_data.columns)
        colname = 'alpha:'+str(param)
        coeff_df[colname] = coeff
        
    return coeff_df

In [27]:
X_data.shape

(506, 13)

In [28]:
X_data_scaled.shape

(506, 104)

In [30]:
StandardScaler().fit_transform(X_data).shape

(506, 13)

In [34]:
from sklearn.linear_model import Ridge
alphas = [0.1,1,10,100]

scaled_methods = [(None, None), ('Standard', None), ('Standard',2),
                 ('MinMax', None), ('MinMax', 2), ('Log', None)]

for scaled_method in scaled_methods :
    X_data_scaled = get_scaled_data(method = scaled_method[0], p_degree=scaled_method[1],
                                   input_data=X_data)
    print('\n ##변환 유형 : {0}, Polynomial Degree:{1}'.format(scaled_method[0], scaled_method[1]))
    
    for alpha in alphas :
        ridge = Ridge(alpha=alpha)
        neg_mse_scores = cross_val_score(ridge, X_data_scaled, y_target, scoring = 'neg_mean_squared_error', cv= 5)
        avg_rmse = np.mean(np.sqrt(-1*neg_mse_scores))
        
        print('alpha {0}일 때, 5fold average RMSE : {1:.3f}'.format(alpha, avg_rmse))


 ##변환 유형 : None, Polynomial Degree:None
alpha 0.1일 때, 5fold average RMSE : 5.788
alpha 1일 때, 5fold average RMSE : 5.653
alpha 10일 때, 5fold average RMSE : 5.518
alpha 100일 때, 5fold average RMSE : 5.330

 ##변환 유형 : Standard, Polynomial Degree:None
alpha 0.1일 때, 5fold average RMSE : 5.826
alpha 1일 때, 5fold average RMSE : 5.803
alpha 10일 때, 5fold average RMSE : 5.637
alpha 100일 때, 5fold average RMSE : 5.421

 ##변환 유형 : Standard, Polynomial Degree:2
alpha 0.1일 때, 5fold average RMSE : 8.827
alpha 1일 때, 5fold average RMSE : 6.871
alpha 10일 때, 5fold average RMSE : 5.485
alpha 100일 때, 5fold average RMSE : 4.634

 ##변환 유형 : MinMax, Polynomial Degree:None
alpha 0.1일 때, 5fold average RMSE : 5.764
alpha 1일 때, 5fold average RMSE : 5.465
alpha 10일 때, 5fold average RMSE : 5.754
alpha 100일 때, 5fold average RMSE : 7.635

 ##변환 유형 : MinMax, Polynomial Degree:2
alpha 0.1일 때, 5fold average RMSE : 5.298
alpha 1일 때, 5fold average RMSE : 4.323
alpha 10일 때, 5fold average RMSE : 5.185
alpha 100일 때, 5fold avera

# logistic regression

##### penalty : l2 or l1 regulariztion
##### C : 1/alpha

In [36]:
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer()

In [40]:
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

X_train, X_test, y_train, y_test = train_test_split(data_scaled, cancer.target,
                                                  test_size = 0.3, random_state = 0)

In [45]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score, roc_auc_score

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

print('accuracy : {0:.3f}'.format(accuracy_score(y_test, lr_pred)))
print('roc_auc : {0:.3f}'.format(roc_auc_score(y_test, lr_pred)))

accuracy : 0.982
roc_auc : 0.979


In [46]:
from sklearn.model_selection import GridSearchCV

params = {'penalty' : ['l2','l1'],
         'C' : [0.01,0.1,1,1,5,10]}

grid_cv = GridSearchCV(lr, param_grid=params, scoring='accuracy', cv = 3)
grid_cv.fit(data_scaled, cancer.target)

print('best hyperparameter : {0}, best accuracy : {1:.3f}'.format(grid_cv.best_params_, grid_cv.best_score_))

best hyperparameter : {'C': 0.1, 'penalty': 'l2'}, best accuracy : 0.979


In [49]:
#default lr
lr.fit(X_train ,y_train)
#hyperparameter tuning lr
lr_best = LogisticRegression(C=0.1, penalty='l2')
lr_best.fit(X_train, y_train)

print('default logistic regression accuracy : {0:.3f}'.format( accuracy_score(lr.predict(X_test), y_test) ))
print('tuning logistic regression accuracy : {0:.3f}'.format( accuracy_score(lr_best.predict(X_test), y_test) )) #???

default logistic regression accuracy : 0.982
tuning logistic regression accuracy : 0.971
