## Lasso 클래스 사용하기

In [4]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.33, 
                                                    random_state=1234)

X_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
221,-0.045472,-0.044642,-0.03854,-0.026328,-0.015328,0.000878,-0.032356,-0.002592,0.001148,-0.038357
176,0.045341,0.05068,0.019662,0.039087,0.020446,0.02593,0.008142,-0.002592,-0.003301,0.019633
227,0.067136,0.05068,-0.029918,0.057437,-0.000193,-0.015719,0.074412,-0.050564,-0.03846,0.007207
434,0.016281,-0.044642,0.001339,0.008101,0.005311,0.010899,0.030232,-0.039493,-0.045424,0.032059
110,0.041708,0.05068,-0.032073,-0.022885,-0.049727,-0.040144,0.030232,-0.039493,-0.126097,0.015491


In [7]:
scaler = StandardScaler()
reg = Lasso(alpha=10, random_state=1234)

pipe = Pipeline(steps=[("scaler", scaler), ("reg", reg)])
pipe = pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_pred = pipe.predict(X_test)

print(f'학습 데이터셋 MAE:{np.abs(y_train_pred - y_train).mean(): .3f}')
print(f'테스트 데이터셋 MAE:{np.abs(y_pred - y_test).mean(): .3f}')

학습 데이터셋 MAE: 47.475
테스트 데이터셋 MAE: 46.139


In [8]:
print('추정 파라미터의 값:\n', reg.coef_)  #피처개수

추정 파라미터의 값:
 [ 0.         -0.         22.14741042  7.25438995 -0.         -0.
 -4.43633143  0.         18.8046418   0.        ]


In [8]:
print('추정 절편의 값:\n', reg.intercept_)

추정 절편의 값:
 150.9527027027027


## 스케일링과 LassoCV 파이프라인

In [32]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.33, 
                                                    random_state=1234)

In [36]:
lassoCV = LassoCV(alphas=[0.01, 0.1, 1, 10, 100], cv=10) 
scaler=StandardScaler()
pipeline = Pipeline(steps=[("scaler", scaler), ("reg", lassoCV)]).fit(X_train, y_train)
y_pred = reg.predict(X_test)

print('최적의 alpha:', lassoCV.alpha_)
print(f'계수: {lassoCV.coef_}')
print(f'MAE:{(np.abs(y_pred - y_test)).mean(): .3f}')

최적의 alpha: 1.0
계수: [ -0.         -11.51737805  23.99135346  14.40671787  -5.33712387
  -0.          -9.7359812    2.29157958  23.67450824   3.04281307]
MAE: 63.786




## 불필요한 피처가 많은 상황에서 선형 모델 비교하기

In [37]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso

X, y = make_regression(n_samples=300, 
                       n_features=400, 
                       n_informative=50, 
                       n_targets=1, 
                       bias=0.0, 
                       noise=10.0, 
                       random_state=1234)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.33, 
                                                    random_state=1234)

In [38]:
reg1 = LinearRegression()
reg1 = reg1.fit(X_train, y_train)
y_train_pred = reg1.predict(X_train)
print(f'학습 데이터셋 기준 OLS 모델의 MSE:{((y_train - y_train_pred)**2).mean(): .2f}')

y_test_pred = reg1.predict(X_test)
print(f'테스트 데이터셋 기준 OLS 모델의 MSE:{((y_test - y_test_pred)**2).mean(): .2f}')

학습 데이터셋 기준 OLS 모델의 MSE: 0.00
테스트 데이터셋 기준 OLS 모델의 MSE: 66401.28


In [39]:
reg2 = Lasso()
reg2 = reg2.fit(X_train, y_train)
y_train_pred = reg2.predict(X_train)
print(f'학습 데이터셋 기준 LASSO 모델의 MSE:{((y_train - y_train_pred)**2).mean(): .2f}')

y_test_pred = reg2.predict(X_test)
print(f'테스트 데이터셋 기준 LASSO 모델의 MSE:{((y_test - y_test_pred)**2).mean(): .2f}')

학습 데이터셋 기준 LASSO 모델의 MSE: 130.41
테스트 데이터셋 기준 LASSO 모델의 MSE: 457.40


In [40]:
reg3 = LassoCV()
reg3 = reg3.fit(X_train, y_train)
y_train_pred = reg3.predict(X_train)
print(f'학습 데이터셋 기준 LASSO 모델의 MSE:{((y_train - y_train_pred)**2).mean(): .2f}')

y_test_pred = reg3.predict(X_test)
print(f'테스트 데이터셋 기준 LASSO 모델의 MSE:{((y_test - y_test_pred)**2).mean(): .2f}')

학습 데이터셋 기준 LASSO 모델의 MSE: 11.23
테스트 데이터셋 기준 LASSO 모델의 MSE: 403.33


#### LassoCV 의 경우 alpha와 cv를 따로 적지 않으면 3번의 교차검증을 통해, 자동으로 세팅된 후보군 안에서 최적화된 alpha를 찾아낸다.

## Ridge 클래스 및 RidgeCV 활용하기

In [52]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import numpy as np

X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=1234)

In [53]:
alphas = [0.05, 0.1, 1]

for alpha in alphas:
    reg = Ridge(alpha=alpha)
    reg = reg.fit(X_train, y_train)

    y_pred_train = reg.predict(X_train)
    print(f'alpha 값이 {alpha}일 경우:')
    print(f'학습 데이터셋 MAE:{np.abs(y_pred_train - y_train).mean(): .3f}')
    y_pred = reg.predict(X_test)
    print(f'테스트 데이터셋 MAE:{np.abs(y_pred - y_test).mean(): .3f}\n')

alpha 값이 0.05일 경우:
학습 데이터셋 MAE: 43.699
테스트 데이터셋 MAE: 43.034

alpha 값이 0.1일 경우:
학습 데이터셋 MAE: 43.928
테스트 데이터셋 MAE: 43.366

alpha 값이 1일 경우:
학습 데이터셋 MAE: 48.966
테스트 데이터셋 MAE: 49.582



In [54]:
alphas = [0.05, 0.1, 1]

reg = RidgeCV(alphas=alphas)
reg = reg.fit(X_train, y_train)

y_pred_train = reg.predict(X_train)
print(f'학습 데이터셋 MAE:{np.abs(y_pred_train - y_train).mean(): .3f}')
y_pred = reg.predict(X_test)
print(f'테스트 데이터셋 MAE:{np.abs(y_pred - y_test).mean(): .3f}\n')

학습 데이터셋 MAE: 43.699
테스트 데이터셋 MAE: 43.034



### 비선형 문제에 선형회귀,Ridge, Lasso 적용하기

In [46]:
import numpy as np
import pandas as pd

n1, n2, n_train = 2500, 2500, 4000
m, v = 1, 0.1

np.random.seed(1234)
X1 = np.random.normal(m, v, size=n1)
X2 = np.random.normal(m, v, size=n1)
X3 = np.random.normal(m, v, size=n1)
X4 = np.random.normal(m, v, size=n1)
y = X1*200 + X2*30 + X3*100 + X4*500
df1 = pd.DataFrame({'y': y, 'X1': X1, 'X2': X2, 'X3': X3, 'X4': X4})
df1['X5'] = 0

X1 = np.random.normal(m, v, size=n2)
X2 = np.random.normal(m, v, size=n2)
X3 = np.random.normal(m, v, size=n2)
X4 = np.random.normal(m, v, size=n2)
y = -X1*500 - X2*300 - X3*200 - X4*100
df2 = pd.DataFrame({'y': y, 'X1': X1, 'X2': X2, 'X3': X3, 'X4': X4})
df2['X5'] = 1

df = pd.concat([df1, df2])
df = df.sample(frac=1, random_state=1234)
df

Unnamed: 0,y,X1,X2,X3,X4,X5
206,-1043.525008,0.854545,0.980386,1.084388,1.052588,1
2436,783.900028,0.742729,1.102032,0.776963,1.049194,0
1201,909.283251,0.958107,1.159922,1.071176,1.151493,0
1486,832.806098,1.066110,0.975990,0.939312,0.992746,0
1786,-1101.601106,0.940846,1.111162,0.959809,1.058676,1
...,...,...,...,...,...,...
664,842.658338,1.176910,1.054615,1.047532,0.941769,0
776,-1209.189849,1.125480,1.030696,1.174190,1.024031,1
1318,878.704510,1.140897,1.047285,0.954595,1.047294,0
723,877.670564,0.948883,1.220475,1.123088,1.077942,0


In [44]:
train, test = df.iloc[:n_train, :], df.iloc[n_train:, :]

X_train, y_train = train.iloc[:, 1:], train.iloc[:, 0]
X_test, y_test = test.iloc[:, 1:], test.iloc[:, 0]

In [50]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

reg1 = LinearRegression()
y_pred = reg1.fit(X_train, y_train).predict(X_test)
print(f'OLS 모델의 MAE:{(np.abs(y_pred - y_test)).mean(): .2f}')

reg2 = Lasso()
y_pred = reg2.fit(X_train, y_train).predict(X_test)
((y_pred - y_test)**2).mean()
print(f'LASSO 모델의 MAE:{(np.abs(y_pred - y_test)).mean(): .2f}')

reg3 = Ridge()
y_pred = reg3.fit(X_train, y_train).predict(X_test)
((y_pred - y_test)**2).mean()
print(f'릿지 회귀 모델의 MAE:{(np.abs(y_pred - y_test)).mean(): .2f}')

reg4 = LassoCV()
y_pred = reg4.fit(X_train, y_train).predict(X_test)
((y_pred - y_test)**2).mean()
print(f'LASSOCV 모델의 MAE:{(np.abs(y_pred - y_test)).mean(): .2f}')

reg5 = RidgeCV()
y_pred = reg5.fit(X_train, y_train).predict(X_test)
((y_pred - y_test)**2).mean()
print(f'릿지CV 모델의 MAE:{(np.abs(y_pred - y_test)).mean(): .2f}')

OLS 모델의 MAE: 42.62
LASSO 모델의 MAE: 51.79
릿지 회귀 모델의 MAE: 49.58
LASSOCV 모델의 MAE: 42.90
릿지CV 모델의 MAE: 43.37
