# 실습 - 모델선택(Model Selection)



# **1. 필요한 라이브러리 불러오기**

In [1]:
# 데이터 라이브러리
import pandas as pd

# 선형회귀, ridge regression, lasso 관련 scikit-learn 라이브러리
from sklearn import linear_model
from sklearn.metrics import mean_squared_error 

# K-fold cross validation 관련 라이브러리
from sklearn.model_selection import KFold

# **2. Ridge regression 간단한 예제**

In [2]:
# Build a ridge regressor (penalty strength alpha=0.1) and fit it on a tiny
# three-sample training set in one chained call; fit() returns the estimator
# itself, so `reg` is the fitted model.
reg = linear_model.Ridge(alpha=0.1).fit(
    [[0, 0], [0, 0], [1, 1]],
    [0, .1, 1],
)

# Predict the response for a single test sample.
pred_test = reg.predict([[0, 1]])

In [None]:
# Display the predicted value for the test data.
pred_test

In [None]:
# Display the coefficients of the fitted model.
reg.coef_

# **3. Lasso 간단한 예제**

In [5]:
# Build a lasso regressor (penalty strength alpha=0.1) and fit it on a tiny
# three-sample training set in one chained call; fit() returns the estimator
# itself, so `reg` is the fitted model.
reg = linear_model.Lasso(alpha=0.1).fit(
    [[0, 0], [0, 0], [1, 1]],
    [0, .1, 1],
)

# Predict the response for a single test sample.
pred_test = reg.predict([[0, 1]])

In [None]:
# Display the predicted value for the test data.
pred_test

In [None]:
# Display the coefficients of the fitted model.
reg.coef_

# **4. Advertising 데이터 불러오기**



In [None]:
# Mount Google Drive inside the Colab runtime so the CSV below is reachable.
from google.colab import drive

drive.mount('/content/drive')

# Read the Advertising data set; the first CSV column becomes the row index.
ad = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/data/Advertising.csv",
    index_col=0,
)

In [None]:
# Display the `ad` DataFrame.
ad

# **5. Advertising data Lasso 예제(최적의 lambda 선정)**

In [None]:
# Compare Lasso penalty strengths (lambda, called `alpha` in scikit-learn) by
# k-fold cross-validation: for each candidate value, fit on the training
# folds, score on the held-out fold, and report the average validation MSE.

# Number of folds for k-fold CV.
n_fold = 5

# Feature / response column names, defined once instead of being repeated at
# every indexing site.
feature_cols = ["TV", "Radio", "Newspaper"]
target_cols = ["Sales"]

# The splitter does not depend on the penalty, so it is built once.  With the
# default shuffle=False the folds are the same for every candidate lambda.
kf = KFold(n_splits=n_fold)

for t_param in (0.001, 1, 1000):
    print("Lambda: ", t_param)

    # Running total of the per-fold validation MSE for this lambda.
    sum_val_mse = 0
    for idx, (train, val) in enumerate(kf.split(ad), start=1):
        print("Fold: #", idx)

        # `train` / `val` are positional row indices; slice each subset once.
        train_df = ad.iloc[train]
        val_df = ad.iloc[val]

        # Split the training set into features and response.
        train_X = train_df[feature_cols]
        train_y = train_df[target_cols]

        # Split the validation set into features and response.
        val_X = val_df[feature_cols]
        val_y = val_df[target_cols]

        # Fit a fresh Lasso model on the training folds only.
        regr = linear_model.Lasso(alpha=t_param)
        regr.fit(train_X, train_y)

        # Show the fitted coefficients for this fold.
        print("Coefficients: \n", regr.coef_)

        # Predict on the held-out fold and score it.
        val_y_pred = regr.predict(val_X)
        val_mse = mean_squared_error(val_y, val_y_pred)

        # Accumulate the validation MSE across folds.
        sum_val_mse += val_mse

        print("------------------------------")
    print("Average Validation MSE: %.3f" % (sum_val_mse / n_fold))
    print("******************************")