## 데이터 불러오기

In [1]:
import pandas as pd
df=pd.read_csv('https://raw.githubusercontent.com/ralbu85/DataScience_2022S/master/data/auto.csv')

In [2]:
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
0,8,307.0,130.0,3504.0,12.0,70,1,18.0
1,8,350.0,165.0,3693.0,11.5,70,1,15.0
2,8,318.0,150.0,3436.0,11.0,70,1,18.0
3,8,304.0,150.0,3433.0,12.0,70,1,16.0
4,8,302.0,140.0,3449.0,10.5,70,1,17.0
...,...,...,...,...,...,...,...,...
387,4,140.0,86.0,2790.0,15.6,82,1,27.0
388,4,97.0,52.0,2130.0,24.6,82,2,44.0
389,4,135.0,84.0,2295.0,11.6,82,1,32.0
390,4,120.0,79.0,2625.0,18.6,82,1,28.0


## 데이터 나누기

In [3]:
df=pd.get_dummies(df,columns=['origin'])
y=df['mpg']
X=df.drop(columns=['mpg'])

## 교차검증

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate

reg= LinearRegression()
kfold = KFold(5,shuffle=True)
result=cross_validate(estimator=reg, X=X, y=y, cv=kfold, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
result

{'fit_time': array([0.0028553 , 0.00196171, 0.00190687, 0.00192261, 0.00190735]),
 'score_time': array([0.00177574, 0.00157762, 0.00164819, 0.00159502, 0.00152898]),
 'test_neg_mean_squared_error': array([-13.88489386, -10.31628049, -11.17414797,  -8.97219426,
        -12.18829307]),
 'train_neg_mean_squared_error': array([ -9.93622668, -10.94725059, -10.62272588, -11.11879969,
        -10.45256934]),
 'test_r2': array([0.76432928, 0.77496584, 0.82889072, 0.86687802, 0.81284998]),
 'train_r2': array([0.83731874, 0.83020708, 0.82151804, 0.81174759, 0.82463489])}

## 하이퍼패러미터 변경(차수 K)

In [44]:
kfold = KFold(5,shuffle=True)
for i in range(1,5):
    transformer = PolynomialFeatures(degree=i, interaction_only=False)
    X_=transformer.fit_transform(X) #데이터에 차수 추가
    reg= LinearRegression()    
    result=cross_validate(estimator=reg, X=X_, y=y, cv=kfold, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
    print(i,'train',result['train_neg_mean_squared_error'].mean(), result['train_r2'].mean())
    print(i,'test',result['test_neg_mean_squared_error'].mean(), result['test_r2'].mean())

1 train -10.603577243981722 0.8254812886644662
1 test -11.411635706286182 0.81198093672582
2 train -12.495316646462372 0.7947941035264618
2 test -25.383526960444993 0.5762603237277467
3 train -3.3867599948503164 0.944214356799338
3 test -18262.579493294812 -394.33279205024206
4 train -0.23191616238540158 0.9961889558558588
4 test -116717.17103776161 -1882.7260784282275


## Regularization

In [48]:
from sklearn.linear_model import Ridge, Lasso
reg = Lasso(alpha=0.01)
reg.fit(X,y)
reg.coef_

array([-0.44629638,  0.02286369, -0.01762971, -0.00671945,  0.07651421,
        0.77615517, -2.56966676,  0.        ,  0.16691861])

## Lambda의 크기에 따른 계수의 변화를 살펴보자

In [32]:
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
lambdas = [0,0.0001,0.001,0.01,0.1,1]
for i in lambdas:
    reg = Lasso(alpha=i)
    reg.fit(X,y)
    print(i, reg.coef_)

0 [-0.48970942  0.02397864 -0.01818346 -0.00671038  0.07910304  0.77702694
 -2.76337047 -0.13336811  0.08985776]
0.0001 [-0.48927812  0.02396754 -0.01817795 -0.00671047  0.07907707  0.77701821
 -2.6293992   0.          0.22266311]
0.001 [-0.4853459   0.02386686 -0.0181282  -0.0067113   0.07884366  0.77693967
 -2.62396889  0.          0.21759332]
0.01 [-0.44629638  0.02286369 -0.01762971 -0.00671945  0.07651421  0.77615517
 -2.56966676  0.          0.16691861]
0.1 [-0.06660805  0.0126583  -0.01331772 -0.00673194  0.05328912  0.76251936
 -1.8486989   0.          0.        ]
1 [-0.          0.         -0.00734394 -0.00646937  0.          0.66308442
 -0.          0.          0.        ]


<Figure size 864x576 with 0 Axes>

In [36]:
import warnings
warnings.filterwarnings('ignore')

## 하이퍼패러미터 변경(다항식 추가)
kfold = KFold(5,shuffle=True)
lambdas = [0,0.001,0.01,0.1,0.5,1,10,100]
for l in lambdas:    
    reg= Lasso(alpha=l)
    result=cross_validate(estimator=reg, X=X, y=y, cv=kfold, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
    print(l,'train',result['train_neg_mean_squared_error'].mean(), result['train_r2'].mean())
    print(l,'test',result['test_neg_mean_squared_error'].mean(), result['test_r2'].mean())

0 train -10.624943723771423 0.8250576243665207
0 test -11.21925019070598 0.813454554854224
0.001 train -10.620527006157564 0.825147611809555
0.001 test -11.23574532721657 0.8137264490227409
0.01 train -10.612339518808165 0.8251440463176248
0.01 test -11.29767833712374 0.8107973651854292
0.1 train -10.761675967779825 0.822619660347095
0.1 test -11.348198460238105 0.8083323211338123
0.5 train -11.630788556471327 0.808560737984612
0.5 test -11.986698781556473 0.802152288128758
1 train -11.691459585895021 0.8073484751826163
1 test -12.057468542892227 0.795984989758138
10 train -18.017024898571727 0.7029458828695535
10 test -18.41815129226664 0.6885933992720603
100 train -18.68068965013027 0.6925099043382046
100 test -18.771463503708016 0.6897321621401196


## 다항식 변경 + hyperparameter tuning

In [35]:
# 1. 공정한 비교가 아님 (제대로 하려면 동일한 데이터셋으로 쪼개진 상태를 통제하면서 수행해야 하므로 반복이 더 추가됨)
# 2. 데이터의 결과물을 정리하기가 어려움
# 3. For loop를 써서 지저분해보임

import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import Lasso
kfold = KFold(5,shuffle=True)
for i in range(1,5):
    for j in [0,0.001,0.01,0.05,0.1,0.15,0.2]:
        transformer = PolynomialFeatures(degree=i, interaction_only=False)
        X_=transformer.fit_transform(X)
        reg= Lasso(alpha=j)    
        result=cross_validate(estimator=reg, X=X_, y=y, cv=kfold, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
    #     print(i,'train',result['train_neg_mean_squared_error'].mean(), result['train_r2'].mean())
        print(i,j,'test',result['test_neg_mean_squared_error'].mean(), result['test_r2'].mean())

1 0 test -11.63148527908695 0.8056687503746932
1 0.001 test -11.138428605710804 0.8144368786264593
1 0.01 test -11.549574406940783 0.8054561475252108
1 0.05 test -11.119504263638106 0.8155569623433042
1 0.1 test -11.200069250289378 0.8161722772313936
1 0.15 test -11.43423340019818 0.8083366088973218
1 0.2 test -11.557466328486244 0.8061169051476436
2 0 test -8.576015406754212 0.8548652554393769
2 0.001 test -8.114738685782813 0.860315503686085
2 0.01 test -8.528231706447212 0.8559216901566019
2 0.05 test -8.057953107286519 0.8665203943751448
2 0.1 test -8.85364004753627 0.8444393041703198
2 0.15 test -7.899586590121771 0.8668743720868713
2 0.2 test -8.095252912446265 0.8635662852321735
3 0 test -7.4302476856288235 0.8781159761084449
3 0.001 test -7.370654769332288 0.8789208439755655
3 0.01 test -7.683707120426044 0.8718186984132344
3 0.05 test -8.056370320277136 0.8664261950384619
3 0.1 test -7.6199888420107085 0.8734452091164002
3 0.15 test -7.709105091974524 0.8726574685241811
3 0.2 