### 변수 선택법

❗️알고리즘 시각화해서 정리하기❗️

In [1]:
import pandas as pd
import numpy as np
import itertools
from patsy import dmatrices

from statsmodels.api import OLS

In [4]:
cars = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/Cars93.csv')
# Data preprocessing
## Strip the '.' characters out of the column names.
## regex=False states explicitly that '.' is a literal dot rather than the
## regex wildcard; the default value of `regex` differs between pandas versions.
cars.columns = cars.columns.str.replace('.', '', regex=False)
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Manufacturer      93 non-null     object 
 1   Model             93 non-null     object 
 2   Type              93 non-null     object 
 3   MinPrice          93 non-null     float64
 4   Price             93 non-null     float64
 5   MaxPrice          93 non-null     float64
 6   MPGcity           93 non-null     int64  
 7   MPGhighway        93 non-null     int64  
 8   AirBags           59 non-null     object 
 9   DriveTrain        93 non-null     object 
 10  Cylinders         93 non-null     object 
 11  EngineSize        93 non-null     float64
 12  Horsepower        93 non-null     int64  
 13  RPM               93 non-null     int64  
 14  Revpermile        93 non-null     int64  
 15  Mantransavail     93 non-null     object 
 16  Fueltankcapacity  93 non-null     float64
 17 

* ```dmatrices(formula, data, return_type)```: y, x 순으로 반환

In [9]:
# Build the response (y) and design matrix (x) from a patsy formula;
# dmatrices returns them in that order.
y, x = dmatrices(
    'Price ~ EngineSize + RPM + Weight + Length + MPGcity + MPGhighway',
    data=cars,
    return_type='dataframe',
)

In [13]:
# Preview the first rows of the design matrix (bare expression so the notebook renders it).
x.head()

Unnamed: 0,Intercept,EngineSize,RPM,Weight,Length,MPGcity,MPGhighway
0,1.0,1.8,6300.0,2705.0,177.0,25.0,31.0
1,1.0,3.2,5500.0,3560.0,195.0,18.0,25.0
2,1.0,2.8,5500.0,3375.0,180.0,20.0,26.0
3,1.0,2.8,5500.0,3405.0,193.0,19.0,26.0
4,1.0,3.5,5700.0,3640.0,186.0,22.0,30.0


In [15]:
import time
import itertools
 
def processSubset(x, y, feature_set):
    """Fit an OLS model of ``y`` on the ``feature_set`` columns of ``x``.

    Returns a dict holding the fitted results under ``"model"`` and that
    fit's AIC under ``"AIC"``.
    """
    columns = list(feature_set)
    fitted = OLS(y, x[columns]).fit()
    return {"model": fitted, "AIC": fitted.aic}

# Forward selection
def forward(x, y, predictors):
    """Try adding each unused predictor and return the lowest-AIC candidate.

    Parameters
    ----------
    x : DataFrame of candidate columns; it must include an 'Intercept'
        column, which is appended to every candidate feature set.
    y : response passed through to ``processSubset``.
    predictors : list of column names that are already selected.

    Returns
    -------
    The row (Series with 'model' and 'AIC') of the best candidate.
    """
    # Columns of x that are neither the intercept nor already selected.
    remain_predictors = [
        p for p in x.columns.difference(['Intercept']) if p not in predictors
    ]
    # One fit per remaining column: current predictors + that column + intercept.
    results = [
        processSubset(x, y, feature_set=predictors + [p] + ['Intercept'])
        for p in remain_predictors
    ]
    models = pd.DataFrame(results)  # one row per candidate, for comparison
    # Lower AIC is better. argmin() returns a position, so pair it with iloc.
    best_model = models.iloc[models['AIC'].argmin()]
    print('Processed', models.shape[0], 'models on', len(predictors)+1, 'predictors in')
    # Look the AIC up by label; integer-position access on a label-indexed
    # Series is deprecated in pandas.
    print('Selected predictors:', best_model['model'].model.exog_names, 'AIC:', best_model['AIC'])

    return best_model

# Backward elimination
def backward(x, y, predictors):
    """Try dropping each single predictor and return the lowest-AIC candidate.

    Parameters
    ----------
    x : DataFrame of candidate columns; it must include an 'Intercept'
        column, which is appended to every candidate feature set.
    y : response passed through to ``processSubset``.
    predictors : currently selected column names (without 'Intercept').

    Returns
    -------
    The row (Series with 'model' and 'AIC') of the best candidate.
    """
    tic = time.time()  # wall-clock start, in seconds since the epoch
    # combinations(a, len(a) - 1) enumerates every way of leaving one element out.
    results = [
        processSubset(x, y, feature_set=list(combo) + ['Intercept'])
        for combo in itertools.combinations(predictors, len(predictors) - 1)
    ]
    models = pd.DataFrame(results)
    # Lower AIC is better. argmin() returns a position, so pair it with iloc.
    best_model = models.iloc[models['AIC'].argmin()]
    toc = time.time()

    print('Processed', models.shape[0], 'models on', len(predictors)-1, 'predictors in', (toc-tic))  # elapsed seconds
    # Look the AIC up by label; integer-position access on a label-indexed
    # Series is deprecated in pandas.
    print('Selected predictors:', best_model['model'].model.exog_names, 'AIC:', best_model['AIC'])

    return best_model

# Stepwise selection
def Stepwise_model(x, y):
    """Run alternating forward/backward steps and record the model kept at each step.

    Each iteration adds the best predictor via ``forward`` and then asks
    ``backward`` whether dropping one predictor gives a lower AIC; if so, the
    backward model replaces the forward one for that iteration.

    Parameters
    ----------
    x : DataFrame of candidate columns including an 'Intercept' column.
    y : response passed through to ``forward`` / ``backward``.

    Returns
    -------
    DataFrame with columns 'model' and 'AIC', one row per iteration
    (index starts at 1).
    """
    Stepmodels = pd.DataFrame(columns=["model", "AIC"])
    tic = time.time()
    predictors = []

    # One iteration per non-intercept column. There is deliberately no early
    # stop: the author notes that the textbook's `break` is an erratum.
    for i in range(1, len(x.columns.difference(['Intercept']))+1):
        # Forward step
        print('[ forward ]')
        forward_result = forward(x, y, predictors=predictors)  # Series: ['model', 'AIC']
        Stepmodels.loc[i] = forward_result
        # exog_names of the fitted model are its column names; drop the intercept.
        predictors = [
            k for k in forward_result['model'].model.exog_names if k != 'Intercept'
        ]

        # Backward step
        print('[ backward ]')
        backward_result = backward(x, y, predictors=predictors)  # Series: ['model', 'AIC']

        # Keep the backward model only when it strictly improves on the forward one.
        if backward_result['AIC'] < forward_result['AIC']:
            Stepmodels.loc[i] = backward_result
            predictors = [
                k for k in backward_result['model'].model.exog_names if k != 'Intercept'
            ]
    toc = time.time()
    print("Total elapsed time:", (toc-tic), 'seconds.')

    return Stepmodels

# Run the stepwise search on the design matrix and response.
Stepmodels = Stepwise_model(x, y)
# Bare expression so the notebook renders the per-iteration table.
Stepmodels

[ forward ]
Processed 6 models on 1 predictors in
Selected predictors: ['Weight', 'Intercept'] AIC: 638.2790669305498
[ backward ]
Processed 1 models on 0 predictors in 0.0006358623504638672
Selected predictors: ['Intercept'] AIC: 686.7529864151061
[ forward ]
Processed 5 models on 2 predictors in
Selected predictors: ['Weight', 'RPM', 'Intercept'] AIC: 624.5241529342444
[ backward ]
Processed 2 models on 1 predictors in 0.000982046127319336
Selected predictors: ['Weight', 'Intercept'] AIC: 638.2790669305498
[ forward ]
Processed 4 models on 3 predictors in
Selected predictors: ['Weight', 'RPM', 'EngineSize', 'Intercept'] AIC: 616.0976497740975
[ backward ]
Processed 3 models on 2 predictors in 0.0018210411071777344
Selected predictors: ['Weight', 'RPM', 'Intercept'] AIC: 624.5241529342444
[ forward ]
Processed 3 models on 4 predictors in
Selected predictors: ['Weight', 'RPM', 'EngineSize', 'MPGcity', 'Intercept'] AIC: 616.1664162275082
[ backward ]
Processed 4 models on 3 predictors i

Unnamed: 0,model,AIC
1,<statsmodels.regression.linear_model.Regressio...,638.279067
2,<statsmodels.regression.linear_model.Regressio...,624.524153
3,<statsmodels.regression.linear_model.Regressio...,616.09765
4,<statsmodels.regression.linear_model.Regressio...,616.09765
5,<statsmodels.regression.linear_model.Regressio...,616.09765
6,<statsmodels.regression.linear_model.Regressio...,616.09765


In [36]:
# 복습
import itertools
# AIC score
def aic_val(x,y,cols):
    """Fit OLS of ``y`` on ``x[cols]`` and return the results with their AIC.

    The returned dict has the fitted results under 'model' and the AIC
    value under 'AIC'.
    """
    fitted = OLS(y, x[cols]).fit()
    return {'model': fitted, 'AIC': fitted.aic}

# Forward step
def forward(x,y,cols):
    """Return the lowest-AIC model obtained by adding one unused column to ``cols``.

    Every candidate is fitted on ``cols`` + the new column + 'Intercept'.
    The result is a Series with 'model' and 'AIC' entries.
    """
    # Columns of x that are neither the intercept nor already in cols.
    candidates = [
        name for name in x.columns.difference(['Intercept']) if name not in cols
    ]
    fits = [aic_val(x, y, cols+[name]+['Intercept']) for name in candidates]
    table = pd.DataFrame(fits)
    return table.loc[table['AIC'].argmin()]

# Backward step
def backward(x,y,cols):
    """Return the lowest-AIC model obtained by dropping one column from ``cols``.

    ``cols`` is a list of column names, e.g. [col1, col2, col3]. Every
    candidate is fitted on the remaining columns + 'Intercept'. The result
    is a Series with 'model' and 'AIC' entries.
    """
    keep = len(cols)-1
    # Each combination is a tuple, so convert it before appending the intercept.
    fits = [
        aic_val(x,y,list(subset)+['Intercept'])
        for subset in itertools.combinations(cols, keep)
    ]
    table = pd.DataFrame(fits)
    return table.loc[table['AIC'].argmin()]
    
# Stepwise
def stepwise(x,y):
    """Alternate forward and backward steps and record the model kept each time.

    Parameters
    ----------
    x : DataFrame of candidate columns including an 'Intercept' column.
    y : response passed through to ``forward`` / ``backward``.

    Returns
    -------
    DataFrame with columns 'model' and 'AIC', one row per iteration
    (index starts at 1).
    """
    stepwise_models = pd.DataFrame(columns=['model', 'AIC'])
    cols = []
    # len(x.columns) - 1 iterations; assumes exactly one of the columns is
    # 'Intercept', i.e. one pass per candidate predictor -- TODO confirm.
    for i in range(1, len(x.columns)):
        forwards_best = forward(x,y,cols)  # best model after adding one column
        stepwise_models.loc[i] = forwards_best
        cols = [k for k in forwards_best['model'].model.exog_names if k != 'Intercept']
        backwards_best = backward(x, y, cols)  # best model after dropping one column

        # Replace the recorded model only when the backward step strictly lowers AIC.
        if backwards_best['AIC'] < forwards_best['AIC']:
            stepwise_models.loc[i] = backwards_best
            cols = [k for k in backwards_best['model'].model.exog_names if k != 'Intercept']

    return stepwise_models

# Run the review implementation on the same design matrix and response.
stepwise_models = stepwise(x, y)
# Bare expression so the notebook renders the per-iteration table.
stepwise_models

Unnamed: 0,model,AIC
1,<statsmodels.regression.linear_model.Regressio...,638.279067
2,<statsmodels.regression.linear_model.Regressio...,624.524153
3,<statsmodels.regression.linear_model.Regressio...,616.09765
4,<statsmodels.regression.linear_model.Regressio...,616.09765
5,<statsmodels.regression.linear_model.Regressio...,616.09765
6,<statsmodels.regression.linear_model.Regressio...,616.09765


In [32]:
# Take the model recorded for the last stepwise iteration and display its summary.
model = stepwise_models.iloc[-1].model
model.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.561
Model:,OLS,Adj. R-squared:,0.547
Method:,Least Squares,F-statistic:,37.98
Date:,"Thu, 14 Mar 2024",Prob (F-statistic):,6.75e-16
Time:,15:23:44,Log-Likelihood:,-304.05
No. Observations:,93,AIC:,616.1
Df Residuals:,89,BIC:,626.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Weight,0.0073,0.002,3.372,0.001,0.003,0.012
RPM,0.0071,0.001,5.208,0.000,0.004,0.010
EngineSize,4.3054,1.325,3.249,0.002,1.673,6.938
Intercept,-51.7933,9.106,-5.688,0.000,-69.887,-33.699

0,1,2,3
Omnibus:,62.441,Durbin-Watson:,1.406
Prob(Omnibus):,0.0,Jarque-Bera (JB):,361.88
Skew:,2.076,Prob(JB):,2.62e-79
Kurtosis:,11.726,Cond. No.,82700.0


* 해당 모델은 Adj. R² 값이 0.547로, 모델이 종속변수(Price) 변동의 약 54.7%를 설명한다. - **설명력 부족**
* 회귀계수의 크기만 보면 차량 가격에 가장 큰 영향을 주는 변수는 EngineSize(4.3054)이다. 다만 변수마다 단위가 다르므로 비표준화 계수만으로 영향력을 비교하기는 어렵고, t 통계량 기준으로는 RPM(5.208)이 가장 크다.