# 6.5.1 Subset Selection

To run in Colab,
* click this button
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ssuai/machine_learning/blob/main/lab5-1_subset_selection.ipynb)
* run the following cell

In [None]:
# clone the machine_learning_data repository into ./data
# (the loading cell below reads data/Hitters.csv from this folder)
!git clone https://github.com/ssuai/machine_learning_data.git data

In [1]:
import pandas as pd
import numpy as np

import itertools
import matplotlib.pyplot as plt

In [2]:
# Load the Hitters data and drop every row that has a missing value.
# Chaining .dropna() (instead of inplace=True) keeps the cell idempotent,
# and only the final expression of a cell is displayed, so the preview
# is requested once, at the end.
df = pd.read_csv('data/Hitters.csv').dropna()

df.head()

Unnamed: 0,Player,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,...,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,-Alan Ashby,315,81,7,24,38,39,14,3449,835,...,321,414,375,N,W,632,43,10,475.0,N
2,-Alvin Davis,479,130,18,66,72,76,3,1624,457,...,224,266,263,A,W,880,82,14,480.0,A
3,-Andre Dawson,496,141,20,65,78,37,11,5628,1575,...,828,838,354,N,E,200,11,3,500.0,N
4,-Andres Galarraga,321,87,10,39,42,30,2,396,101,...,48,46,33,N,E,805,40,4,91.5,N
5,-Alfredo Griffin,594,169,4,74,51,35,11,4408,1133,...,501,336,194,A,W,282,421,25,750.0,A


In [3]:
# prepare X (features) and y (response)

# numeric predictors: every numeric column except the response, Salary
X_num = df.select_dtypes('number').drop('Salary', axis=1)

# one-hot encode the categorical columns; dtype=int keeps the indicators
# as 0/1 integers regardless of the pandas version's default dummy dtype
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']], dtype=int)

# keep a single indicator column per categorical variable
X = pd.concat([X_num, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

y = df['Salary']

In [4]:
# inspect the assembled feature matrix
X

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,1,1,1
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,0,1,0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,1,0,1
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,1,0,1
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,325,9,3,1,0,1
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,313,381,20,0,0,0
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,37,113,7,0,1,0
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,1314,131,12,0,0,0


Use `combinations` from `itertools` to enumerate every subset of predictors of a given size (here, all pairs of columns):

In [5]:
# list every unordered pair of predictor names
for pair in itertools.combinations(X.columns, 2):
    print(pair)

('AtBat', 'Hits')
('AtBat', 'HmRun')
('AtBat', 'Runs')
('AtBat', 'RBI')
('AtBat', 'Walks')
('AtBat', 'Years')
('AtBat', 'CAtBat')
('AtBat', 'CHits')
('AtBat', 'CHmRun')
('AtBat', 'CRuns')
('AtBat', 'CRBI')
('AtBat', 'CWalks')
('AtBat', 'PutOuts')
('AtBat', 'Assists')
('AtBat', 'Errors')
('AtBat', 'League_N')
('AtBat', 'Division_W')
('AtBat', 'NewLeague_N')
('Hits', 'HmRun')
('Hits', 'Runs')
('Hits', 'RBI')
('Hits', 'Walks')
('Hits', 'Years')
('Hits', 'CAtBat')
('Hits', 'CHits')
('Hits', 'CHmRun')
('Hits', 'CRuns')
('Hits', 'CRBI')
('Hits', 'CWalks')
('Hits', 'PutOuts')
('Hits', 'Assists')
('Hits', 'Errors')
('Hits', 'League_N')
('Hits', 'Division_W')
('Hits', 'NewLeague_N')
('HmRun', 'Runs')
('HmRun', 'RBI')
('HmRun', 'Walks')
('HmRun', 'Years')
('HmRun', 'CAtBat')
('HmRun', 'CHits')
('HmRun', 'CHmRun')
('HmRun', 'CRuns')
('HmRun', 'CRBI')
('HmRun', 'CWalks')
('HmRun', 'PutOuts')
('HmRun', 'Assists')
('HmRun', 'Errors')
('HmRun', 'League_N')
('HmRun', 'Division_W')
('HmRun', 'NewLeague

In [6]:
import itertools
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def best_subset_selection(X, y, k, var_names=None):
    """Exhaustively search all size-``k`` predictor subsets for the lowest training MSE.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response values used both for fitting and for scoring.
    k : int
        Number of predictors in each subset that is tried.
    var_names : list, optional
        Column names to choose from; defaults to every column of ``X``.

    Returns
    -------
    tuple
        ``(model, subset, mse)`` for the winning subset: the fitted
        ``LinearRegression``, the tuple of column names, and its training
        MSE. If no subset of size ``k`` exists, ``(None, None, inf)``.
    """
    names = X.columns.tolist() if var_names is None else var_names

    # running winner, stored as (model, subset, mse)
    winner = (None, None, float('inf'))

    for combo in itertools.combinations(names, k):
        cols = list(combo)
        fitted = LinearRegression().fit(X[cols], y)
        err = mean_squared_error(y, fitted.predict(X[cols]))

        # strict '<' keeps the first subset encountered when errors tie
        if err < winner[2]:
            winner = (fitted, combo, err)

    return winner


In [7]:
# find the best subset with 1 predictor
# (the result is a tuple: fitted model, selected columns, training MSE)
best_subset_selection(X, y, 1)

(LinearRegression(), ('CRBI',), 137565.32036137575)

In [8]:
# find the best subset with 2 predictors (every pair of columns is fitted)
best_subset_selection(X, y, 2)

(LinearRegression(), ('Hits', 'CRBI'), 116526.84368963054)

In [9]:
# find the best subset with 3 predictors (every triple of columns is fitted)
best_subset_selection(X, y, 3)

(LinearRegression(), ('Hits', 'CRBI', 'PutOuts'), 111214.05648618752)

In [10]:
# find the best subset with 7 predictors
# exhaustive search: with 19 columns this fits C(19, 7) = 50,388 models
best_subset_selection(X, y, 7)

(LinearRegression(),
 ('Hits', 'Walks', 'CAtBat', 'CHits', 'CHmRun', 'PutOuts', 'Division_W'),
 98503.9828921055)

## Forward Stepwise Selection


In [11]:
def find_next_best_predictor(X, y, best_subset, candidates):
    """Return the candidate that, added to ``best_subset``, gives the lowest training MSE.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column named in ``best_subset`` and ``candidates``.
    y : array-like
        Response values used for fitting and scoring.
    best_subset : sequence
        Column names already in the model (may be empty).
    candidates : iterable
        Column names to try adding, one at a time.

    Returns
    -------
    object
        The element of ``candidates`` whose addition yields the smallest
        training MSE; the first such element when several tie.

    Raises
    ------
    ValueError
        If no candidate can be selected, i.e. ``candidates`` is empty or
        no fit produced an MSE below infinity.
    """
    selected = list(best_subset)  # accept any sequence, not only a list

    best_mse = float('inf')
    next_best = None  # stays None until some candidate beats best_mse

    for candidate in candidates:
        # slice once and reuse the same frame for fitting and scoring
        features = X[selected + [candidate]]
        model = LinearRegression().fit(features, y)

        mse = mean_squared_error(y, model.predict(features))

        if mse < best_mse:
            best_mse = mse
            next_best = candidate

    if next_best is None:
        raise ValueError(
            "no predictor could be selected: 'candidates' is empty or "
            "no candidate produced a comparable MSE"
        )

    return next_best

In [12]:
# first forward step: starting from the empty model, try every column of X
find_next_best_predictor(X, y, [], X.columns)

'CRBI'

In [13]:
# with CRBI already selected, choose the best addition among three hand-picked candidates
find_next_best_predictor(X, y, ['CRBI'], ['Hits', 'Walks', 'CAtBat'])

'Hits'

In [14]:
def forward_stepwise_selection(X, y, k):
    """Greedily build a ``k``-predictor subset by forward stepwise selection.

    Starting from the empty model, each step adds the remaining column of
    ``X`` returned by ``find_next_best_predictor`` and prints the current
    candidate pool together with the column picked from it.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response values, passed through to ``find_next_best_predictor``.
    k : int
        Number of predictors to select; must satisfy
        ``0 <= k <= number of columns in X``.

    Returns
    -------
    list
        The selected column names, in the order they were added.

    Raises
    ------
    ValueError
        If ``k`` is negative or exceeds the number of columns in ``X``.
    """
    candidates = X.columns.tolist()

    # fail early with a clear message instead of running out of candidates
    # part-way through the loop (or silently returning an empty list)
    if not 0 <= k <= len(candidates):
        raise ValueError(
            f"k must be between 0 and {len(candidates)} "
            f"(the number of columns in X), got {k}"
        )

    best_subset = []

    for _ in range(k):
        next_best = find_next_best_predictor(X, y, best_subset, candidates)
        print(candidates, '->', next_best)

        # move the chosen column from the candidate pool into the model
        best_subset.append(next_best)
        candidates.remove(next_best)

    return best_subset

# select 7 predictors greedily; each printed line shows the remaining
# candidates and the column chosen from them
forward_stepwise_selection(X, y, 7)

['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists', 'Errors', 'League_N', 'Division_W', 'NewLeague_N'] -> CRBI
['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CWalks', 'PutOuts', 'Assists', 'Errors', 'League_N', 'Division_W', 'NewLeague_N'] -> Hits
['AtBat', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CWalks', 'PutOuts', 'Assists', 'Errors', 'League_N', 'Division_W', 'NewLeague_N'] -> PutOuts
['AtBat', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CWalks', 'Assists', 'Errors', 'League_N', 'Division_W', 'NewLeague_N'] -> Division_W
['AtBat', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CWalks', 'Assists', 'Errors', 'League_N', 'NewLeague_N'] -> AtBat
['HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CWalks', 'Assis

['CRBI', 'Hits', 'PutOuts', 'Division_W', 'AtBat', 'Walks', 'CWalks']

## Backward Stepwise Selection

In [15]:
def find_worst_predictor(X, y, best_subset):
    """Return the predictor whose removal leaves the lowest training MSE.

    Each predictor in ``best_subset`` is dropped in turn, a linear
    regression is fitted on the remaining columns, and the predictor whose
    removal gives the smallest training MSE is reported as the "worst".

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column named in ``best_subset``.
    y : array-like
        Response values used for fitting and scoring.
    best_subset : sequence
        Column names currently in the model; must not be empty.

    Returns
    -------
    object
        The element of ``best_subset`` to remove; the first such element
        when several tie.

    Raises
    ------
    ValueError
        If ``best_subset`` is empty, or no reduced model produced an MSE
        below infinity.
    """
    current = list(best_subset)

    if not current:
        raise ValueError("best_subset must contain at least one predictor")

    if len(current) == 1:
        # removing the only predictor leaves no columns to fit on,
        # so there is nothing to compare: it is the one to drop
        return current[0]

    best_mse = float('inf')
    worst_predictor = None  # stays None until some reduced model beats best_mse

    for candidate in current:

        predictors = [var for var in current if var != candidate]
        # slice once and reuse the same frame for fitting and scoring
        features = X[predictors]
        model = LinearRegression().fit(features, y)

        mse = mean_squared_error(y, model.predict(features))

        if mse < best_mse:
            best_mse = mse
            worst_predictor = candidate

    if worst_predictor is None:
        raise ValueError("no reduced model produced a comparable MSE")

    return worst_predictor

# which of these 7 predictors can be dropped while keeping the training MSE lowest?
find_worst_predictor(X, y, ['CRBI', 'Hits', 'PutOuts', 'Division_W', 'AtBat', 'Walks', 'CWalks'])

'CWalks'

In [16]:
def backward_stepwise_selection(X, y, k):
    """Shrink the full model down to ``k`` predictors by backward stepwise selection.

    Starting from all columns of ``X``, each step removes the column
    returned by ``find_worst_predictor`` and prints the current subset
    together with the column removed from it.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response values, passed through to ``find_worst_predictor``.
    k : int
        Number of predictors to keep; must satisfy
        ``0 <= k <= number of columns in X``.

    Returns
    -------
    list
        The ``k`` remaining column names, in their original column order.

    Raises
    ------
    ValueError
        If ``k`` is negative or exceeds the number of columns in ``X``.
    """
    best_subset = X.columns.tolist()  # start from the full model

    # a request outside this range cannot be satisfied; say so up front
    # instead of returning a subset of the wrong size or failing mid-loop
    if not 0 <= k <= len(best_subset):
        raise ValueError(
            f"k must be between 0 and {len(best_subset)} "
            f"(the number of columns in X), got {k}"
        )

    while len(best_subset) > k:
        if len(best_subset) == 1:
            # only one predictor left: no reduced models to compare,
            # so it is the one to drop on the way to the empty model
            worst_predictor = best_subset[0]
        else:
            worst_predictor = find_worst_predictor(X, y, best_subset)
        print(best_subset, '->', worst_predictor)

        best_subset.remove(worst_predictor)

    return best_subset

In [17]:
# shrink the full model down to 7 predictors; each printed line shows the
# current subset and the column removed from it
backward_stepwise_selection(X, y, 7)

['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists', 'Errors', 'League_N', 'Division_W', 'NewLeague_N'] -> CHmRun
['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists', 'Errors', 'League_N', 'Division_W', 'NewLeague_N'] -> Years
['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'CAtBat', 'CHits', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists', 'Errors', 'League_N', 'Division_W', 'NewLeague_N'] -> NewLeague_N
['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'CAtBat', 'CHits', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists', 'Errors', 'League_N', 'Division_W'] -> RBI
['AtBat', 'Hits', 'HmRun', 'Runs', 'Walks', 'CAtBat', 'CHits', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists', 'Errors', 'League_N', 'Division_W'] -> CHits
['AtBat', 'Hits', 'HmRun', 'Runs', 'Walks', 'CAtBat', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists', 'Error

['AtBat', 'Hits', 'Walks', 'CRuns', 'CWalks', 'PutOuts', 'Division_W']