# Subset Selection
## CMSE 381 - Spring 2024




In [1]:
# Get the dataset ready
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [2]:
# First, we're going to do all the data loading we've had for a while for this data set
auto = pd.read_csv('../../DataSets/Auto.csv')
auto = auto.replace('?', np.nan)
auto = auto.dropna()
auto.horsepower = auto.horsepower.astype('int')

#this shuffles my data set in advance so that i don't need to worry about it later 
auto = auto.sample(frac=1).reset_index(drop=True)


auto.head()


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,19.0,4,120.0,88,3270,21.9,76,2,peugeot 504
1,38.0,4,105.0,63,2125,14.7,82,1,plymouth horizon miser
2,30.5,4,97.0,78,2190,14.1,77,2,volkswagen dasher
3,14.0,8,400.0,175,4385,12.0,72,1,pontiac catalina
4,16.0,6,225.0,105,3439,15.5,71,1,plymouth satellite custom


Let's try to run subset selection on the `auto` data set! We're going to use `cylinders`, `horsepower`, `weight`, and `acceleration` to predict `mpg`. 

In [3]:
inputvars = ['cylinders','horsepower','weight', 'acceleration']

In [4]:
from itertools import combinations

Here's some code stolen from the last few days to run linear regression on a subset of the input variables. 

In [5]:
def myscore_train(df,listofvars, outputvar = 'mpg'):
    X = df[list(listofvars)]
    y = df[outputvar]
    
    #build linear regression model
    model = LinearRegression()
    model.fit(X,y)
    
    testscore = mean_squared_error(y, model.predict(X))
    
    #view mean absolute error
    return testscore
    
myvars = ('cylinders', 'acceleration')
myscore_train(auto,myvars)

23.942446650601354

In [6]:
def myscore_cv(df,listofvars, outputvar = 'mpg'):
    X = df[list(listofvars)]
    y = df[outputvar]
    
    #build linear regression model
    model = LinearRegression()
    

    #use 5-fold CV to evaluate model
    scores = cross_val_score(model, X,y, 
                             scoring='neg_mean_squared_error',
                             cv=5)

    #view mean absolute error
    return np.average(np.absolute(scores))
    

myvars = ('cylinders', 'acceleration')
myscore_cv(auto,myvars)

24.197080654251987

## Homework problem


&#9989; **<font color=red>Please answer this problem in homework :</font>** write a function that does forward selection and another function that does backward selection. 



In [7]:
# Your code here #
def forward(predictors):

    # Pull out predictors we still need to process
    remaining_predictors = [p for p in inputvars if p not in predictors]
    
    results = []
    
    for p in remaining_predictors:
        results.append(processSubset(predictors+[p]))
    
    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)
    
    # Choose the model with the highest RSS
    best_model = models.loc[models['RSS'].argmin()]
    

    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors")
    
    # Return the best model, along with some other useful information about the model
    return best_model


y = auto['mpg'].values
def processSubset(feature_set):
    # Fit model on feature_set and calculate RSS
    model = LinearRegression()
    model.fit(auto[list(feature_set)],y)
    
    RSS = mean_squared_error(y, model.predict(auto[list(feature_set)]))
    return {"model":feature_set, "RSS":RSS}


In [8]:
models_fwd = pd.DataFrame(columns=["RSS", "model"])


# We start with no variables at all in our model
predictors = []

for i in range(1,len(inputvars)+1): 
    
    # Now we run our forward one step function from above to get 
    # the best model.     
    models_fwd.loc[i] = forward(predictors)
    
    # Then we extract the names of the variables that were used 
    # in that found model. When the for loop repeats, this will 
    # get sent back to predictors to the next step.
    predictors = models_fwd.loc[i,'model']
    
    # If you want to get something printed out to see which 
    # variables are used at each step, uncomment this line:
#     print('Starting with predictors:\n', predictors)


models_fwd

Processed  4 models on 1 predictors
Processed  3 models on 2 predictors
Processed  2 models on 3 predictors
Processed  1 models on 4 predictors


Unnamed: 0,RSS,model
1,18.676617,[weight]
2,17.841442,"[weight, horsepower]"
3,17.763871,"[weight, horsepower, cylinders]"
4,17.7614,"[weight, horsepower, cylinders, acceleration]"


In [9]:
# pick the best model using the cv score
myscores = []
myvars = []
for i in range(1,len(models_fwd)+1):
        myscores.append(myscore_cv(auto,models_fwd.loc[i, "model"]))
        myvars.append(models_fwd.loc[i, "model"])
myResults = pd.DataFrame({'Vars':myvars, 'TestScore':myscores})
myResults

Unnamed: 0,Vars,TestScore
0,[weight],18.906393
1,"[weight, horsepower]",18.178709
2,"[weight, horsepower, cylinders]",18.143382
3,"[weight, horsepower, cylinders, acceleration]",18.181214


In [10]:
indexmin = myResults.idxmin(numeric_only = True)
print('Best Model:', myResults.Vars[indexmin])

Best Model: 2    [weight, horsepower, cylinders]
Name: Vars, dtype: object


In [11]:
#Backward selection
import itertools
def backward(predictors):
    
    
    results = []
    
    
    # Note that the students might not do it this way. They might go through the list of predictors one
    # at a time and remove from the set.  Either way should work.
    for combo in itertools.combinations(predictors, len(predictors)-1):
        results.append(processSubset(combo))
    
    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)
    
    # Choose the model with the highest RSS
    best_model = models.loc[models['RSS'].argmin()]
    
    print("Processed ", models.shape[0], "models on", len(predictors)-1, "predictors" )
    
    # Return the best model, along with some other useful information about the model
    return best_model

In [12]:
models_bwd = pd.DataFrame(columns=["RSS", "model"], index = range(1,len(inputvars)))

predictors = inputvars

while(len(predictors) > 1):  
    models_bwd.loc[len(predictors)-1] = backward(predictors)
    predictors = models_bwd.loc[len(predictors)-1]["model"]

models_bwd

Processed  4 models on 3 predictors
Processed  3 models on 2 predictors
Processed  2 models on 1 predictors


Unnamed: 0,RSS,model
1,18.676617,"(weight,)"
2,17.841442,"(horsepower, weight)"
3,17.763871,"(cylinders, horsepower, weight)"


In [13]:
# pick the best model using the cv score
myscores = []
myvars = []
for i in range(1,len(models_bwd)+1):
        myscores.append(myscore_cv(auto,models_bwd.loc[i, "model"]))
        myvars.append(models_bwd.loc[i, "model"])
myResults = pd.DataFrame({'Vars':myvars, 'TestScore':myscores})
indexmin = myResults.idxmin(numeric_only = True)
print('Best Model:', myResults.Vars[indexmin])

Best Model: 2    (cylinders, horsepower, weight)
Name: Vars, dtype: object


In [None]:
def compute_RSS(left_set,right_set)