## Forward/Backward Selection

In [58]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
import numpy as np
import pandas as pd


# Load the housing Housing dataset
housing = fetch_california_housing()
X, y = pd.DataFrame(housing.data,columns=housing.feature_names), housing.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a linear regression model
model = LinearRegression()

# Perform Recursive Feature Elimination (SequentialFeatureSelector)
selector = SequentialFeatureSelector(model,n_features_to_select = 5, direction='backward',cv=5, n_jobs=-1, scoring='r2')  # Select top 5 features
X_train_selected = selector.fit_transform(X_train, y_train)

# Get the selected features
selected_features = np.array(X_train.columns)[selector.support_]

print("Selected Features:")
print(selected_features)


X_test_selected = selector.transform(X_test)
model.fit(X_train_selected, y_train)

# Evaluate the model
train_score = model.score(X_train_selected, y_train)
test_score = model.score(X_test_selected, y_test)

print(f"Training R^2 score: {train_score:.2f}")
print(f"Testing R^2 score: {test_score:.2f}")


Selected Features:
['MedInc' 'HouseAge' 'AveBedrms' 'Latitude' 'Longitude']
Training R^2 score: 0.60
Testing R^2 score: 0.58


In [55]:
X_train.columns[selector.support_]

Index(['MedInc', 'HouseAge', 'AveBedrms', 'Latitude', 'Longitude'], dtype='object')

In [56]:
X_test_selected

array([[   1.6812    ,   25.        ,    1.02228412,   36.06      ,
        -119.01      ],
       [   2.5313    ,   30.        ,    1.19349315,   35.14      ,
        -119.46      ],
       [   3.4801    ,   52.        ,    1.18587747,   37.8       ,
        -122.44      ],
       ...,
       [   9.2298    ,   25.        ,    0.9471831 ,   37.31      ,
        -122.05      ],
       [   2.785     ,   36.        ,    0.98312236,   36.77      ,
        -119.76      ],
       [   3.5521    ,   17.        ,    1.03348214,   34.22      ,
        -118.37      ]])

## Exhausive Method

In [67]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import ExhaustiveFeatureSelector
import numpy as np
import pandas as pd

# Load the California housing dataset
housing = fetch_california_housing()
X, y = pd.DataFrame(housing.data, columns=housing.feature_names), housing.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a linear regression model
model = LinearRegression()

# Perform Exhaustive Feature Selection
selector = ExhaustiveFeatureSelector(model, min_features=2, max_features=5, scoring='r2', cv=5)

# Note: This will perform an exhaustive search, which can be very computationally expensive for large datasets.

# Fit the selector to the training data
selector = selector.fit(X_train, y_train)

# Get the selected features
selected_feature_indices = selector.best_idx_
selected_features = np.array(X_train.columns[[selected_feature_indices]])

print("Selected Features:")
print(selected_features)

# Transform the training and testing data using selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Fit the model using selected features
model.fit(X_train_selected, y_train)

# Evaluate the model
train_score = model.score(X_train_selected, y_train)
test_score = model.score(X_test_selected, y_test)

print(f"Training R^2 score: {train_score:.2f}")
print(f"Testing R^2 score: {test_score:.2f}")


  selected_features = X_train.columns[[selected_feature_indices]]


Selected Features:
[['MedInc' 'HouseAge' 'AveBedrms' 'Latitude' 'Longitude']]
Training R^2 score: 0.60
Testing R^2 score: 0.58


In [69]:
data = pd.DataFrame.from_dict(selector.get_metric_dict()).T

In [71]:
data[['feature_idx','avg_score']]

Unnamed: 0,feature_idx,avg_score
0,"(0, 1)",0.512443
1,"(0, 2)",0.479832
2,"(0, 3)",0.476332
3,"(0, 4)",0.477473
4,"(0, 5)",0.477788
...,...,...
205,"(2, 3, 4, 5, 7)",0.135395
206,"(2, 3, 4, 6, 7)",0.390904
207,"(2, 3, 5, 6, 7)",0.389453
208,"(2, 4, 5, 6, 7)",0.306113
