## Lasso

- [Feature Selection in Machine Learning Book](https://www.trainindata.com/p/feature-selection-in-machine-learning-book)

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, fetch_california_housing
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Classification

In [2]:
# Load the breast cancer data: X is a DataFrame of features,
# y is a Series holding the target.
X, y = load_breast_cancer(as_frame=True, return_X_y=True)

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [3]:
# SelectFromModel keeps the features whose coefficients in the fitted
# estimator are non-zero (for an l1-penalised estimator, sklearn's
# default threshold is a very small value rather than an exact zero).
selector = SelectFromModel(
    estimator=LogisticRegression(
        penalty='l1',        # the Lasso (l1) penalty
        C=0.5,
        solver='liblinear',
        random_state=10,
    ),
)

In [4]:
# Chain feature scaling and the L1-based selector so that both are
# fitted together on the training set only.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selector", selector),
])

pipe.fit(X_train, y_train)

In [5]:
# the names of the features retained by the fitted pipeline
# (i.e. those that the "selector" step kept):

pipe.get_feature_names_out()

array(['mean texture', 'mean concave points', 'mean symmetry',
       'radius error', 'compactness error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst concavity', 'worst concave points',
       'worst symmetry'], dtype=object)

In [6]:
# compare the size of the original feature space with the number
# of features that survive the selection
n_total = X_train.shape[1]
n_selected = len(pipe.get_feature_names_out())

print('total features: {}'.format(n_total))
print('selected features: {}'.format(n_selected))

total features: 30
selected features: 14


In [7]:
# the number of features whose coefficient was shrunk to exactly zero
# by the l1 penalty (read from the estimator fitted inside the selector):

(pipe["selector"].estimator_.coef_ == 0).sum()

np.int64(16)

In [8]:
# Apply the fitted pipeline to both splits. Because the pipeline
# contains the scaler, the returned data is scaled as well as
# reduced to the selected features.
X_train_selected, X_test_selected = (
    pipe.transform(split) for split in (X_train, X_test)
)

X_train_selected.shape, X_test_selected.shape

((426, 14), (143, 14))

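Go ahead and play around with the regularisation strength (`C`) to see if the result changes. Note that `C` is the *inverse* of the penalty strength: smaller values of `C` apply a stronger penalty and shrink more coefficients to zero.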

## Regression

In [9]:
# Load the California house price data: X is a DataFrame of features,
# y is a Series holding the target.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [10]:
# As before, wrap an l1-regularised model in SelectFromModel so that
# training the model and selecting the non-zero features happen together.
#
# Keep in mind that sklearn's LinearRegression does not support
# regularisation, so a regularised linear regression requires
# importing the dedicated "Lasso" estimator.
sel_ = SelectFromModel(estimator=Lasso(alpha=0.01, random_state=10))

# Scale the features first, then select; ask the pipeline to return
# pandas DataFrames from transform().
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selector", sel_),
])
pipe.set_output(transform="pandas")

pipe.fit(X_train, y_train)

In [11]:
# Collect the names of the selected features and summarise the selection.
selected_feat = pipe.get_feature_names_out()

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))

# Read the fitted Lasso through the pipeline rather than through the
# outer `sel_` variable: the pipeline is the object that was fitted, and
# its steps are only guaranteed to be the fitted ones when accessed via
# `named_steps` (e.g. steps are cloned if pipeline caching is enabled).
print('features with coefficients shrank to zero: {}'.format(
    np.sum(pipe.named_steps["selector"].estimator_.coef_ == 0)))

total features: 8
selected features: 7
features with coefficients shrank to zero: 1


In [12]:
# Apply the fitted pipeline to both splits. Because the pipeline
# contains the scaler, the returned data is scaled as well as
# reduced to the selected features.
X_train_selected, X_test_selected = (
    pipe.transform(split) for split in (X_train, X_test)
)

X_train_selected.shape, X_test_selected.shape

((15480, 7), (5160, 7))

In [13]:
# preview the first rows of the transformed training set; the values
# are the output of the pipeline (scaler followed by selector)
X_train_selected.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,AveOccup,Latitude,Longitude
19226,1.808122,-0.764262,0.998207,0.084854,0.006579,1.324919,-1.552206
14549,1.097891,-0.843631,0.440005,0.014431,-0.118332,-1.25207,1.15954
9093,-0.34949,-0.764262,-0.088795,0.245869,-0.011368,-0.442961,0.646103
12213,1.645924,-1.240475,0.393134,-0.194249,-0.027177,-0.990162,1.189449
12765,-0.717009,-0.605525,-0.395943,-0.119839,-0.102345,1.39975,-0.919133
