## Seleção de características

In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Fetch the California housing data and wrap it in pandas objects so the
# feature names travel with the values.
data = fetch_california_housing()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name="target")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [2]:
from sklearn.feature_selection import VarianceThreshold
# Fit a variance-based selector on the training features. With threshold=0.2
# it flags low-variance features for removal, not only strictly constant ones.
sel = VarianceThreshold(threshold=0.2).fit(X_train)
# Names of the features that the selector keeps
sel.get_feature_names_out()

array(['MedInc', 'HouseAge', 'AveRooms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'], dtype=object)

In [3]:
# Drop the low-variance features (not only strictly constant ones) from the
# training set. Assumes `sel` is still the VarianceThreshold selector fitted in
# the previous cell, i.e. the cells are run in order.
X_train_t = sel.transform(X_train)

In [4]:
# Attach the target column to the training features (joined on the index).
df = X_train.join(y_train)
# Pearson correlation of every column with the target, sorted from the most
# positive to the most negative value.
(
    df.corr(method="pearson")
    .loc[:, "target"]
    .sort_values(ascending=False)
)

target        1.000000
MedInc        0.690647
AveRooms      0.158485
HouseAge      0.103706
AveOccup     -0.022030
Population   -0.026032
Longitude    -0.046349
AveBedrms    -0.051351
Latitude     -0.142983
Name: target, dtype: float64

In [5]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
# Standardise the features before the Lasso fit; the scaler is fitted on the
# training data only.
scaler = StandardScaler().fit(X_train)

# Fit a Lasso model on the scaled training data and let SelectFromModel decide
# which features to keep based on the fitted model.
sel = SelectFromModel(Lasso(alpha=0.1, random_state=42)).fit(
    scaler.transform(X_train),
    y_train,
)
# Column names of the features kept by the selector
X_train.columns[sel.get_support()]

Index(['MedInc', 'HouseAge', 'Latitude'], dtype='object')

In [6]:
# Scale the training features with the fitted scaler, then keep only the
# columns chosen by the selector. Assumes `sel` is the Lasso-based
# SelectFromModel from the previous cell, i.e. the cells are run in order.
X_train_s = sel.transform(scaler.transform(X_train))