In [16]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectFromModel
from feature_selection import embedded_method
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 

## Загрузка данных

**Отбор признаков с помощью алгоритмов** (встроенные методы, embedded methods) — подход, при котором алгоритм использует собственный процесс выбора переменных и выполняет отбор признаков и регрессию/классификацию одновременно. Распространёнными встроенными методами являются Lasso и различные виды деревьев решений. Этот подход характеризуется следующим:

- Выполняет отбор признаков как часть процесса построения модели.
- Учитывает взаимодействия между признаками.
- Менее вычислительно затратен, так как обучает модель только один раз, в отличие от методов обёртки (Wrappers).
- Обычно предоставляет наилучший поднабор признаков для данного алгоритма машинного обучения, но, возможно, не для другого.

In [3]:
from sklearn.datasets import load_breast_cancer

# Build a single DataFrame: the feature columns followed by a 'target' column.
# The loader's return value gets its own name so `data` only ever refers to the frame.
bunch = load_breast_cancer()
data = pd.DataFrame(
    np.c_[bunch['data'], bunch['target']],
    columns=np.append(bunch['feature_names'], ['target']),
)

In [4]:
# Preview the first five rows (five is the default row count of head()).
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [4]:
# Hold out 20% of the rows as a test split, seeded with random_state=0.
# Features are every column except 'target'; the label is the 'target' column.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.2,
    random_state=0,
)
X_train.shape, X_test.shape

((455, 30), (114, 30))

## Lasso

Lasso (L1-регуляризация) позволяет сократить некоторые коэффициенты до 0 и тем самым избавиться от лишних признаков.

In [6]:
# Fit the scaler on the training split only.
scaler = RobustScaler().fit(X_train)
# Bare name as the last expression so the cell still echoes the fitted scaler.
scaler

RobustScaler()

In [20]:
# Feature selector driven by an L1-penalised logistic regression.
# random_state is pinned because the liblinear solver shuffles the data
# internally; without a seed the fitted coefficients (and therefore the
# selected features) are not guaranteed to repeat between runs.
sel_ = SelectFromModel(
    LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=42)
)
# The estimator is trained on the scaled training features.
sel_.fit(scaler.transform(X_train), y_train)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [21]:
# Column names kept by the L1-based selector.
selected_feat = X_train.columns[sel_.get_support()]

# Report: total feature count, number kept, and number of exactly-zero coefficients.
print('Всего признаков: {}'.format(len(X_train.columns)))
print('Выбранные признаки: {}'.format(selected_feat.size))
print('Признаки с 0 коэффициентами: {}'.format(
    (sel_.estimator_.coef_ == 0).sum()))

Всего признаков: 30
Выбранные признаки: 14
Признаки с 0 коэффициентами: 16


In [22]:
# Columns whose fitted coefficient is exactly zero.
zero_coef_mask = (sel_.estimator_.coef_ == 0).ravel()
removed_feats = X_train.columns[zero_coef_mask]
removed_feats

Index(['mean radius', 'mean perimeter', 'mean area', 'mean smoothness',
       'mean compactness', 'mean concavity', 'mean fractal dimension',
       'texture error', 'perimeter error', 'smoothness error',
       'concavity error', 'concave points error', 'symmetry error',
       'worst radius', 'worst perimeter', 'worst compactness'],
      dtype='object')

In [23]:
# Apply the fitted selector to both splits; the outputs below show 14 columns remaining.
# Note: the frames passed here are the original (unscaled) X_train / X_test.
X_train_selected, X_test_selected = (
    sel_.transform(split) for split in (X_train, X_test)
)

X_train_selected.shape, X_test_selected.shape

((455, 14), (114, 14))

## Random Forest


In [27]:
# Select features using a feature-importance threshold of 0.1.
# SelectFromModel is already imported in the notebook's import cell,
# so it is not re-imported here.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# prefit=True: the selector reuses the forest fitted above instead of refitting it.
feature_selection = SelectFromModel(model, threshold=0.1, prefit=True)
selected_feat = X_train.columns[feature_selection.get_support()]
selected_feat

Index(['worst area', 'worst concave points'], dtype='object')

In [30]:
# Same fitted model, but with the threshold set to twice the median importance.
feature_selection2 = SelectFromModel(
    estimator=model, threshold='2*median', prefit=True
)
selected_feat2 = X_train.columns[feature_selection2.get_support()]
selected_feat2

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst concavity', 'worst concave points'],
      dtype='object')

## Gradient Boosting

In [34]:
# Select features using a feature-importance threshold of 0.01
# (the previous comment said 0.1, which did not match the code).
model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# prefit=True: the selector reuses the boosted model fitted above instead of refitting it.
feature_selection = SelectFromModel(model, threshold=0.01, prefit=True)
selected_feat = X_train.columns[feature_selection.get_support()]
selected_feat

Index(['mean area', 'mean smoothness', 'mean symmetry', 'symmetry error',
       'worst texture', 'worst perimeter', 'worst area', 'worst concavity',
       'worst concave points'],
      dtype='object')

In [36]:
# Same fitted model, but with the threshold set to twice the median importance.
feature_selection2 = SelectFromModel(
    estimator=model, threshold='2*median', prefit=True
)
selected_feat2 = X_train.columns[feature_selection2.get_support()]
selected_feat2

Index(['mean perimeter', 'mean area', 'mean smoothness', 'mean symmetry',
       'area error', 'symmetry error', 'worst texture', 'worst perimeter',
       'worst area', 'worst concavity', 'worst concave points'],
      dtype='object')