# Автоматизируем выбор лучших признаков для модели

Импорт библиотек

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

Загружаем данные

In [None]:
# Read the train and test CSV files (in that order) into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/mobile-price-classification/train.csv',
        '/kaggle/input/mobile-price-classification/test.csv',
    )
)
# Last expression of the cell: the notebook displays both shapes.
f'df_train shape: {df_train.shape}!    df_test shape: {df_test.shape}!'

In [None]:
# Print pandas' summary of the training frame to check its columns before selecting features.
df_train.info()

* battery_power: Total energy a battery can store in one time measured in mAh
* blue: Has Bluetooth or not
* clock_speed: the speed at which microprocessor executes instructions
* dual_sim: Has dual sim support or not
* fc: Front Camera megapixels
* four_g: Has 4G or not
* int_memory: Internal Memory in Gigabytes
* m_dep: Mobile Depth in cm
* mobile_wt: Weight of mobile phone
* n_cores: Number of cores of the processor
* pc: Primary Camera megapixels
* px_height: Pixel Resolution Height
* px_width: Pixel Resolution Width
* ram: Random Access Memory in MegaBytes
* sc_h: Screen Height of mobile in cm
* sc_w: Screen Width of mobile in cm
* talk_time: the longest time that a single battery charge will last when you are talking
* three_g: Has 3G or not
* touch_screen: Has touch screen or not
* wifi: Has wifi or not
* price_range: This is the target variable with a value of 0(low cost), 1(medium cost), 2(high cost) and 3(very high cost).

## Приступаем к изучению методов селекции признаков

**Выбор признаков на статистической близости к целевой переменной**

In [None]:
# Univariate selection: pick the K best features by their statistical closeness to the target.
'''функция для отбора К лучших признаков по их статистической близости к целевой переменной'''
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2 # for selecting categorical input features
from sklearn.feature_selection import f_regression # numeric input features and a numeric target
from sklearn.feature_selection import f_classif # numeric input features and a categorical target

In [None]:
# Split the training frame into the feature matrix and the target column.
X = df_train.iloc[:, 0:20]  # the first 20 columns are the input features
y = df_train.iloc[:, -1]    # the last column (price_range) is the target

# Fit two univariate selectors, each keeping the 10 highest-scoring features:
# one scored with the chi-square test, one with the ANOVA F-test.
bestfeatures = SelectKBest(score_func=chi2, k=10)
# price_range is a class label (0..3), so the F-test for classification
# (f_classif) is the matching scorer; f_regression assumes a numeric target.
bestfeatures2 = SelectKBest(score_func=f_classif, k=10)
fit = bestfeatures.fit(X, y)
fit2 = bestfeatures2.fit(X, y)

# Collect both score vectors in one table indexed by feature name; later cells
# add their own columns to this table.
featureScores = pd.DataFrame(
    {'Specs': X.columns, 'Score1': fit.scores_, 'Score2': fit2.scores_}
).set_index('Specs')

# Show the 10 best features according to each score.
print(featureScores.nlargest(10, 'Score1'))
print(featureScores.nlargest(10, 'Score2'))

Выбор лучших признаков на основе рекурсивной перекрестной проверки на выбранной модели

In [None]:
# Recursive feature elimination; feature significance is evaluated with cross-validation.
'''рекурсивное удаление признаков. Значимость признаков расчитывается на основе перекрестной проверки'''
from sklearn.feature_selection import RFECV
# Import several different models that can serve as the estimator.
'''выбираем разные модели для оценщика'''
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import ElasticNet, ARDRegression, HuberRegressor, Lasso, \
                                 LogisticRegression, LinearRegression, RANSACRegressor

In [None]:
# Recursive feature elimination driven by a decision tree: one feature is
# dropped per step and each subset is scored by 3-fold cross-validated accuracy.
# (SVC(kernel="linear", C=1) is an alternative estimator for this step.)
dt = DecisionTreeClassifier()
rfe = RFECV(estimator=dt, step=1, cv=3, scoring='accuracy').fit(X, y)
print("Optimal number of features : %d" % rfe.n_features_)

# Boolean mask of the retained features, stored next to the other scores.
rfe_support = rfe.get_support()
featureScores['rfe'] = rfe_support
# Names of the retained features.
rfe_feature = X.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')

Выбор признаков на основе обучения модели. Без рекурсивного уменьшения количества признаков на каждом следующем этапе.

In [None]:
from sklearn.feature_selection import SelectFromModel

# Model-based selection: fit each estimator once on all features and keep at
# most 10 of them per model.
embeded_lr_selector = SelectFromModel(LogisticRegression(), max_features=10).fit(X, y)
embeded_svc_selector = SelectFromModel(SVC(kernel="linear", C=1), max_features=10).fit(X, y)
embeded_rf_selector = SelectFromModel(RandomForestClassifier(), max_features=10).fit(X, y)

# Store every selector's boolean support mask as a column of the summary table.
for column, selector in (
    ("LogReg", embeded_lr_selector),
    ("SVC", embeded_svc_selector),
    ("rf", embeded_rf_selector),
):
    featureScores[column] = selector.get_support()

# Last expression of the cell: the notebook displays the table.
featureScores

**Подход на основе отдельной модели со встроенным методом важности признаков**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Fit an extra-trees classifier and read its built-in feature importances.
model = ExtraTreesClassifier().fit(X, y)
importances = model.feature_importances_
print(importances)

# Keep the importances in the summary table and plot the ten largest as
# horizontal bars.
featureScores["ET"] = importances
featureScores["ET"].nlargest(10).plot.barh()
plt.show()

In [None]:
# Show the extra-trees importances for all features, largest first.
featureScores["ET"].sort_values(ascending=False)

In [None]:
import seaborn as sns

# Pairwise correlation matrix over the columns of the training frame
# (features and target).
corrmat = df_train.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
# Plot the heat map. corrmat already holds the correlations of exactly the
# columns in top_corr_features, so it is drawn directly instead of being
# computed a second time.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [None]:
# Correlation of every column with the target, computed once and reused for
# both the mask and the selection.
price_corr = df_train[top_corr_features].corr().loc['price_range']
# Keep only the entries whose correlation with price_range exceeds 0.1
# (negative correlations never pass this filter).
price_corr[price_corr > 0.1]

https://towardsdatascience.com/the-5-feature-selection-algorithms-every-data-scientist-need-to-know-3a6b566efd2

https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/

In [None]:
# Show the full summary table (all scores and selection masks), ordered by
# extra-trees importance, largest first.
featureScores.sort_values("ET", ascending=False)