**Univariate Feature Selection**

In [1]:
# Goal: of the 8 features in the dataset, which are the most important for predicting the class?
# Feature selection with univariate statistical tests (chi-squared, for classification)
from pandas import read_csv
import pandas as pd
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest # for univariate selection
from sklearn.feature_selection import chi2

In [2]:
# Load the patient records; the aim is to find which variables matter most
# for predicting whether a patient has diabetes.
filename = '/content/pima-indians-diabetes.data.csv'
# The raw file carries no column names, so they are supplied here.
# 'class' is the target (Y) variable.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, header=None, names=names)
array = dataframe.to_numpy()
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# (rows, columns): 768 records; 9 columns = 8 features plus the 'class' target
dataframe.shape

(768, 9)

In [4]:
# Split the columns: the first 8 are the features, the last one is the class label.
X, Y = array[:, :8], array[:, 8]

# Univariate selection: score every feature against Y with the chi-squared test
# and keep the 5 best (k must not exceed the total number of features).
test = SelectKBest(chi2, k=5)
fit = test.fit(X, Y)
# Print the per-feature chi-squared scores with 4 decimal places.
set_printoptions(precision=4)
print(fit.scores_)
# Alternative score functions -
#   regression:     f_regression, mutual_info_regression
#   classification: chi2, f_classif, mutual_info_classif
# Reading the scores in column order: 'test' scores highest (2175.5653), followed by
# plas, age, mass and preg; pedi scores lowest.

[ 111.5197 1411.887    17.6054   53.108  2175.5653  127.6693    5.3927
  181.3037]


In [5]:
# Column names, in the same order as the scores printed above ('class' is the target, not a feature)
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [6]:
# Reduce X to the k=5 columns chosen by the fitted selector
features = fit.transform(X)
features.shape

(768, 5)

In [7]:
# Store the chi-squared scores in a DataFrame
x=pd.DataFrame(fit.scores_).T # .T turns the single column of scores into one row with one column per feature


In [8]:
x # columns still carry the default integer labels 0-7; the feature names are not attached yet

Unnamed: 0,0,1,2,3,4,5,6,7
0,111.519691,1411.887041,17.605373,53.10804,2175.565273,127.669343,5.392682,181.303689


In [9]:
# Label the score columns with the 8 feature names (the slice drops the 'class' target)
x.columns=names[:8]
x
# Per the chi-squared scores, 'test' is the most important feature, followed by plas, age, mass, preg

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,111.519691,1411.887041,17.605373,53.10804,2175.565273,127.669343,5.392682,181.303689


In [10]:
# Column names again, for reference
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [11]:
# Reorder the columns by their chi-squared score in row 0, highest first
# (axis=1 sorts across columns; ascending=True would give lowest first)
sorted_x = x.sort_values(by = 0, ascending=False, axis = 1)
sorted_x

Unnamed: 0,test,plas,age,mass,preg,skin,pres,pedi
0,2175.565273,1411.887041,181.303689,127.669343,111.519691,53.10804,17.605373,5.392682


In [12]:
# Display the dataset for reference (the rendered view elides the middle rows)
# NOTE(review): dataframe.head() or dataframe.sample(5) would give the same orientation with less output
dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# Feature Scaling (standardization, Normalization etc.)

In [13]:
# https://www.analyticsvidhya.com/blog/2021/05/feature-scaling-techniques-in-python-a-complete-guide/
# https://www.analyticsvidhya.com/blog/2020/07/types-of-feature-transformation-and-scaling/
# https://medium.datadriveninvestor.com/feature-scaling-in-data-science-5b1e82492727
# https://www.kaggle.com/code/aimack/complete-guide-to-feature-scaling

**Recursive Feature Elimination**

In [24]:
# Recursive Feature Elimination (RFE) with logistic regression as the estimator
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Read the data afresh; the file carries no column names, so they are supplied here.
filename = '/content/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, header=None, names=names)
array = dataframe.to_numpy()
# The first 8 columns are the features, the last one is the class label.
X, Y = array[:, :8], array[:, 8]

# Logistic-regression coefficients are estimated by an iterative optimisation
# algorithm; max_iter raises the iteration cap from its default of 100 to 400.
model = LogisticRegression(max_iter=400)
# No n_features_to_select is passed, so RFE decides how many features to keep.
rfe = RFE(estimator=model)
fit = rfe.fit(X, Y)

In [25]:
# Number of features RFE kept: 4 of the 8 (no n_features_to_select was passed to RFE)
fit.n_features_

4

In [26]:
# Selection mask in column order; True marks a kept feature (here preg, plas, mass, pedi)
fit.support_

array([ True,  True, False, False, False,  True,  True, False])

In [27]:
names # column names, to read the selection mask above against

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [28]:
# Feature ranking in column order: 1 marks the selected features, higher numbers rank lower
# (here age=2, pres=3, test=4, skin=5)
fit.ranking_

array([1, 1, 3, 5, 4, 1, 1, 2])

In [29]:
# Column names, to read the ranking above against
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

#### Feature Importance using Decision Tree

In [30]:
# Feature importance with a single decision tree (DecisionTreeClassifier)
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
# Load the data; the file carries no column names, so they are supplied here.
filename = '/content/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]  # the 8 feature columns
Y = array[:,8]    # the class label
# Fix the seed: the tree's split search is randomised, so without random_state the
# importances (and any ranking read off them) can change from one run to the next.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, Y)
print(model.feature_importances_) # one importance score per feature, in column order

[0.0669 0.3209 0.109  0.0157 0.0291 0.2219 0.1305 0.1058]


In [31]:
# Pair every importance with its feature name BEFORE sorting: sorting the bare
# array loses track of which feature each score belongs to.
importances = pd.Series(model.feature_importances_, index=names[:8])
print('Feature importance=', importances.to_numpy())  # original column order
# Ascending bare values, kept under the original name for anything that still reads it;
# built from a sorted copy so the unsorted scores are never reordered in place.
sorted_list = importances.sort_values().to_numpy()
print('Sorted Feature importance=', sorted_list)
# Named ranking, most important feature first.
importances.sort_values(ascending=False)

Feature importance= [0.0669 0.3209 0.109  0.0157 0.0291 0.2219 0.1305 0.1058]
Sorted Feature importance= [0.0157 0.0291 0.0669 0.1058 0.109  0.1305 0.2219 0.3209]


In [32]:
names # in the importances printed above, plas is the most important feature, then mass, pedi, pres, age (may differ after a re-run)
# next step: find the features common to all three methods

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']