# Univariate feature selection
Univariate feature selection works by selecting the best features based on univariate statistical tests. We compare each feature to the target variable, to see whether there is any statistically significant relationship between them.

The two methods discussed below are:
* Chi-squared (χ²) test — for more info, see [this article](https://machinelearningmastery.com/chi-squared-test-for-machine-learning/)
* ANOVA (analysis of variance) — for more info, see [this article](https://www.analyticsvidhya.com/blog/2018/01/anova-analysis-of-variance/)

### Importing Dataset and pre-processing
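(To read about the Chi-squared method directly, skip ahead to cell 13. The cells in this section must still be run first, because they build `X` and `y`.)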

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the diabetes sample CSV (hosted on GitHub).
DATA_URL = "https://raw.githubusercontent.com/rahul96rajan/sample_datasets/master/diabetes.csv"

# Download the CSV into a DataFrame and preview the first rows.
data = pd.read_csv(DATA_URL)
display(data.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# Per-column summary statistics; note in the `min` row that several
# measurement columns (Glucose, BloodPressure, SkinThickness, Insulin, BMI)
# bottom out at 0.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# Columns whose stored zeros are treated as missing values rather than
# real measurements.
psuedo_zero_features = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Turn those placeholder zeros into NaN so they can be imputed afterwards.
data[psuedo_zero_features] = data[psuedo_zero_features].replace(
    to_replace=0, value=np.nan
)
display(data.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Fill every NaN in the pseudo-zero columns with that column's most
# frequent value (mode imputation).
mode_imputer = SimpleImputer(strategy='most_frequent')
col_trans = ColumnTransformer([('imp_mode', mode_imputer, psuedo_zero_features)])

# The transformer returns a plain array, so re-attach the column names.
imputed_values = col_trans.fit_transform(data)
transformed_df = pd.DataFrame(imputed_values, columns=psuedo_zero_features)
display(transformed_df.head())

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI
0,148.0,72.0,35.0,105.0,33.6
1,85.0,66.0,29.0,105.0,26.6
2,183.0,64.0,32.0,105.0,23.3
3,89.0,66.0,23.0,94.0,28.1
4,137.0,40.0,35.0,168.0,43.1


In [7]:
# Write the imputed columns back into `data`. Both frames carry the default
# 0..n-1 index (`transformed_df` was built from a bare array), so the rows
# line up one-to-one.
data[psuedo_zero_features] = transformed_df
display(data.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,105.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,105.0,26.6,0.351,31,0
2,8,183.0,64.0,32.0,105.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [8]:
# Separate the target column from the feature matrix.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

# Min-max scale every feature column (default range [0, 1]); the chi2
# score function used later needs non-negative inputs.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test
# split that happens inside get_score_logistic_regression.
min_max = MinMaxScaler()
scale_all = ColumnTransformer([('mm_scaler', min_max, X.columns)])

# fit_transform returns a plain array, so rebuild the frame with the
# original column names.
scaled_values = scale_all.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)

In [10]:
# Check the scaled feature frame: dtypes and non-null counts per column.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    float64
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    float64
dtypes: float64(8)
memory usage: 48.1 KB


In [11]:
# X.info() above already reports float64 for every column, so this cast
# changes nothing here; it only makes the expected dtype explicit.
X = X.astype(np.float64)

In [12]:
#Helper function to identify features after transformation
def selected_features(X_new, X_old):
    """Return the names of the columns of ``X_old`` that appear in ``X_new``.

    Columns are matched by value (``np.allclose``), not by name, so this
    works on the unnamed array a feature selector returns.

    Parameters
    ----------
    X_new : array-like of shape (n_samples, n_selected)
        Transformed data; an ndarray or a DataFrame.
    X_old : pandas.DataFrame of shape (n_samples, n_features)
        Original data whose column names are reported.

    Returns
    -------
    list
        Matching column names, in ``X_old`` column order. Each column of
        ``X_old`` contributes its name at most once, even if several
        columns of ``X_new`` hold the same values. Distinct ``X_old``
        columns with (near-)identical values are all reported.
    """
    # Coerce to an ndarray so positional slicing also works for DataFrames.
    new_values = np.asarray(X_new)
    sel_feat = []
    for i in range(X_old.shape[1]):
        old_col = X_old.iloc[:, i]
        # any() stops at the first match, so the name is added only once.
        if any(np.allclose(old_col.values, new_values[:, j])
               for j in range(new_values.shape[1])):
            sel_feat.append(old_col.name)
    return sel_feat
    

### Chi squared Method

In [13]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

# Selector that keeps the 4 features with the highest chi-squared scores.
chi_estimator = SelectKBest(chi2, k=4)

In [14]:
# Score each feature against y and keep only the selected columns. The
# result is used below as a positional array, without column names.
X_new = chi_estimator.fit_transform(X, y)

In [15]:
# Recover the names of the kept columns by matching their values against X.
chi2_selected_features = selected_features(X_new, X)

In [16]:
# Names are listed in X's column order, not ranked by chi-squared score.
print("Feature Selected by Chi2 method: ", chi2_selected_features)

Feature Selected by Chi2 method:  ['Pregnancies', 'Glucose', 'BMI', 'Age']


In [17]:
# Feature subset used later to score the Chi2 selection.
chi2_features = X[chi2_selected_features]

### ANOVA method

In [18]:
from sklearn.feature_selection import f_classif # gives ANOVA-F Values
from sklearn.feature_selection import SelectPercentile

# Selector that keeps the top 80 percent of features, ranked by their
# ANOVA F-value.
anova_estimator = SelectPercentile(f_classif, percentile=80)

In [19]:
# Fit the ANOVA selector on the full feature set. This overwrites the X_new
# produced by the Chi2 step.
X_new = anova_estimator.fit_transform(X, y)

In [20]:
# Recover the names of the kept columns by matching their values against X.
anova_selected_features = selected_features(X_new, X)

In [21]:
# Names are listed in X's column order, not ranked by F-value.
print("Feature Selected by ANOVA method: ", anova_selected_features)

Feature Selected by ANOVA method:  ['Pregnancies', 'Glucose', 'SkinThickness', 'Insulin', 'BMI', 'Age']


In [22]:
# Feature subset used later to score the ANOVA selection.
# NOTE: the variable is spelled `annova_features` (double "n"); the scoring
# cell further down uses the same spelling.
annova_features = X[anova_selected_features]

## Time to test the feature selections

In [23]:
# Helper to fit a logistic regression model and report its test accuracy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def get_score_logistic_regression(Xx, yy):
    """Fit a logistic regression on an 80/20 split and report test accuracy.

    Parameters
    ----------
    Xx : array-like of shape (n_samples, n_features)
        Feature matrix to evaluate.
    yy : array-like of shape (n_samples,)
        Target labels.

    Returns
    -------
    float
        Accuracy on the held-out 20% of rows. The value is also printed.
        In a notebook, end the call with ``;`` if only the printed line is
        wanted.
    """
    # Fixed random_state keeps the split identical across feature subsets,
    # so their scores are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        Xx, yy, test_size=0.2, random_state=1
    )
    log_reg = LogisticRegression().fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print("Score : ", score)
    # Return the score so callers can compare or collect results.
    return score
    

In [24]:
# Baseline: every feature column in X.
print("Having All feature :")
get_score_logistic_regression(X, y)

Having All feature :
Score :  0.7597402597402597


In [25]:
# Only the features kept by the Chi2 selector.
print("Having Chi2 Selected feature :")
get_score_logistic_regression(chi2_features, y)

Having Chi2 Selected feature :
Score :  0.7597402597402597


In [26]:
# Only the features kept by the ANOVA selector.
print("Having ANOVA Selected feature :")
get_score_logistic_regression(annova_features, y)

Having ANOVA Selected feature :
Score :  0.7532467532467533
