heart data : https://www.kaggle.com/ronitf/heart-disease-uci

In [47]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import chi2, SelectKBest
from sklearn.feature_selection import f_classif, SelectPercentile

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [48]:
heart_data = pd.read_csv('datasets/heart.csv')

heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [49]:
x = heart_data.drop(["target"], axis = 1)

y = heart_data["target"]

In [50]:
x.shape

(303, 13)

In [51]:
x = x.astype(np.float64)

### Use chi2 feature selection to select the K-best features

A chi-square test is used in statistics to test the independence of two events. Given the data of two variables, we can get observed count O and expected count E. Chi-Square measures how expected count E and observed count O deviate from each other.

In [52]:
test = SelectKBest(score_func = chi2, k = 5)

fit = test.fit(x, y)

In [53]:
fit.scores_

array([ 23.28662399,   7.57683451,  62.59809791,  14.8239245 ,
        23.93639448,   0.20293368,   2.97827075, 188.32047169,
        38.91437697,  72.64425301,   9.8040952 ,  66.44076512,
         5.79185297])

In [54]:
feature_score = pd.DataFrame()

for i in range(x.shape[1]):
    new = pd.DataFrame({'Features' : x.columns[i],
                        'Score' : fit.scores_[i]}, index=[i])
    
    feature_score = pd.concat([feature_score, new])

In [55]:
feature_score = feature_score.sort_values(by="Score")

feature_score

Unnamed: 0,Features,Score
5,fbs,0.202934
6,restecg,2.978271
12,thal,5.791853
1,sex,7.576835
10,slope,9.804095
3,trestbps,14.823925
0,age,23.286624
4,chol,23.936394
8,exang,38.914377
2,cp,62.598098


In [56]:
x_new = fit.transform(x)

fit.get_support()

array([False, False,  True, False, False, False, False,  True,  True,
        True, False,  True, False])

In [57]:
x.columns[fit.get_support()]

Index(['cp', 'thalach', 'exang', 'oldpeak', 'ca'], dtype='object')

In [58]:
chi2_best_features = pd.DataFrame(x_new, columns=x.columns[fit.get_support()])

chi2_best_features.head(5)

Unnamed: 0,cp,thalach,exang,oldpeak,ca
0,3.0,150.0,0.0,2.3,0.0
1,2.0,187.0,0.0,3.5,0.0
2,1.0,172.0,0.0,1.4,0.0
3,1.0,178.0,0.0,0.8,0.0
4,0.0,163.0,1.0,0.6,0.0


### Use the ANOVA feature selection technique to select the features in the top 25%

f_classif is used only for classification models

In [59]:
test = SelectPercentile(f_classif, percentile = 25)

fit = test.fit(x, y)

fit.scores_

array([16.11669982, 25.79219115, 69.77227149,  6.45816867,  2.20298345,
        0.23694234,  5.77720891, 65.1201044 , 70.95243822, 68.55143941,
       40.90207063, 54.5598338 , 40.40769615])

In [60]:
feature_score = pd.DataFrame()

for i in range(x.shape[1]):
    new = pd.DataFrame({'Features' : x.columns[i],
                        'Score' : fit.scores_[i]}, index=[i])
    
    feature_score = pd.concat([feature_score, new])

In [61]:
feature_score = feature_score.sort_values(by="Score")

feature_score

Unnamed: 0,Features,Score
5,fbs,0.236942
4,chol,2.202983
6,restecg,5.777209
3,trestbps,6.458169
0,age,16.1167
1,sex,25.792191
12,thal,40.407696
10,slope,40.902071
11,ca,54.559834
7,thalach,65.120104


In [62]:
x_new = fit.transform(x)

fit.get_support()

array([False, False,  True, False, False, False, False, False,  True,
        True, False, False, False])

In [63]:
x.columns[fit.get_support()]

Index(['cp', 'exang', 'oldpeak'], dtype='object')

In [64]:
f_classif_best_features = pd.DataFrame(x_new, columns=x.columns[fit.get_support()])

f_classif_best_features.head()

Unnamed: 0,cp,exang,oldpeak
0,3.0,0.0,2.3
1,2.0,0.0,3.5
2,1.0,0.0,1.4
3,1.0,0.0,0.8
4,0.0,1.0,0.6


In [65]:
def build_model(x, y, test_frac):
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_frac)
    
    model = LogisticRegression(solver='liblinear').fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    print("Test_score : ", accuracy_score(y_test, y_pred))

In [66]:
build_model(x, y, 0.25)

Test_score :  0.8157894736842105


In [67]:
build_model(chi2_best_features, y, 0.25)

Test_score :  0.75


In [68]:
build_model(f_classif_best_features, y, 0.25)

Test_score :  0.7763157894736842
