## Sequential Backward Elimination

In [2]:
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
import pandas as pd
import numpy as np

In [3]:
df =pd.read_csv("Heart_Attack_Prediction.csv")
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test =train_test_split(df.iloc[:,:-1], df.iloc[:, -1], test_size =0.2, random_state=1)

In [5]:
from sklearn.preprocessing import StandardScaler
s =StandardScaler()
X_train.iloc[:, :] =s.fit_transform(X_train.iloc[:, :])
X_test.iloc[:, :] =s.fit_transform(X_test.iloc[:, :])

In [7]:
from sklearn.impute import SimpleImputer
imputer =SimpleImputer(strategy="mean")

X_train.iloc[:,:] = imputer.fit_transform(X_train.iloc[:,:])
X_test.iloc[:,:]  =imputer.fit_transform(X_test.iloc[:,:])

In [8]:
lr =LinearRegression()
# best means--> I need best output can specify number
backward =SequentialFeatureSelector(lr, cv=5, scoring="r2", floating=False, forward=False)
backward.fit(X_train, Y_train)

In [9]:
# Sorting based on avg score
df =pd.DataFrame.from_dict(backward.get_metric_dict()).T
df.sort_values("avg_score", ascending=False)

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
10,"(1, 2, 3, 4, 6, 7, 8, 9, 11, 12)","[0.47631250651518886, 0.3940165021418607, 0.43...",0.49537,"(sex, cp, trtbps, chol, restecg, thalachh, exn...",0.101755,0.079169,0.039584
9,"(1, 2, 3, 6, 7, 8, 9, 11, 12)","[0.47479444718145714, 0.39669623827516765, 0.4...",0.495222,"(sex, cp, trtbps, restecg, thalachh, exng, old...",0.101333,0.078841,0.03942
11,"(1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12)","[0.48048263160132465, 0.38485253175030854, 0.4...",0.494307,"(sex, cp, trtbps, chol, restecg, thalachh, exn...",0.106062,0.08252,0.04126
8,"(1, 2, 3, 7, 8, 9, 11, 12)","[0.46346070380229554, 0.37807889152817686, 0.4...",0.493888,"(sex, cp, trtbps, thalachh, exng, oldpeak, caa...",0.098954,0.07699,0.038495
12,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[0.47231776745597265, 0.3853723205339632, 0.43...",0.491922,"(sex, cp, trtbps, chol, fbs, restecg, thalachh...",0.10565,0.082199,0.0411
7,"(1, 2, 3, 7, 8, 9, 11)","[0.4863663207933474, 0.3621339311237186, 0.456...",0.488604,"(sex, cp, trtbps, thalachh, exng, oldpeak, caa)",0.099433,0.077363,0.038681
13,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[0.47513612534920013, 0.36430853007985964, 0.4...",0.482983,"(age, sex, cp, trtbps, chol, fbs, restecg, tha...",0.107566,0.08369,0.041845
6,"(1, 2, 3, 8, 9, 11)","[0.48678408181901023, 0.42608597118664016, 0.4...",0.481337,"(sex, cp, trtbps, exng, oldpeak, caa)",0.068794,0.053524,0.026762
5,"(1, 2, 8, 9, 11)","[0.47246772702379747, 0.39608949048815323, 0.4...",0.472495,"(sex, cp, exng, oldpeak, caa)",0.068043,0.052939,0.02647
4,"(1, 2, 9, 11)","[0.4700563190456163, 0.4573065171083265, 0.441...",0.446152,"(sex, cp, oldpeak, caa)",0.044422,0.034562,0.017281


In [10]:
# sorting based on adjusted r2
def cal_adjustedr2(r2_score, num_of_featues, samples):
    coef =(samples-1)/(samples-num_of_featues-1)
    return 1-(1-r2_score)*coef
df["adjusted_r2"] =cal_adjustedr2(df["avg_score"], df["feature_idx"].apply(lambda x: len(x)),404 )
df.sort_values("adjusted_r2",ascending=False)

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err,adjusted_r2
9,"(1, 2, 3, 6, 7, 8, 9, 11, 12)","[0.47479444718145714, 0.39669623827516765, 0.4...",0.495222,"(sex, cp, trtbps, restecg, thalachh, exng, old...",0.101333,0.078841,0.03942,0.483691
8,"(1, 2, 3, 7, 8, 9, 11, 12)","[0.46346070380229554, 0.37807889152817686, 0.4...",0.493888,"(sex, cp, trtbps, thalachh, exng, oldpeak, caa...",0.098954,0.07699,0.038495,0.483637
10,"(1, 2, 3, 4, 6, 7, 8, 9, 11, 12)","[0.47631250651518886, 0.3940165021418607, 0.43...",0.49537,"(sex, cp, trtbps, chol, restecg, thalachh, exn...",0.101755,0.079169,0.039584,0.48253
11,"(1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12)","[0.48048263160132465, 0.38485253175030854, 0.4...",0.494307,"(sex, cp, trtbps, chol, restecg, thalachh, exn...",0.106062,0.08252,0.04126,0.480117
7,"(1, 2, 3, 7, 8, 9, 11)","[0.4863663207933474, 0.3621339311237186, 0.456...",0.488604,"(sex, cp, trtbps, thalachh, exng, oldpeak, caa)",0.099433,0.077363,0.038681,0.479564
12,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[0.47231776745597265, 0.3853723205339632, 0.43...",0.491922,"(sex, cp, trtbps, chol, fbs, restecg, thalachh...",0.10565,0.082199,0.0411,0.476328
6,"(1, 2, 3, 8, 9, 11)","[0.48678408181901023, 0.42608597118664016, 0.4...",0.481337,"(sex, cp, trtbps, exng, oldpeak, caa)",0.068794,0.053524,0.026762,0.473498
5,"(1, 2, 8, 9, 11)","[0.47246772702379747, 0.39608949048815323, 0.4...",0.472495,"(sex, cp, exng, oldpeak, caa)",0.068043,0.052939,0.02647,0.465868
13,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[0.47513612534920013, 0.36430853007985964, 0.4...",0.482983,"(age, sex, cp, trtbps, chol, fbs, restecg, tha...",0.107566,0.08369,0.041845,0.465749
4,"(1, 2, 9, 11)","[0.4700563190456163, 0.4573065171083265, 0.441...",0.446152,"(sex, cp, oldpeak, caa)",0.044422,0.034562,0.017281,0.4406


In [11]:
lr =LinearRegression()
# best means--> I need best output can specify number
backward =SequentialFeatureSelector(lr, cv=5, scoring="neg_mean_squared_error", floating=False, forward=False)
backward.fit(X_train, Y_train)



In [12]:
# Mean sqaured error
df =pd.DataFrame.from_dict(backward.get_metric_dict()).T
df.sort_values("avg_score", ascending=True)


Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(2,)","[-0.1664309269102458, -0.1947453567581894, -0....",-0.192563,"(cp,)",0.018684,0.014536,0.007268
2,"(2, 9)","[-0.1604259137591304, -0.15945249804244999, -0...",-0.161278,"(cp, oldpeak)",0.006313,0.004912,0.002456
3,"(2, 9, 11)","[-0.15335184530634338, -0.14733228422572803, -...",-0.145802,"(cp, oldpeak, caa)",0.014197,0.011046,0.005523
4,"(1, 2, 9, 11)","[-0.12978212594801233, -0.1356168636963782, -0...",-0.136114,"(sex, cp, oldpeak, caa)",0.009142,0.007113,0.003556
5,"(1, 2, 8, 9, 11)","[-0.12919157705539652, -0.15091474623369766, -...",-0.129783,"(sex, cp, exng, oldpeak, caa)",0.017801,0.01385,0.006925
6,"(1, 2, 3, 8, 9, 11)","[-0.12568553098309954, -0.14341874939109367, -...",-0.127662,"(sex, cp, trtbps, exng, oldpeak, caa)",0.018554,0.014436,0.007218
13,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[-0.1285380917512163, -0.1588566771978693, -0....",-0.127363,"(age, sex, cp, trtbps, chol, fbs, restecg, tha...",0.028363,0.022067,0.011034
7,"(1, 2, 3, 7, 8, 9, 11)","[-0.12578783980571084, -0.1594001005105243, -0...",-0.125984,"(sex, cp, trtbps, thalachh, exng, oldpeak, caa)",0.02648,0.020603,0.010301
12,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[-0.1292283018475169, -0.15359292281533612, -0...",-0.125172,"(sex, cp, trtbps, chol, fbs, restecg, thalachh...",0.027884,0.021695,0.010847
8,"(1, 2, 3, 7, 8, 9, 11, 12)","[-0.1313973786606623, -0.15541552065101785, -0...",-0.124671,"(sex, cp, trtbps, thalachh, exng, oldpeak, caa...",0.026188,0.020375,0.010188


In [13]:

lr =LinearRegression()
# best means--> I need best output can specify number
backward =SequentialFeatureSelector(lr, cv=5, scoring="neg_mean_absolute_error", floating=False, forward=False)
backward.fit(X_train, Y_train)

In [14]:
# Based on mean sqaure error
df =pd.DataFrame.from_dict(backward.get_metric_dict()).T
df.sort_values("avg_score", ascending=True)

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(2,)","[-0.3520760668897198, -0.3751015425807305, -0....",-0.38492,"(cp,)",0.024482,0.019048,0.009524
2,"(2, 9)","[-0.32670316832732604, -0.3235908902377881, -0...",-0.332078,"(cp, oldpeak)",0.008246,0.006415,0.003208
3,"(2, 9, 11)","[-0.3188411645316859, -0.3095776125808783, -0....",-0.307689,"(cp, oldpeak, caa)",0.011729,0.009126,0.004563
4,"(2, 8, 9, 11)","[-0.29764022990161154, -0.312439542538992, -0....",-0.288796,"(cp, exng, oldpeak, caa)",0.021196,0.016491,0.008246
13,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[-0.28271033058087636, -0.3186345655164251, -0...",-0.287243,"(age, sex, cp, trtbps, chol, fbs, restecg, tha...",0.023158,0.018018,0.009009
5,"(2, 8, 9, 11, 12)","[-0.29973142416138643, -0.3155698848831938, -0...",-0.286739,"(cp, exng, oldpeak, caa, thall)",0.025229,0.019629,0.009815
12,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[-0.2840210320812943, -0.3125549478486, -0.292...",-0.284562,"(sex, cp, trtbps, chol, fbs, restecg, thalachh...",0.021893,0.017033,0.008517
6,"(2, 7, 8, 9, 11, 12)","[-0.30621258922117733, -0.3162423214829596, -0...",-0.28355,"(cp, thalachh, exng, oldpeak, caa, thall)",0.030038,0.02337,0.011685
11,"(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[-0.30483007449437527, -0.30790719068141986, -...",-0.282486,"(cp, trtbps, chol, fbs, restecg, thalachh, exn...",0.02881,0.022416,0.011208
10,"(2, 3, 4, 6, 7, 8, 9, 10, 11, 12)","[-0.3039594815660236, -0.3082337795718055, -0....",-0.281907,"(cp, trtbps, chol, restecg, thalachh, exng, ol...",0.029292,0.02279,0.011395
