# Sequential Feature Selector (SFS)

## Read data

In [1]:
import pandas as pd
import psycopg2

# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic4'
hostname = 'localhost'
port_number = 5434
schema_name = 'sepsis_micro'

# Connect to postgres with a copy of the MIMIC-III database
con = psycopg2.connect(dbname=dbname, user=sqluser, host=hostname, port=port_number, password='mysecretpassword')

# the below statement is prepended to queries to ensure they select from the right schema
query_schema = 'set search_path to ' + schema_name + ';'

dataQuery = """select * from sepsis_micro.data_matrix_qc_1;"""
dataDf = pd.read_sql_query(dataQuery, con)
dataDf

Unnamed: 0,micro_specimen_id,person_id,seven_day_mortality,fourteen_day_mortality,twentyone_day_mortality,twentyeight_day_mortality,sixty_day_mortality,ninety_day_mortality,onetwenty_day_mortality,Ambulatory Clinic / Center,...,hemoglobin_first,creatinine_first,potassium_last,chloride_last,glucose_last,sodium_last,bicarbonate_last,hemoglobin_last,creatinine_last,time_to_antibiotic
0,2940,-470494077,0,0,0,1,1,1,1,0,...,-0.586379,2.441041,-0.053360,-0.804726,0.726778,-0.800329,0.048541,-0.857543,-0.077536,0.018778
1,3236,-762840899,0,1,1,1,1,1,1,0,...,-0.942367,-0.066982,1.677597,-1.840315,0.615396,-1.745000,0.621905,-1.037712,0.988716,0.018778
2,3720,-1481508426,0,0,0,0,0,0,0,0,...,-0.535524,-0.345652,-1.438125,-1.692373,1.376502,-0.989263,1.004149,-0.377094,-0.290787,0.022081
3,5214,2090709572,0,0,0,0,0,0,0,0,...,-0.789801,-0.554654,0.292832,-0.508844,-0.164272,-1.745000,-1.098189,-0.677375,-0.504037,-1.107961
4,10611,-961491032,0,0,0,0,0,0,0,0,...,1.091850,-0.763656,-0.399551,1.414391,0.763905,0.333277,-0.333703,0.043300,-0.859455,0.034054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7097,9986222,-1199717531,0,0,0,0,0,0,0,0,...,1.447838,-0.554654,0.465927,-0.065021,-0.498416,0.900080,2.533121,-0.377094,-0.575121,0.017126
7098,9989349,-1246337626,0,0,0,0,0,0,0,0,...,1.244416,1.953370,-0.399551,-1.248550,1.357938,-1.745000,-0.715946,0.463693,1.841718,0.031577
7099,9989996,-1704051650,1,1,1,1,1,1,1,0,...,-1.247500,-0.693988,0.119736,0.082921,-0.758306,-1.178197,-0.907067,-1.337993,-0.646204,0.018778
7100,9996881,768076701,0,0,1,1,1,1,1,0,...,-0.586379,-0.833323,-0.572647,-0.656785,-0.609797,0.144343,2.150878,-0.437150,-0.788371,0.031990


## Extract data

In [2]:
X = dataDf.drop(['micro_specimen_id', 'person_id', 'seven_day_mortality', 'fourteen_day_mortality', 'twentyone_day_mortality', 'twentyeight_day_mortality', 'sixty_day_mortality', 'ninety_day_mortality', 'onetwenty_day_mortality'], axis = 1)
y_seven_day = dataDf['seven_day_mortality']
y_fourteen_day = dataDf['fourteen_day_mortality']
y_twentyone_day = dataDf['twentyone_day_mortality']
y_twentyeight_day = dataDf['twentyeight_day_mortality']
y_sixty_day = dataDf['sixty_day_mortality']
y_ninety_day = dataDf['ninety_day_mortality']
y_onetwenty_day = dataDf['onetwenty_day_mortality']

## Seven day mortality prediction

In [5]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier

sfs = SequentialFeatureSelector(estimator = DecisionTreeClassifier(), n_features_to_select=10)
sfs.fit(X, y_seven_day)

SequentialFeatureSelector(estimator=DecisionTreeClassifier(),
                          n_features_to_select=10)

In [10]:
sfs.get_feature_names_out()

array(['Ambulatory Clinic / Center', 'Inpatient Visit', 'EU OBSERVATION',
       'PHYSICIAN REFERRAL', 'visit_duration_hrs', 'gcs_motor_last',
       'chloride_max', 'glucose_min', 'sodium_last', 'bicarbonate_last'],
      dtype=object)

In [15]:
# summarize all features
data = []
columns = X.columns
for i in range(X.shape[1]):
    data.append([columns[i], sfs.get_support()[i]])
feDf = pd.DataFrame(data, columns=['Column', 'Selected'])
feDf

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
1,Ambulatory Surgical Center,False
2,Emergency Room - Hospital,False
3,Emergency Room and Inpatient Visit,False
4,Inpatient Visit,True
...,...,...
154,sodium_last,True
155,bicarbonate_last,True
156,hemoglobin_last,False
157,creatinine_last,False


In [16]:
feDf[feDf.Selected]

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
4,Inpatient Visit,True
9,EU OBSERVATION,True
20,PHYSICIAN REFERRAL,True
25,visit_duration_hrs,True
113,gcs_motor_last,True
117,chloride_max,True
125,glucose_min,True
154,sodium_last,True
155,bicarbonate_last,True


## Fourteen day mortality prediction

In [17]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier

sfs = SequentialFeatureSelector(estimator = DecisionTreeClassifier(), n_features_to_select=10)
sfs.fit(X, y_fourteen_day)

SequentialFeatureSelector(estimator=DecisionTreeClassifier(),
                          n_features_to_select=10)

In [18]:
sfs.get_feature_names_out()

array(['Emergency Room - Hospital', 'AMBULATORY OBSERVATION',
       'CLINIC REFERRAL', 'INTERNAL TRANSFER TO OR FROM PSYCH', 'PACU',
       'PROCEDURE SITE', 'visit_duration_hrs', 'gcs_motor_last',
       'chloride_max', 'glucose_min'], dtype=object)

In [19]:
# summarize all features
data = []
columns = X.columns
for i in range(X.shape[1]):
    data.append([columns[i], sfs.get_support()[i]])
feDf = pd.DataFrame(data, columns=['Column', 'Selected'])
feDf

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,False
1,Ambulatory Surgical Center,False
2,Emergency Room - Hospital,True
3,Emergency Room and Inpatient Visit,False
4,Inpatient Visit,False
...,...,...
154,sodium_last,False
155,bicarbonate_last,False
156,hemoglobin_last,False
157,creatinine_last,False


In [20]:
feDf[feDf.Selected]

Unnamed: 0,Column,Selected
2,Emergency Room - Hospital,True
6,AMBULATORY OBSERVATION,True
15,CLINIC REFERRAL,True
18,INTERNAL TRANSFER TO OR FROM PSYCH,True
19,PACU,True
21,PROCEDURE SITE,True
25,visit_duration_hrs,True
113,gcs_motor_last,True
117,chloride_max,True
125,glucose_min,True


## Twentyone day mortality prediction

In [21]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier

sfs = SequentialFeatureSelector(estimator = DecisionTreeClassifier(), n_features_to_select=10)
sfs.fit(X, y_twentyone_day)

SequentialFeatureSelector(estimator=DecisionTreeClassifier(),
                          n_features_to_select=10)

In [22]:
sfs.get_feature_names_out()

array(['Ambulatory Clinic / Center', 'SURGICAL SAME DAY ADMISSION',
       'CLINIC REFERRAL', 'INFORMATION NOT AVAILABLE', 'PACU',
       'visit_duration_hrs', 'gcs_eye_max', 'gcs_motor_last',
       'gcs_eye_last', 'glucose_min'], dtype=object)

In [23]:
# summarize all features
data = []
columns = X.columns
for i in range(X.shape[1]):
    data.append([columns[i], sfs.get_support()[i]])
feDf = pd.DataFrame(data, columns=['Column', 'Selected'])
feDf

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
1,Ambulatory Surgical Center,False
2,Emergency Room - Hospital,False
3,Emergency Room and Inpatient Visit,False
4,Inpatient Visit,False
...,...,...
154,sodium_last,False
155,bicarbonate_last,False
156,hemoglobin_last,False
157,creatinine_last,False


In [24]:
feDf[feDf.Selected]

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
12,SURGICAL SAME DAY ADMISSION,True
15,CLINIC REFERRAL,True
17,INFORMATION NOT AVAILABLE,True
19,PACU,True
25,visit_duration_hrs,True
40,gcs_eye_max,True
113,gcs_motor_last,True
115,gcs_eye_last,True
125,glucose_min,True


## Twentyeight day mortality prediction

In [25]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier

sfs = SequentialFeatureSelector(estimator = DecisionTreeClassifier(), n_features_to_select=10)
sfs.fit(X, y_twentyeight_day)

SequentialFeatureSelector(estimator=DecisionTreeClassifier(),
                          n_features_to_select=10)

In [26]:
sfs.get_feature_names_out()

array(['Ambulatory Clinic / Center', 'Emergency Room - Hospital',
       'Inpatient Visit', 'AMBULATORY OBSERVATION', 'DIRECT EMER.',
       'ELECTIVE', 'EW EMER.', 'visit_duration_hrs', 'gcs_motor_last',
       'creatinine_avg'], dtype=object)

In [27]:
# summarize all features
data = []
columns = X.columns
for i in range(X.shape[1]):
    data.append([columns[i], sfs.get_support()[i]])
feDf = pd.DataFrame(data, columns=['Column', 'Selected'])
feDf

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
1,Ambulatory Surgical Center,False
2,Emergency Room - Hospital,True
3,Emergency Room and Inpatient Visit,False
4,Inpatient Visit,True
...,...,...
154,sodium_last,False
155,bicarbonate_last,False
156,hemoglobin_last,False
157,creatinine_last,False


In [28]:
feDf[feDf.Selected]

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
2,Emergency Room - Hospital,True
4,Inpatient Visit,True
6,AMBULATORY OBSERVATION,True
7,DIRECT EMER.,True
8,ELECTIVE,True
10,EW EMER.,True
25,visit_duration_hrs,True
113,gcs_motor_last,True
136,creatinine_avg,True


## Sixty day mortality prediction

In [29]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier

sfs = SequentialFeatureSelector(estimator = DecisionTreeClassifier(), n_features_to_select=10)
sfs.fit(X, y_sixty_day)

SequentialFeatureSelector(estimator=DecisionTreeClassifier(),
                          n_features_to_select=10)

In [30]:
sfs.get_feature_names_out()

array(['Ambulatory Clinic / Center', 'AMBULATORY OBSERVATION',
       'AMBULATORY SURGERY TRANSFER', 'PACU', 'PROCEDURE SITE',
       'visit_duration_hrs', 'gcs_motor_max', 'resp_rate_min',
       'gcs_motor_last', 'glucose_max'], dtype=object)

In [31]:
# summarize all features
data = []
columns = X.columns
for i in range(X.shape[1]):
    data.append([columns[i], sfs.get_support()[i]])
feDf = pd.DataFrame(data, columns=['Column', 'Selected'])
feDf

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
1,Ambulatory Surgical Center,False
2,Emergency Room - Hospital,False
3,Emergency Room and Inpatient Visit,False
4,Inpatient Visit,False
...,...,...
154,sodium_last,False
155,bicarbonate_last,False
156,hemoglobin_last,False
157,creatinine_last,False


In [32]:
feDf[feDf.Selected]

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
6,AMBULATORY OBSERVATION,True
14,AMBULATORY SURGERY TRANSFER,True
19,PACU,True
21,PROCEDURE SITE,True
25,visit_duration_hrs,True
38,gcs_motor_max,True
45,resp_rate_min,True
113,gcs_motor_last,True
118,glucose_max,True


## Ninety day mortality prediction

In [33]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier

sfs = SequentialFeatureSelector(estimator = DecisionTreeClassifier(), n_features_to_select=10)
sfs.fit(X, y_ninety_day)

SequentialFeatureSelector(estimator=DecisionTreeClassifier(),
                          n_features_to_select=10)

In [34]:
sfs.get_feature_names_out()

array(['Ambulatory Clinic / Center', 'Inpatient Visit',
       'AMBULATORY OBSERVATION', 'EMERGENCY ROOM', 'PROCEDURE SITE',
       'TRANSFER FROM HOSPITAL', 'WALK-IN/SELF REFERRAL',
       'visit_duration_hrs', 'gcs_motor_last', 'bicarbonate_min'],
      dtype=object)

In [35]:
# summarize all features
data = []
columns = X.columns
for i in range(X.shape[1]):
    data.append([columns[i], sfs.get_support()[i]])
feDf = pd.DataFrame(data, columns=['Column', 'Selected'])
feDf

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
1,Ambulatory Surgical Center,False
2,Emergency Room - Hospital,False
3,Emergency Room and Inpatient Visit,False
4,Inpatient Visit,True
...,...,...
154,sodium_last,False
155,bicarbonate_last,False
156,hemoglobin_last,False
157,creatinine_last,False


In [36]:
feDf[feDf.Selected]

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
4,Inpatient Visit,True
6,AMBULATORY OBSERVATION,True
16,EMERGENCY ROOM,True
21,PROCEDURE SITE,True
22,TRANSFER FROM HOSPITAL,True
24,WALK-IN/SELF REFERRAL,True
25,visit_duration_hrs,True
113,gcs_motor_last,True
127,bicarbonate_min,True


## Onetwenty day mortality prediction

In [37]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier

sfs = SequentialFeatureSelector(estimator = DecisionTreeClassifier(), n_features_to_select=10)
sfs.fit(X, y_onetwenty_day)

SequentialFeatureSelector(estimator=DecisionTreeClassifier(),
                          n_features_to_select=10)

In [38]:
sfs.get_feature_names_out()

array(['Ambulatory Clinic / Center', 'Inpatient Visit',
       'AMBULATORY OBSERVATION', 'PACU', 'PHYSICIAN REFERRAL',
       'PROCEDURE SITE', 'TRANSFER FROM HOSPITAL', 'visit_duration_hrs',
       'gcs_motor_last', 'sodium_min'], dtype=object)

In [39]:
# summarize all features
data = []
columns = X.columns
for i in range(X.shape[1]):
    data.append([columns[i], sfs.get_support()[i]])
feDf = pd.DataFrame(data, columns=['Column', 'Selected'])
feDf

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
1,Ambulatory Surgical Center,False
2,Emergency Room - Hospital,False
3,Emergency Room and Inpatient Visit,False
4,Inpatient Visit,True
...,...,...
154,sodium_last,False
155,bicarbonate_last,False
156,hemoglobin_last,False
157,creatinine_last,False


In [40]:
feDf[feDf.Selected]

Unnamed: 0,Column,Selected
0,Ambulatory Clinic / Center,True
4,Inpatient Visit,True
6,AMBULATORY OBSERVATION,True
19,PACU,True
20,PHYSICIAN REFERRAL,True
21,PROCEDURE SITE,True
22,TRANSFER FROM HOSPITAL,True
25,visit_duration_hrs,True
113,gcs_motor_last,True
126,sodium_min,True
