# Recursive Feature Elimination (KNN in pipeline)

In [1]:
# Original source: 
# https://github.com/rasbt/stat451-machine-learning-fs21/blob/main/13-feature-selection/04_recursive-feature-elimination.ipynb

Implementation of RFE for feature implementation along with classification in the same pipeline. This code used KNN in the pipeline for second stage as well as testing.

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Dataset Preparation

In [3]:
df_CBC_umich = pd.read_csv(r'C:\Users\rajnishk\Dropbox (University of Michigan)\2023-09-25 CBC CART for Benjie and Rajnish from Mary\Rajnish Folder\Only_UM_LTFU_CBC_Data_with_matched_CRP_Ferritin_LOD_0.csv')
# print("shape of the data frame is %s" %(np.shape(df_CBC_umich)))
df_CBC_umich.columns

Index(['StudyID', 'DPI', 'WBC', 'NEU%', 'NEU#', 'LYMP%', 'LYMP#', 'MON%',
       'MON#', 'CRS_Grade', 'CRS_Incidence', 'Max_CRS', 'DP_CRS_Onset',
       'During Chemo?', 'After Chemo', 'Placeholder_DPI', 'NT_Incidence',
       'Max_NT', 'NT_Grade', 'DP_NT_Onset', 'HGB', 'HCT', 'PLT', 'RBC', 'MCV',
       'MCH', 'MCHC', 'RDW', 'MPV', 'Sex', 'Age', 'CRP', 'Ferritin'],
      dtype='object')

In [4]:
df_CBC_umich["NT_Incidence"] = df_CBC_umich["NT_Incidence"].astype(int)
# df_CBC_umich["Sex"] = df_CBC_umich["Sex"].astype(int)
# Map 'Male' to 0 and 'Female' to 1
sex_mapping = {'Male': 0, 'Female': 1}

df_CBC_umich['Sex'] = df_CBC_umich['Sex'].replace(sex_mapping)

  df_CBC_umich['Sex'] = df_CBC_umich['Sex'].replace(sex_mapping)


Drop all the columns that are not to be used. 
df.drop(['C', 'D'], axis=1)

In [5]:
CBC_DPI_0_only = df_CBC_umich[df_CBC_umich.DPI==0]
# CBC_DPI_0_only

In [6]:
df_CBC_Umich_DPI_0_only = CBC_DPI_0_only.drop(columns=['CRS_Incidence','DPI','Max_CRS','NEU%','LYMP%','MON%' , 'DP_CRS_Onset','Max_NT', 'NT_Grade', 'Placeholder_DPI', 'During Chemo?', 'After Chemo', 'DP_NT_Onset'])
# df_CBC_Umich_DPI_0_only 

In [7]:
# np.sum(df_CBC_Umich_DPI_0_only.isna())
# np.shape(df_CBC_Umich_DPI_0_only)

In [8]:
# Drop the row with value of ferritin not avialable.
df_CBC_Umich_DPI_0_only.dropna(subset=['Ferritin'], inplace=True)
# df_CBC_Umich_DPI_0_only

In [9]:
df_CBC_Umich_DPI_0_only.columns
# df_CBC_Umich_DPI_0_only.NT_Incidence

Index(['StudyID', 'WBC', 'NEU#', 'LYMP#', 'MON#', 'CRS_Grade', 'NT_Incidence',
       'HGB', 'HCT', 'PLT', 'RBC', 'MCV', 'MCH', 'MCHC', 'RDW', 'MPV', 'Sex',
       'Age', 'CRP', 'Ferritin'],
      dtype='object')

In [10]:
df_CBC_Umich_DPI_0_only # The data set from UM from day 0. 

Unnamed: 0,StudyID,WBC,NEU#,LYMP#,MON#,CRS_Grade,NT_Incidence,HGB,HCT,PLT,RBC,MCV,MCH,MCHC,RDW,MPV,Sex,Age,CRP,Ferritin
6,SC04,0.40,0.00,0.0,0.0,0.0,0,7.9,23.8,67,2.60,91.5,30.4,33.2,15.3,8.4,0,53,12.5,3862.5
18,SC06,1.30,1.20,0.0,0.0,0.0,1,7.7,23.8,140,2.49,95.6,30.9,32.4,15.8,9.1,0,71,7.7,638.9
38,SC08,0.30,0.00,0.0,0.0,0.0,1,10.0,29.9,36,3.07,97.4,32.6,33.4,17.5,10.0,1,56,12.6,1935.6
56,SC14,1.80,1.55,0.0,0.0,0.0,0,8.8,27.4,264,3.36,81.4,26.2,32.2,15.2,8.1,0,65,10.2,1367.7
72,SC18,0.35,0.00,0.0,0.0,0.0,1,7.2,21.8,81,2.15,101.4,33.6,33.1,14.8,9.5,0,71,6.1,5140.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1086,LTFU39,1.40,1.20,0.0,0.0,0.0,0,10.0,29.0,105,3.13,92.7,31.9,34.5,11.9,10.1,0,61,3.7,1004.1
1101,LTFU40,1.40,1.20,0.1,0.1,0.0,0,10.8,34.4,112,4.36,78.9,24.8,31.4,15.0,11.0,0,62,0.0,56.3
1116,LTFU41,1.20,0.40,0.7,0.1,2.0,1,6.3,19.2,43,2.05,93.7,30.7,32.8,21.6,11.7,1,58,11.7,920.2
1142,LTFU42,3.30,0.00,0.1,0.0,0.0,0,11.2,33.3,132,3.88,85.8,28.9,33.6,15.2,9.4,0,72,3.3,768.3


RFE has two stage algorithms. In the internal stage the model will be used on training dataset from UM, as well as cross validation from UM. 

In [11]:
features_list = ['WBC', 'NEU#', 'LYMP#', 'MON#','CRS_Grade', 'HGB', 'HCT',
       'PLT', 'RBC', 'MCV', 'MCH', 'MCHC', 'RDW', 'MPV', 'Sex', 'Age', 'CRP',
       'Ferritin']

X, y = df_CBC_Umich_DPI_0_only.loc[:, features_list].values, df_CBC_Umich_DPI_0_only.loc[:,['NT_Incidence']].values
print("X is %s \n" %X)

y = y.flatten()
print("y is %s \n" %y)

X is [[4.0000e-01 0.0000e+00 0.0000e+00 ... 5.3000e+01 1.2500e+01 3.8625e+03]
 [1.3000e+00 1.2000e+00 0.0000e+00 ... 7.1000e+01 7.7000e+00 6.3890e+02]
 [3.0000e-01 0.0000e+00 0.0000e+00 ... 5.6000e+01 1.2600e+01 1.9356e+03]
 ...
 [1.2000e+00 4.0000e-01 7.0000e-01 ... 5.8000e+01 1.1700e+01 9.2020e+02]
 [3.3000e+00 0.0000e+00 1.0000e-01 ... 7.2000e+01 3.3000e+00 7.6830e+02]
 [1.7000e+00 1.7000e+00 0.0000e+00 ... 6.3000e+01 1.0000e+00 1.0620e+03]] 

y is [0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0 1 1 1 0 0 0 1 0 1 0 1 1
 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 1 1 0 1 0 0 1 1 1 0 0 0 0 1 0
 1] 



In [12]:
from sklearn.model_selection import train_test_split
X_train, X_CV, y_train, y_CV =\
    train_test_split(X, y, 
                     test_size=0.3, 
                     random_state=0, 
                     stratify=y)

In [13]:
from sklearn.preprocessing import StandardScaler


sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_CV_std = sc.transform(X_CV)

## Implementing Recursive Feature Selection

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE



lr = LogisticRegression(solver='liblinear', random_state=123)

rfe = RFE(estimator=lr, n_features_to_select=12, step=1)          # No of features to be selected here have nothing to do with number of principal components selected for other models. 
rfe.fit(X_train_std, y_train)

X_train_sub = rfe.transform(X_train_std)

## Selected Features

In [15]:
rfe.support_


array([False,  True,  True,  True, False,  True, False,  True,  True,
       False,  True, False,  True,  True,  True,  True, False,  True])

Selected features are: 

In [16]:
df_CBC_Umich_DPI_0_only.loc[:, features_list].columns[0 :][rfe.support_]

Index(['NEU#', 'LYMP#', 'MON#', 'HGB', 'PLT', 'RBC', 'MCH', 'RDW', 'MPV',
       'Sex', 'Age', 'Ferritin'],
      dtype='object')

# RFE as a part of the pipeline. We will use the cross validation set from University of Michigan for this part of the pipeline.

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline


pipe = make_pipeline(RFE(estimator=lr, step=1),
                     KNeighborsClassifier())


parameters = {'rfe__n_features_to_select': range(1, len(features_list)), 
              'kneighborsclassifier__n_neighbors': range(1, 15) }

grid = GridSearchCV(pipe, param_grid=parameters, cv=10, n_jobs=-1)
grid.fit(X_train_std, y_train)

print('Best params:', grid.best_params_)
print('Best accuracy:', grid.best_score_)




Best params: {'kneighborsclassifier__n_neighbors': 14, 'rfe__n_features_to_select': 9}
Best accuracy: 0.78


In [18]:
rfe.get_support()

array([False,  True,  True,  True, False,  True, False,  True,  True,
       False,  True, False,  True,  True,  True,  True, False,  True])

In [19]:
features_list

['WBC',
 'NEU#',
 'LYMP#',
 'MON#',
 'CRS_Grade',
 'HGB',
 'HCT',
 'PLT',
 'RBC',
 'MCV',
 'MCH',
 'MCHC',
 'RDW',
 'MPV',
 'Sex',
 'Age',
 'CRP',
 'Ferritin']

In [20]:
# Reduced feature set from grid search

grid.best_estimator_.score(X_CV_std, y_CV)

0.6956521739130435

In [21]:
# Full feature set for reference

knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_std, y_train)
knn.score(X_CV_std, y_CV)

0.6956521739130435

In [22]:
# Project the dataset from John Hopkins on test set, and see the value. 

# We will do testing on data set from John Hopkins.

In [23]:
df_CBC_JH = pd.read_csv(r'C:\Users\rajnishk\Dropbox (University of Michigan)\2023-09-25 CBC CART for Benjie and Rajnish from Mary\JHMI Compiled CBC, No ALL, CRP_LOD_0.csv')
# df_CBC_JH

In [24]:
df_CBC_JH["Ferritin"] = df_CBC_JH["Ferritin"].str.replace(',','').astype(float)

In [25]:
df_CBC_JH["NT_Incidence"] = df_CBC_JH["NT_Incidence"].astype(int)

# Map 'Male' to 0 and 'Female' to 1
sex_mapping = {'Male': 0, 'Female': 1}

df_CBC_JH['Sex'] = df_CBC_JH['Sex'].replace(sex_mapping)

  df_CBC_JH['Sex'] = df_CBC_JH['Sex'].replace(sex_mapping)


In [26]:
df_CBC_JH_DPI_0 = df_CBC_JH[df_CBC_JH.DPI == 0]
columns_list  = features_list + ['NT_Incidence']
df_CBC_JH_DPI_0 = df_CBC_JH_DPI_0[columns_list]
# df_CBC_JH_DPI_0
df_CBC_JH_DPI_0.dropna(inplace=True)

In [27]:
X_test_JH, y_test_JH = df_CBC_JH_DPI_0.loc[:, features_list].values, df_CBC_JH_DPI_0.loc[:,['NT_Incidence']].values
# print("X_test_JH is %s \n" %X_test_JH)

y_test_JH = y_test_JH.flatten()
# print("y_test_JH is %s \n" %y_test_JH)

In [28]:
X_test_JH_std = sc.fit_transform(X_test_JH)


In [29]:
grid.best_estimator_.score(X_test_JH_std, y_test_JH)

0.6551724137931034

In [30]:
grid.best_estimator_.verbose

False

In [31]:
grid.n_splits_

10

In [32]:
np.shape(X_test_JH)

(29, 18)

In [33]:
# for i in range(X.shape[1]):
#  print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))