In [11]:
from utils import intake_data

## Upsample, StandardScaler, SFS

## Importing and Scale Data

Removed 3 features: days, ventilator weaning, ventilator free days

In [12]:
data_pre = intake_data()
data_x = data_pre.drop(["death","days.1","ventilator weaning = 1", "VFD ","days"], axis=1)
data_y = data_pre["death"]

Upsampling data using SMOTE

In [13]:
from imblearn import over_sampling
oversample = over_sampling.SMOTE()
smote_x, smote_y = oversample.fit_resample(data_x, data_y)

In [19]:
from sklearn.preprocessing import StandardScaler

scaled_x = smote_x
scaled_y = smote_y 

scaler = StandardScaler()
scaler.fit(scaled_x)

StandardScaler(copy=True, with_mean=True, with_std=True)

## Finding best parameters

Finding best number of n neighbors:

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
param_list = []
for i in range(1,31): 
    param_list.append(i)
parameters = {'n_neighbors':param_list}
gs = GridSearchCV(KNeighborsClassifier(), parameters, cv = 10)
gs_results = gs.fit(smote_x, smote_y)
gs.best_params_

{'n_neighbors': 1}

Running cross validation loop with n = 2

In [21]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

my_model = KNeighborsClassifier(n_neighbors = 2)
my_model.fit(scaled_x, scaled_y)
cross_val_score(my_model, scaled_x, scaled_y, cv = 10)
y_predict = cross_val_predict(my_model, scaled_x, scaled_y, cv=10)
print(classification_report(scaled_y, y_predict))

              precision    recall  f1-score   support

         0.0       0.58      0.77      0.66       128
         1.0       0.66      0.45      0.53       128

    accuracy                           0.61       256
   macro avg       0.62      0.61      0.60       256
weighted avg       0.62      0.61      0.60       256



In [22]:
cross_val_score(gs, scaled_x, scaled_y, cv=10)
y_predict = cross_val_predict(gs, scaled_x, scaled_y, cv=10)
print(classification_report(scaled_y, y_predict))

              precision    recall  f1-score   support

         0.0       0.70      0.55      0.61       128
         1.0       0.63      0.77      0.69       128

    accuracy                           0.66       256
   macro avg       0.66      0.66      0.65       256
weighted avg       0.66      0.66      0.65       256



Performing PCA dimensionality reduction

In [35]:
# Define a pipeline to search for the best combination of 
# PCA dimensions and n_neighbors.

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import warnings
from sklearn.preprocessing import StandardScaler 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
warnings.filterwarnings('ignore')

#crate a scaler
scaler = StandardScaler()

#create a PCA
sfs = sfs(knn, k_features=5, forward=True, floating=False,
          verbose=2, scoring='accuracy', cv=5)
sfs.fit(smote_x, smote_y)

#create a KNN classifier
knn = KNeighborsClassifier()

#create a pipeline that does scaling, then PCA, then KNN
pipe = Pipeline(steps=[('scaler', scaler), ('knn', knn)])

#Set up the parameters you want to tune for each of your pipeline steps
#Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
    'knn__n_neighbors': list(range(1, 30)),  #find the best value of k
}

# pass the pipeline and the parameters into a GridSearchCV with a 5-fold cross validation
gs = GridSearchCV(pipe, param_grid, cv=5)
# call fit() on the GridSearchCV and pass in the unscaled data (X_values, Y_values)
x = smote_x
y = smote_y
gs.fit(x,y)
# print out the best_score_ and best_params_ from the GridSearchCV
print(gs.best_score_)
print(gs.best_params_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.3s finished

[2020-04-17 13:59:42] Features: 1/5 -- score: 0.6023378582202111[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.3s finished

[2020-04-17 13:59:42] Features: 2/5 -- score: 0.6137254901960785[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.3s finished

[2020-04-17 13:59:42] Features: 3/5 -- score: 0.6097285067873304[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

0.7538461538461538
{'knn__n_neighbors': 1}


## Finding final accuracy score

In [36]:
from sklearn.metrics import accuracy_score
cross_val_score(gs, x, y, cv=5)
y_predict = cross_val_predict(gs, x, y, cv=5)
accuracy_score(y, y_predict)

0.68359375