In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import  cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report

In [2]:
dfraw = pd.read_csv('/Users/robholmstrom/Downloads/patient.csv')

In [3]:
dfraw.shape

(95839, 20)

In [4]:
pd.set_option("display.max_rows", 101)

In [5]:
dfraw.head(3)

Unnamed: 0,sex,patient_type,intubated,pneumonia,age,pregnant,diabetes,copd,asthma,immunosuppression,hypertension,other_diseases,cardiovascular,obesity,chronic_kidney_failure,smoker,another_case,outcome,icu,death_date
0,2,1,97,2,42,97,2,2,1,2,2,2,2,2,2,2,99,1,97,9999-99-99
1,1,1,97,2,51,2,2,2,2,2,2,2,2,2,2,2,99,1,97,9999-99-99
2,2,2,2,2,51,97,1,2,2,2,1,2,2,1,2,2,99,1,2,9999-99-99


In [6]:
for x in dfraw:
    print(dfraw[x].value_counts())

2    48720
1    47119
Name: sex, dtype: int64
1    70268
2    25571
Name: patient_type, dtype: int64
97    70268
2     23613
1      1934
99       24
Name: intubated, dtype: int64
2     78203
1     17628
99        8
Name: pneumonia, dtype: int64
30     2390
31     2390
35     2370
29     2367
34     2366
       ... 
100      11
99        9
101       4
102       4
113       1
Name: age, Length: 104, dtype: int64
97    48720
2     45909
1       978
98      232
Name: pregnant, dtype: int64
2     82544
1     12878
98      417
Name: diabetes, dtype: int64
2     92973
1      2462
98      404
Name: copd, dtype: int64
2     91102
1      4328
98      409
Name: asthma, dtype: int64
2     93099
1      2314
98      426
Name: immunosuppression, dtype: int64
2     78721
1     16716
98      402
Name: hypertension, dtype: int64
2     90639
1      4642
98      558
Name: other_diseases, dtype: int64
2     92437
1      2986
98      416
Name: cardiovascular, dtype: int64
2     79852
1     15597
98      390

In [7]:
# Checking unique values in each columns. 
# For all except outcome, 1 = No; 2 = Yes; 97 = Not applicable; 99 = Null. Unclear what 98 stands for
# For outcome, 1, 2, and 3 are different testing companies
for x in dfraw:
    print(x, np.unique(dfraw[x]))

sex [1 2]
patient_type [1 2]
intubated [ 1  2 97 99]
pneumonia [ 1  2 99]
age [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 113]
pregnant [ 1  2 97 98]
diabetes [ 1  2 98]
copd [ 1  2 98]
asthma [ 1  2 98]
immunosuppression [ 1  2 98]
hypertension [ 1  2 98]
other_diseases [ 1  2 98]
cardiovascular [ 1  2 98]
obesity [ 1  2 98]
chronic_kidney_failure [ 1  2 98]
smoker [ 1  2 98]
another_case [ 1  2 99]
outcome [1 2 3]
icu [ 1  2 97 99]
death_date ['2020-01-15' '2020-01-29' '2020-02-10' '2020-02-23' '2020-03-04'
 '2020-03-09' '2020-03-16' '2020-03-17' '2020-03-18' '2020-03-19'
 '2020-03-20' '2020-03-22' '2020-03-23' '2020

In [8]:
dfraw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95839 entries, 0 to 95838
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   sex                     95839 non-null  int64 
 1   patient_type            95839 non-null  int64 
 2   intubated               95839 non-null  int64 
 3   pneumonia               95839 non-null  int64 
 4   age                     95839 non-null  int64 
 5   pregnant                95839 non-null  int64 
 6   diabetes                95839 non-null  int64 
 7   copd                    95839 non-null  int64 
 8   asthma                  95839 non-null  int64 
 9   immunosuppression       95839 non-null  int64 
 10  hypertension            95839 non-null  int64 
 11  other_diseases          95839 non-null  int64 
 12  cardiovascular          95839 non-null  int64 
 13  obesity                 95839 non-null  int64 
 14  chronic_kidney_failure  95839 non-null  int64 
 15  sm

In [9]:
dfraw.columns

Index(['sex', 'patient_type', 'intubated', 'pneumonia', 'age', 'pregnant',
       'diabetes', 'copd', 'asthma', 'immunosuppression', 'hypertension',
       'other_diseases', 'cardiovascular', 'obesity', 'chronic_kidney_failure',
       'smoker', 'another_case', 'outcome', 'icu', 'death_date'],
      dtype='object')

# Data cleaning

In [10]:

# Select out continous variable in seperate list
unaltercols = ['age']
# Select all columns that  will be treated the same way
altercols = ['sex', 'patient_type', 'intubated', 'pneumonia',
       'diabetes', 'copd', 'asthma', 'immunosuppression', 'hypertension',
       'other_diseases', 'cardiovascular', 'obesity', 'chronic_kidney_failure',
       'smoker', 'another_case', 'outcome']

# Eliminate unclears (98 values), clear nulls (99)  and convert non-applicable (97) to 'no' for all columns
# except pregnant or icu columns.
# This will allow more choice downstream.
dfreplace = pd.concat([dfraw[unaltercols], dfraw[altercols].replace({99:np.nan, 97:2, 98:np.nan, 3:np.nan})], axis = 1)

# Further eliminate nulls from pregnant column
dfreplace1 = pd.concat([dfreplace,dfraw['pregnant'].replace(98,np.nan)], axis = 1)                    

# Further eliminate nulls from icu column
dfreplace2 = pd.concat([dfreplace1, dfraw['icu'].replace(99, np.nan)],axis=1)
dfreplace2.columns

Index(['age', 'sex', 'patient_type', 'intubated', 'pneumonia', 'diabetes',
       'copd', 'asthma', 'immunosuppression', 'hypertension', 'other_diseases',
       'cardiovascular', 'obesity', 'chronic_kidney_failure', 'smoker',
       'another_case', 'outcome', 'pregnant', 'icu'],
      dtype='object')

In [11]:
dfreplace2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95839 entries, 0 to 95838
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     95839 non-null  int64  
 1   sex                     95839 non-null  float64
 2   patient_type            95839 non-null  float64
 3   intubated               95815 non-null  float64
 4   pneumonia               95831 non-null  float64
 5   diabetes                95422 non-null  float64
 6   copd                    95435 non-null  float64
 7   asthma                  95430 non-null  float64
 8   immunosuppression       95413 non-null  float64
 9   hypertension            95437 non-null  float64
 10  other_diseases          95281 non-null  float64
 11  cardiovascular          95423 non-null  float64
 12  obesity                 95449 non-null  float64
 13  chronic_kidney_failure  95433 non-null  float64
 14  smoker                  95435 non-null

# Feature engineering

In [80]:
# Choose whether to iterate back from SelectKbest and input to features (dfbest) OR custom features (dfcust)
kbestcols = ['age', 'patient_type', 'pneumonia', 'diabetes', 'outcome']
customcols = ['age', 'sex', 'patient_type', 'pneumonia', 'diabetes',
       'copd', 'asthma', 'immunosuppression', 'hypertension', 'other_diseases',
       'cardiovascular', 'obesity', 'chronic_kidney_failure', 'smoker',
       'another_case', 'outcome', 'pregnant', 'icu']

# Define the label that will be predicted
target = 'intubated'

# Based on whether using kbestcols or customcols, designate dropped columns, but will leave the target in the list
dfdropcols = [x for x in dfreplace2 if x not in customcols]
dfdropcols.remove(target)

dfdrop = dfreplace2.drop(dfdropcols, axis = 1)

if 'pregnant' in dfdrop:
    dfpreg = dfdrop.loc[(dfdrop['pregnant']==1) | (dfdrop['pregnant']==2)]
    dfall = dfdrop.copy()
    dfall['pregnant'] = np.where(dfdrop['pregnant'] == 97, 2,dfdrop['pregnant'].values)
else:
    dfall = dfdrop.copy()
    

if 'icu' in dfdrop:
    dfallicu = dfdrop.loc[(dfdrop['icu']==2) | (dfdrop['icu']==1)]
    dfall = dfdrop.copy()
else:
    dfall = dfdrop.copy()
    dfallicu = []

# Choose whether using 'icu' applicable entries only (dfallicu) and ignore the non-applicable 'icu' entries

#*********************OR*************************

# Choose whether to ask if pregnancy is a predictor among women only (dfpreg) or include men among pregnancies (dfall)

df = dfall



In [82]:
# Deleting leftover nulls from conversion of 99 and 98 to NaN
df_non_nulls = df.dropna()

# Checking nulls have been eliminated
df_non_nulls.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51567 entries, 4 to 95779
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     51567 non-null  int64  
 1   sex                     51567 non-null  float64
 2   patient_type            51567 non-null  float64
 3   intubated               51567 non-null  float64
 4   pneumonia               51567 non-null  float64
 5   diabetes                51567 non-null  float64
 6   copd                    51567 non-null  float64
 7   asthma                  51567 non-null  float64
 8   immunosuppression       51567 non-null  float64
 9   hypertension            51567 non-null  float64
 10  other_diseases          51567 non-null  float64
 11  cardiovascular          51567 non-null  float64
 12  obesity                 51567 non-null  float64
 13  chronic_kidney_failure  51567 non-null  float64
 14  smoker                  51567 non-null

In [83]:
# Investigate cross-feature correlations to eliminate multicolinearity
df_non_nulls.corr()

Unnamed: 0,age,sex,patient_type,intubated,pneumonia,diabetes,copd,asthma,immunosuppression,hypertension,other_diseases,cardiovascular,obesity,chronic_kidney_failure,smoker,another_case,outcome,pregnant,icu
age,1.0,0.030034,0.23324,-0.10641,-0.23318,-0.324162,-0.202064,0.029964,-0.033802,-0.391992,-0.027779,-0.158092,-0.107917,-0.101438,-0.029092,0.071666,-0.175808,0.030175,-0.23331
sex,0.030034,1.0,0.093336,-0.043341,-0.095404,-0.022238,0.003942,0.04131,0.009391,-0.010844,0.035182,-0.007462,0.012994,-0.019706,-0.095737,0.038985,-0.092014,0.999998,-0.093387
patient_type,0.23324,0.093336,1.0,-0.342834,-0.722153,-0.237291,-0.138256,0.01707,-0.082799,-0.187411,-0.055442,-0.108521,-0.084702,-0.124429,-0.015686,0.241993,-0.238845,0.093297,-0.999992
intubated,-0.10641,-0.043341,-0.342834,1.0,0.355566,0.109591,0.049577,-0.017851,0.038839,0.087935,0.019748,0.052791,0.058174,0.064311,0.012051,-0.094779,0.120339,-0.043352,0.344618
pneumonia,-0.23318,-0.095404,-0.722153,0.355566,1.0,0.220155,0.118762,-0.020017,0.071392,0.17248,0.036085,0.102444,0.080907,0.110359,0.017252,-0.194299,0.240748,-0.095428,0.722468
diabetes,-0.324162,-0.022238,-0.237291,0.109591,0.220155,1.0,0.102133,-0.003171,0.042047,0.349705,0.008804,0.120272,0.120468,0.158166,0.01758,-0.090826,0.121106,-0.022286,0.23739
copd,-0.202064,0.003942,-0.138256,0.049577,0.118762,0.102133,1.0,0.03131,0.054137,0.128889,0.037435,0.128816,0.032818,0.087709,0.073381,-0.05767,-0.002177,0.003919,0.138269
asthma,0.029964,0.04131,0.01707,-0.017851,-0.020017,-0.003171,0.03131,1.0,0.031092,0.014889,0.009274,0.016277,0.046644,0.002594,0.012141,-0.003742,-0.033242,0.041313,-0.017095
immunosuppression,-0.033802,0.009391,-0.082799,0.038839,0.071392,0.042047,0.054137,0.031092,1.0,0.0422,0.126185,0.073512,0.01863,0.10625,0.013847,-0.041924,-0.01918,0.009388,0.082863
hypertension,-0.391992,-0.010844,-0.187411,0.087935,0.17248,0.349705,0.128889,0.014889,0.0422,1.0,0.03024,0.181691,0.175626,0.154601,0.014297,-0.075317,0.083726,-0.010909,0.187504


In [84]:
# Seperate features and target for modeling
X = df_non_nulls.drop('intubated', axis=1)
y = df_non_nulls['intubated']

In [85]:
k = 5
skb = SelectKBest(k = k, score_func = f_classif)
best_kfit = skb.fit(X, y)
X_best_feat = best_kfit.fit_transform(X,y)

X_best_feat = pd.DataFrame(X_best_feat, columns=X.columns[best_kfit.get_support()])

X_best_feat.columns

Index(['patient_type', 'pneumonia', 'diabetes', 'outcome', 'icu'], dtype='object')

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Logistic Regressions

In [87]:
lr1 = LogisticRegression(solver='liblinear',penalty = 'l2', max_iter = 10000, C=1)

lr1.fit(X_train, y_train)

y_test_predlr1 = lr1.predict(X_test)
y_train_predlr1 = lr1.predict(X_train)
print(f'Test score is {lr1.score(X_test, y_test)}')
print(f'Train score is {lr1.score(X_train, y_train)}')



Test score is 0.9756641458212139
Train score is 0.977456185004727


In [100]:
from sklearn.metrics import confusion_matrix
print('LR1 confusion matrix')
print('Training', confusion_matrix(y_train, y_train_predlr1, normalize= None))
print('Test', confusion_matrix(y_test, y_test_predlr1, normalize = None))

LR1 confusion matrix
Training [[  559   592]
 [  338 39764]]
Test [[ 142  153]
 [  98 9921]]


In [89]:
print(f'LR1 cv scores are {cross_val_score(lr1, X, y, cv= 5)}')

LR1 cv scores are [0.98283886 0.97731239 0.97527393 0.9747891  0.97420731]


# Random Forests

In [90]:

rf1 = RandomForestClassifier(max_depth=8,  max_features=4,  n_estimators=500)
rf1.fit(X_train, y_train)

y_test_predrf1 = rf1.predict(X_test)
y_train_predrf1 = rf1.predict(X_train)
print(f'Test score is {rf1.score(X_test, y_test)}')
print(f'Train score is {rf1.score(X_train, y_train)}')


Test score is 0.975179367849525
Train score is 0.9807771556008048


In [91]:
print('RF1 confusion matrix')
print('Training', confusion_matrix(y_train, y_train_predrf1, normalize= 'true'))
print('Test', confusion_matrix(y_test, y_test_predrf1, normalize = 'true'))

RF1 confusion matrix
Training [[0.45351868 0.54648132]
 [0.00408957 0.99591043]]
Test [[0.37627119 0.62372881]
 [0.00718635 0.99281365]]


In [92]:
print(f'RF1 cv scores are {cross_val_score(rf1, X, y, cv= 5)}')

RF1 cv scores are [0.97973628 0.9766337  0.97604965 0.97624358 0.97411035]


# K Nearest Neighbors

In [93]:

knn1 = KNeighborsClassifier(n_neighbors=2, weights='distance', algorithm='ball_tree', leaf_size=10)
knn1.fit(X_train, y_train)

y_test_predknn1 = knn1.predict(X_test)
y_train_predknn1 = knn1.predict(X_train)
print(f'Test score is {knn1.score(X_test, y_test)}')
print(f'Train score is {knn1.score(X_train, y_train)}')


Test score is 0.9567578049253442
Train score is 0.9891644244054978


In [94]:

print(f'KNN1 cv scores are {cross_val_score(knn1, X, y, cv= 5)}')



KNN1 cv scores are [0.97101028 0.95569129 0.95481431 0.95384466 0.95219626]


In [95]:
print('knn1 confusion matrix')
print('Training', confusion_matrix(y_train, y_train_predknn1, normalize= 'true'))
print('Test', confusion_matrix(y_test, y_test_predknn1, normalize = 'true'))

knn1 confusion matrix
Training [[0.91225022 0.08774978]
 [0.008628   0.991372  ]]
Test [[0.46440678 0.53559322]
 [0.02874538 0.97125462]]


# Support Vector Classification

In [96]:
c_values = [10**x for x in range(-2, 2)]
svmlin = SVC()
parameters = {'kernel':['linear', 'rbf'], 'C':c_values}
svmgrid = GridSearchCV(svmlin, parameters)
svmgrid.fit(X_train, y_train)

y_train_predsvmgrid = svmgrid.predict(X_train)
y_test_predsvmgrid = svmgrid.predict(X_test)

print('Train SVC score', svmgrid.score(X_train, y_train))
print('Test SVC score', svmgrid.score(X_test, y_test))
print(svmgrid.best_params_)


Train SVC score 0.9769228904564516
Test SVC score 0.9755671902268761
{'C': 0.1, 'kernel': 'linear'}


In [97]:

print('svmrbf confusion matrix')
print('Training', confusion_matrix(y_train, y_train_predsvmgrid, normalize= 'true'))
print('Test', confusion_matrix(y_test,y_test_predsvmgrid, normalize = 'true'))


svmrbf confusion matrix
Training [[0.4995656  0.5004344 ]
 [0.00937609 0.99062391]]
Test [[0.50508475 0.49491525]
 [0.0105799  0.9894201 ]]


In [98]:
print(f'SVMlin cv scores are {cross_val_score(svmlin, X, y, cv= 5)}')

SVMlin cv scores are [0.97197983 0.97188288 0.97197712 0.97197712 0.97197712]
