# Exploring K-Fold using Survival

In [1]:
import pandas as pd
from datetime import datetime
from datetime import date
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

# Load data and create subsets
### Observation Data

In [2]:
observations_df = pd.read_csv('C:\\Projects\\healthdata\\train\\observations.csv')
covid19_status_df = observations_df.loc[(observations_df['CODE'] == '94531-1')]
covid19_status2_df = observations_df.loc[(observations_df['CODE'] == '94531-1') & (observations_df['VALUE'].str.contains('Detected'))]

In [3]:
covid19_status_df.shape

(105548, 8)

In [4]:
covid19_status2_df.shape

(73697, 8)

### Patient Data 

In [5]:
patients_df = pd.read_csv('C:\\Projects\\healthdata\\train\\patients.csv')
patients_subset_df = pd.DataFrame([patients_df.Id, patients_df.BIRTHDATE, patients_df.DEATHDATE, patients_df.MARITAL, patients_df.RACE, patients_df.ETHNICITY, patients_df.GENDER]).transpose()
covid19_patients_df = patients_subset_df.loc[(patients_subset_df['Id'].isin(covid19_status2_df['PATIENT']))]

#### Add column to covid19_patients_df to track patients who survived: '1' = survived, '0' = died

In [6]:
covid19_patients_df["SURVIVAL"] = covid19_patients_df["DEATHDATE"].apply(lambda x: 1 if pd.isnull(x) else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


#### Calculate Age for each patient, use DEATHDATE if not null, else use today's date.

In [7]:
def calculate_age(born, death):
    born = datetime.strptime(born, "%Y-%m-%d").date()
    if pd.isnull(death):
        today = date.today()
    else:        
        today = datetime.strptime(death, "%Y-%m-%d").date()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

covid19_patients_df['AGE'] = covid19_patients_df.apply(lambda x: calculate_age(x.BIRTHDATE, x.DEATHDATE), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


### Encounters Date: Contains Hospitalizations

In [8]:
encounters_df = pd.read_csv('C:\\Projects\\healthdata\\train\\encounters.csv')

#### Create DataFrame with only COVID-19 hospitalizations.
#### Create a subset DataFrame that only contains the following columns: Id, START, STOP, PATIENT, CODE, DESCRIPTION, REASONCODE, REASONDESCRIPTION.

In [9]:
covid19_hosp_df = encounters_df.loc[(encounters_df['REASONCODE'] == 840539006)] 
covid19_hosp_subset_df = pd.DataFrame([covid19_hosp_df.Id, covid19_hosp_df.START, covid19_hosp_df.STOP, covid19_hosp_df.PATIENT, covid19_hosp_df.CODE, covid19_hosp_df.DESCRIPTION, covid19_hosp_df.REASONCODE, covid19_hosp_df.REASONDESCRIPTION]).transpose()

#### Insert column containing number of days hospitalized into the COVID-19 hospital dataframe.

In [10]:
covid19_hosp_subset_df['START'] = pd.to_datetime(covid19_hosp_subset_df['START'])
covid19_hosp_subset_df['STOP'] = pd.to_datetime(covid19_hosp_subset_df['STOP'])
covid19_hosp_subset_df.insert(3, 'DAYSHOSPITALIZED', (covid19_hosp_subset_df['STOP'] - covid19_hosp_subset_df['START']))
covid19_patients_df = pd.merge(covid19_patients_df, covid19_hosp_subset_df[['DAYSHOSPITALIZED', 'PATIENT']], how='left', left_on='Id', right_on='PATIENT')
covid19_patients_df.drop(columns='PATIENT', inplace = True)
covid19_patients_df['DAYSHOSPITALIZED'] = pd.to_numeric(covid19_patients_df['DAYSHOSPITALIZED'].dt.days, downcast='integer')
covid19_patients_df['DAYSHOSPITALIZED'] = covid19_patients_df['DAYSHOSPITALIZED'].fillna(0)

### Conditions Data

In [11]:
conditions_df = pd.read_csv('C:\\Projects\\healthdata\\train\\conditions.csv')

In [12]:
groupby_conditions_code = conditions_df.groupby(['CODE', 'DESCRIPTION'])['PATIENT'].count()
pd.set_option('display.max_rows', 236)
print(groupby_conditions_code)

CODE             DESCRIPTION                                                                     
1734006          Fracture of the vertebral column with spinal cord injury                              123
5602001          Opioid abuse (disorder)                                                             10924
6072007          Bleeding from anus                                                                    705
7200002          Alcoholism                                                                          13882
10509002         Acute bronchitis (disorder)                                                          6356
11218009         Infection caused by Pseudomonas aeruginosa                                             13
15724005         Fracture of vertebral column without spinal cord injury                                18
15777000         Prediabetes                                                                         55868
16114001         Fracture of ankle            

In [13]:
groupby_conditions_code.shape

(192,)

#### Create subset containing patients with diabetes 

In [13]:
diabetes_status_df = conditions_df.loc[(conditions_df['CODE'] == 44054006)]

#### Merge Conditions DESCRIPTION column with covid patient data

In [14]:
covid19_patients_df = pd.merge(covid19_patients_df, diabetes_status_df[['DESCRIPTION', 'PATIENT']], how='left', left_on='Id', right_on='PATIENT')
covid19_patients_df.drop(columns='PATIENT', inplace = True)

In [15]:
covid19_patients_df.isnull().sum().sort_values(ascending = False)

DEATHDATE           70473
DESCRIPTION         67353
MARITAL              6604
DAYSHOSPITALIZED        0
AGE                     0
SURVIVAL                0
GENDER                  0
ETHNICITY               0
RACE                    0
BIRTHDATE               0
Id                      0
dtype: int64

In [16]:
covid19_patients_df.DESCRIPTION.value_counts()

Diabetes    17807
Name: DESCRIPTION, dtype: int64

In [17]:
covid19_patients_df.shape

(85160, 11)

#### Add column to covid19_patients_df to track patients who have Diabetes: '1' = Diabetes, '0' = Does Not have Diabetes

In [18]:
covid19_patients_df["DIABETES"] = covid19_patients_df["DESCRIPTION"].apply(lambda x: 0 if pd.isnull(x) else 1)

# Preparing dataframe for Logistic model

In [19]:
covid19_patients_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85160 entries, 0 to 85159
Data columns (total 12 columns):
Id                  85160 non-null object
BIRTHDATE           85160 non-null object
DEATHDATE           14687 non-null object
MARITAL             78556 non-null object
RACE                85160 non-null object
ETHNICITY           85160 non-null object
GENDER              85160 non-null object
SURVIVAL            85160 non-null int64
AGE                 85160 non-null int64
DAYSHOSPITALIZED    85160 non-null float64
DESCRIPTION         17807 non-null object
DIABETES            85160 non-null int64
dtypes: float64(1), int64(3), object(8)
memory usage: 8.4+ MB


In [20]:
# Convert 'GENDER' variable to integer form!
covid19_patients_df.loc[(covid19_patients_df.GENDER == "M"),'GENDER'] = 0
covid19_patients_df.loc[(covid19_patients_df.GENDER == "F"),'GENDER'] = 1

# Convert 'ETHNICITY' variable to integer form!
covid19_patients_df.loc[(covid19_patients_df.ETHNICITY == "nonhispanic"),'ETHNICITY'] = 0
covid19_patients_df.loc[(covid19_patients_df.ETHNICITY == "hispanic"),'ETHNICITY'] = 1

# Convert 'RACE' variable to integer form!
covid19_patients_df.loc[(covid19_patients_df.RACE == "white"),'RACE'] = 1
covid19_patients_df.loc[(covid19_patients_df.RACE == "black"),'RACE'] = 2
covid19_patients_df.loc[(covid19_patients_df.RACE == "asian"),'RACE'] = 3
covid19_patients_df.loc[(covid19_patients_df.RACE == "other"),'RACE'] = 4
covid19_patients_df.loc[(covid19_patients_df.RACE == "native"),'RACE'] = 5

In [21]:
covid19_patients_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85160 entries, 0 to 85159
Data columns (total 12 columns):
Id                  85160 non-null object
BIRTHDATE           85160 non-null object
DEATHDATE           14687 non-null object
MARITAL             78556 non-null object
RACE                85160 non-null int64
ETHNICITY           85160 non-null int64
GENDER              85160 non-null int64
SURVIVAL            85160 non-null int64
AGE                 85160 non-null int64
DAYSHOSPITALIZED    85160 non-null float64
DESCRIPTION         17807 non-null object
DIABETES            85160 non-null int64
dtypes: float64(1), int64(6), object(5)
memory usage: 8.4+ MB


In [22]:
# We'll drop the following features for now, but more to follow...
covid19_patients_df.drop(['MARITAL', 'DESCRIPTION'], axis = 1, inplace = True)

In [23]:
# We'll also drop the following features
covid19_patients_df.drop(['Id', 'BIRTHDATE', 'DEATHDATE'], axis = 1, inplace = True)

# Running Logistics model based on the following columns

In [24]:
covid19_patients_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85160 entries, 0 to 85159
Data columns (total 7 columns):
RACE                85160 non-null int64
ETHNICITY           85160 non-null int64
GENDER              85160 non-null int64
SURVIVAL            85160 non-null int64
AGE                 85160 non-null int64
DAYSHOSPITALIZED    85160 non-null float64
DIABETES            85160 non-null int64
dtypes: float64(1), int64(6)
memory usage: 5.2 MB


In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(covid19_patients_df.drop(['SURVIVAL'], axis = 1), 
                                                    covid19_patients_df['SURVIVAL'], test_size = 0.4, 
                                                    random_state = 2)

In [26]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51096 entries, 57465 to 72173
Data columns (total 6 columns):
RACE                51096 non-null int64
ETHNICITY           51096 non-null int64
GENDER              51096 non-null int64
AGE                 51096 non-null int64
DAYSHOSPITALIZED    51096 non-null float64
DIABETES            51096 non-null int64
dtypes: float64(1), int64(5)
memory usage: 2.7 MB


In [27]:
X_train.shape

(51096, 6)

In [28]:
type(y_train)

pandas.core.series.Series

In [29]:
y_train

57465    1
21187    1
39621    1
25422    1
7441     1
        ..
33867    1
84434    0
31019    1
44566    0
72173    1
Name: SURVIVAL, Length: 51096, dtype: int64

In [30]:
from sklearn.linear_model import LogisticRegression
logisticRegression = LogisticRegression(max_iter = 10000)
logisticRegression.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# Predict!
predictions = logisticRegression.predict(X_test)

In [32]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, predictions))

[[  554  5300]
 [  420 27790]]


In [33]:
accuracy = (554 + 27790) / (554 + 5300 + 420 + 27790)
print('accuracy is: ' + str(round(accuracy, 2)))

accuracy is: 0.83


In [34]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34064 entries, 77515 to 25384
Data columns (total 6 columns):
RACE                34064 non-null int64
ETHNICITY           34064 non-null int64
GENDER              34064 non-null int64
AGE                 34064 non-null int64
DAYSHOSPITALIZED    34064 non-null float64
DIABETES            34064 non-null int64
dtypes: float64(1), int64(5)
memory usage: 1.8 MB


In [35]:
logisticRegression.coef_

array([[ 0.001671  , -0.08476004,  0.39892351, -0.07203332, -0.02223663,
        -0.38689691]])

In [36]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.57      0.09      0.16      5854
           1       0.84      0.99      0.91     28210

    accuracy                           0.83     34064
   macro avg       0.70      0.54      0.53     34064
weighted avg       0.79      0.83      0.78     34064



In [37]:
# SVC - Support Vector Classifier
from sklearn.svm import SVC

# X_train, X_test, y_train, y_test = train_test_split(covid19_patients_df.drop(['SURVIVAL'], axis = 1), 
#                                                     covid19_patients_df['SURVIVAL'], random_state = 0)

In [38]:
svm = SVC(C=100, gamma='auto')

In [40]:
svm.fit(X_train, y_train)
print("Test set accuracy: {:.2f}".format(svm.score(X_test, y_test)))

Test set accuracy: 0.90


In [41]:
from sklearn.preprocessing import MinMaxScaler

# preprocessing using 0-1 scaling
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# learning an SVM on the scaled training data
svm.fit(X_train_scaled, y_train)

# scoring on the scaled test set
print("Scaled test set accuracy: {:.2f}".format(
    svm.score(X_test_scaled, y_test)))

Scaled test set accuracy: 0.90


In [42]:
# preprocessing using zero mean and unit variance scaling/ compressing the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# learning an SVM on the scaled training data
svm.fit(X_train_scaled, y_train)

# scoring on the scaled test set
print("SVM test accuracy: {:.2f}".format(svm.score(X_test_scaled, y_test)))

SVM test accuracy: 0.90


In [39]:
y = covid19_patients_df['SURVIVAL']
X = covid19_patients_df.drop(['SURVIVAL'], axis = 1)

In [40]:
y

0        1
1        1
2        1
3        1
4        1
        ..
85155    0
85156    0
85157    0
85158    1
85159    1
Name: SURVIVAL, Length: 85160, dtype: int64

In [41]:
X

Unnamed: 0,RACE,ETHNICITY,GENDER,AGE,DAYSHOSPITALIZED,DIABETES
0,1,0,1,42,0.0,0
1,1,0,0,30,17.0,0
2,1,0,1,36,20.0,1
3,2,0,0,24,0.0,0
4,1,1,1,60,0.0,0
...,...,...,...,...,...,...
85155,1,0,0,56,7.0,1
85156,1,0,0,56,3.0,1
85157,1,0,0,56,0.0,1
85158,1,0,1,68,0.0,1


In [42]:
from sklearn.model_selection import cross_val_score

from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
scores = cross_val_score(LogisticRegression(), X, y)
print('Cross-Validation Accuracy Scores for Logistic Regression', scores)

Cross-Validation Accuracy Scores for Logistic Regression [0.83027442 0.82995737 0.83245262]


In [43]:
scores = cross_val_score(LogisticRegression(), X, y, cv=5)
print('Cross-Validation Accuracy Scores for Logistic Regression', scores)

Cross-Validation Accuracy Scores for Logistic Regression [0.83085775 0.83115129 0.82979098 0.83107275 0.83165991]


In [44]:
from sklearn.ensemble import RandomForestClassifier

scores = cross_val_score(RandomForestClassifier(n_estimators=40), X, y)
print('Cross-Validation Accuracy Scores for Random Forest Classifier', scores)

Cross-Validation Accuracy Scores for Random Forest Classifier [0.89322577 0.89202804 0.89294018]


In [45]:
scores = cross_val_score(RandomForestClassifier(n_estimators=15), X, y)
print('Cross-Validation Accuracy Scores for Random Forest Classifier', scores)

Cross-Validation Accuracy Scores for Random Forest Classifier [0.892486   0.89157008 0.89325724]


In [46]:
scores = cross_val_score(RandomForestClassifier(n_estimators=35), X, y, cv=5)
print('Cross-Validation Accuracy Scores for Random Forest Classifier', scores)

Cross-Validation Accuracy Scores for Random Forest Classifier [0.89584923 0.89062408 0.89466886 0.89184428 0.895837  ]


In [47]:
scores = cross_val_score(SVC(), X, y)
print('Cross-Validation Accuracy Scores for SVM', scores)

Cross-Validation Accuracy Scores for SVM [0.90182126 0.9002008  0.90065525]


In [61]:
scores = cross_val_score(SVC(), X, y, cv=5)
print('Cross-Validation Accuracy Scores for SVM', scores)

Cross-Validation Accuracy Scores for SVM [0.90330535 0.89984148 0.89995303 0.89941871 0.90229581]
