In [1]:
import pandas as pd
import numpy as np
import warnings
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [16]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [4]:
df = pd.read_csv('dataset/covid.csv', parse_dates=[3, 4])
df.head()

Unnamed: 0,id,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
0,16169f,2,1,2020-04-05,2020-02-05,9999-99-99,97,2,27,97,2,2,2,2,2,2,2,2,2,2,2,1,97
1,1009bf,2,1,2020-03-19,2020-03-17,9999-99-99,97,2,24,97,2,2,2,2,2,2,2,2,2,2,99,1,97
2,167386,1,2,2020-06-04,2020-01-04,9999-99-99,2,2,54,2,2,2,2,2,2,2,2,1,2,2,99,1,2
3,0b5948,2,2,2020-04-17,2020-10-04,9999-99-99,2,1,30,97,2,2,2,2,2,2,2,2,2,2,99,1,2
4,0d01b5,1,2,2020-04-13,2020-04-13,22-04-2020,2,2,60,2,1,2,2,2,1,2,1,2,2,2,99,1,2


In [5]:
df.shape

(566602, 23)

In [6]:
df.isnull().sum().any()

False

In [7]:
df.drop(['id','patient_type','pregnancy','contact_other_covid','other_disease'],inplace=True,axis=1)

In [8]:
df['new_column'] = (df['entry_date'] - df['date_symptoms']).dt.days
df['dead']='1'
df.loc[df['date_died'] == '9999-99-99','dead']='0'
df.drop(['entry_date','date_symptoms','date_died'],inplace=True,axis=1)
df = df.astype('int8')

In [9]:
df = df.loc[(df['sex'] <= 2) & (df['intubed'] <= 2) & (df['pneumonia'] <= 2) & (df['diabetes'] <= 2) & 
            (df['copd'] <= 2) & (df['asthma'] <= 2) & (df['inmsupr'] <= 2) & (df['hypertension'] <= 2) & 
            (df['cardiovascular'] <= 2) & (df['obesity'] <= 2) & (df['renal_chronic'] <= 2) & (df['tobacco'] <= 2) &
            (df['covid_res'] <= 2) & (df['dead'] <= 2) & (df['icu'] <= 2) & (df['new_column'] >= 0), 
            ['sex','age','inmsupr','pneumonia','diabetes','asthma','copd','hypertension','cardiovascular',
             'renal_chronic','obesity','tobacco','new_column','covid_res','intubed','icu','dead']]

In [10]:
df.loc[df['sex']==2,'sex']=0
df.loc[df['inmsupr']==2,'inmsupr']=0
df.loc[df['pneumonia']==2,'pneumonia']=0
df.loc[df['diabetes']==2,'diabetes']=0
df.loc[df['asthma']==2,'asthma']=0
df.loc[df['copd']==2,'copd']=0
df.loc[df['hypertension']==2,'hypertension']=0
df.loc[df['cardiovascular']==2,'cardiovascular']=0
df.loc[df['renal_chronic']==2,'renal_chronic']=0
df.loc[df['obesity']==2,'obesity']=0
df.loc[df['tobacco']==2,'tobacco']=0
df.loc[df['intubed']==2,'intubed']=0
df.loc[df['icu']==2,'icu']=0
df.loc[df['covid_res']==2,'covid_res']=0

In [11]:
df.head()

Unnamed: 0,sex,age,inmsupr,pneumonia,diabetes,asthma,copd,hypertension,cardiovascular,renal_chronic,obesity,tobacco,new_column,covid_res,intubed,icu,dead
3,0,30,0,1,0,0,0,0,0,0,0,0,86,1,0,0,0
4,1,60,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1
5,0,47,0,1,1,0,0,0,0,0,0,0,0,1,0,1,1
6,0,63,0,0,0,0,0,1,0,0,0,0,9,1,0,0,0
9,1,39,0,1,0,0,0,0,0,0,1,0,5,1,1,0,0


In [12]:
df = df.loc[df['covid_res'] == 1, ['sex','age','inmsupr','pneumonia','diabetes','asthma','copd','hypertension',
                                   'cardiovascular','renal_chronic','obesity','tobacco','new_column',
                                   'covid_res','intubed','icu','dead']]

In [13]:
X = df.drop(['icu','covid_res','dead','intubed'], axis=1)
y = df['icu']

In [14]:
y.value_counts()

0    49279
1     4276
Name: icu, dtype: int64

In [17]:
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

In [18]:
y_smote.value_counts()

1    49279
0    49279
Name: icu, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.25, random_state=0)

In [20]:
def evaluation_fun():
    print("train Accuracy = {}".format(accuracy_score(y_train, model.predict(X_train))))
    print("test Accuracy = {}".format(accuracy_score(y_test, results)))
    print("Confusion Matrix")
    print(confusion_matrix(y_test, results))
    print("Classification Report")
    print(classification_report(y_test, results))

In [21]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
results = model.predict(X_test)

In [22]:
evaluation_fun()

train Accuracy = 0.8974268784328581
test Accuracy = 0.8339285714285715
Confusion Matrix
[[ 9461  2758]
 [ 1334 11087]]
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.77      0.82     12219
           1       0.80      0.89      0.84     12421

    accuracy                           0.83     24640
   macro avg       0.84      0.83      0.83     24640
weighted avg       0.84      0.83      0.83     24640



In [23]:
model = XGBClassifier()
model.fit(X_train, y_train)
results = model.predict(X_test)



In [24]:
evaluation_fun()

train Accuracy = 0.7659162856137882
test Accuracy = 0.7508116883116883
Confusion Matrix
[[8590 3629]
 [2511 9910]]
Classification Report
              precision    recall  f1-score   support

           0       0.77      0.70      0.74     12219
           1       0.73      0.80      0.76     12421

    accuracy                           0.75     24640
   macro avg       0.75      0.75      0.75     24640
weighted avg       0.75      0.75      0.75     24640



In [25]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)
results = model.predict(X_test)

In [26]:
evaluation_fun()

train Accuracy = 0.820612029546254
test Accuracy = 0.7657061688311688
Confusion Matrix
[[ 7799  4420]
 [ 1353 11068]]
Classification Report
              precision    recall  f1-score   support

           0       0.85      0.64      0.73     12219
           1       0.71      0.89      0.79     12421

    accuracy                           0.77     24640
   macro avg       0.78      0.76      0.76     24640
weighted avg       0.78      0.77      0.76     24640



In [27]:
model = AdaBoostClassifier()
model.fit(X_train, y_train)
results = model.predict(X_test)

In [28]:
evaluation_fun()

train Accuracy = 0.6859222381557942
test Accuracy = 0.6905438311688312
Confusion Matrix
[[7973 4246]
 [3379 9042]]
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.65      0.68     12219
           1       0.68      0.73      0.70     12421

    accuracy                           0.69     24640
   macro avg       0.69      0.69      0.69     24640
weighted avg       0.69      0.69      0.69     24640



In [29]:
model = GradientBoostingClassifier(max_features='auto', loss='deviance',learning_rate=0.3, 
                                   max_depth=8,min_samples_leaf=3,min_samples_split=0.1, n_estimators=400, subsample=0.4)
model.fit(X_train, y_train
results = model.predict(X_test)

In [30]:
evaluation_fun()

train Accuracy = 0.7201628831948916
test Accuracy = 0.7158685064935065
Confusion Matrix
[[8503 3716]
 [3285 9136]]
Classification Report
              precision    recall  f1-score   support

           0       0.72      0.70      0.71     12219
           1       0.71      0.74      0.72     12421

    accuracy                           0.72     24640
   macro avg       0.72      0.72      0.72     24640
weighted avg       0.72      0.72      0.72     24640



In [31]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
results = model.predict(X_test)

In [32]:
evaluation_fun()

train Accuracy = 0.6997348413106416
test Accuracy = 0.7023944805194805
Confusion Matrix
[[8266 3953]
 [3380 9041]]
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.68      0.69     12219
           1       0.70      0.73      0.71     12421

    accuracy                           0.70     24640
   macro avg       0.70      0.70      0.70     24640
weighted avg       0.70      0.70      0.70     24640



In [33]:
model = KNeighborsClassifier(n_neighbors=5,weights='distance',p=1,metric='minkowski')
model.fit(X_train, y_train)
results = model.predict(X_test)

In [34]:
evaluation_fun()

train Accuracy = 0.8881192672961931
test Accuracy = 0.7883522727272727
Confusion Matrix
[[ 8443  3776]
 [ 1439 10982]]
Classification Report
              precision    recall  f1-score   support

           0       0.85      0.69      0.76     12219
           1       0.74      0.88      0.81     12421

    accuracy                           0.79     24640
   macro avg       0.80      0.79      0.79     24640
weighted avg       0.80      0.79      0.79     24640

