### import the required packages

In [1]:
import numpy as np
import pandas as pd

### load the data

In [2]:
# Load the raw dataset; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='covid.csv')

### EDA

In [3]:
# Pairwise correlation of the numeric columns only. At this point the frame
# still holds non-numeric columns (dates, id); on pandas >= 2.0 a bare
# df.corr() no longer skips them silently and raises instead.
df.select_dtypes(include="number").corr()

Unnamed: 0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
sex,1.0,0.097025,-0.097029,-0.054758,0.036709,0.994293,0.008153,0.009339,0.010119,0.008096,0.009019,0.010155,0.008344,0.010199,0.008577,0.00498,-0.001791,-0.047575,-0.097024
patient_type,0.097025,1.0,-0.999319,-0.4204,0.325059,0.095472,0.002315,0.013227,0.01667,0.016359,0.000274,0.028232,0.016446,0.014967,0.01324,0.016603,0.228929,-0.135931,-0.999314
intubed,-0.097029,-0.999319,1.0,0.421256,-0.324869,-0.095481,-0.002456,-0.013363,-0.016807,-0.016424,-0.000419,-0.028389,-0.016587,-0.015101,-0.013384,-0.016738,-0.228811,0.135738,0.999989
pneumonia,-0.054758,-0.4204,0.421256,1.0,-0.183492,-0.054031,-0.002875,-0.008592,-0.01037,-0.010979,-0.001568,-0.019541,-0.010392,-0.01007,-0.008296,-0.009996,-0.074756,0.093832,0.421182
age,0.036709,0.325059,-0.324869,-0.183492,1.0,0.036239,0.000556,0.01444,0.018668,0.018625,-0.008617,0.018627,0.015049,0.012068,0.016643,0.017531,0.099339,-0.102643,-0.324791
pregnancy,0.994293,0.095472,-0.095481,-0.054031,0.036239,1.0,0.016239,0.018035,0.018931,0.016412,0.017474,0.017392,0.016854,0.018692,0.01697,0.01306,-0.003185,-0.047589,-0.095474
diabetes,0.008153,0.002315,-0.002456,-0.002875,0.000556,0.016239,1.0,0.838368,0.843817,0.79414,0.845727,0.688245,0.821716,0.765574,0.817052,0.777978,0.002895,0.005156,-0.002429
copd,0.009339,0.013227,-0.013363,-0.008592,0.01444,0.018035,0.838368,1.0,0.922057,0.86687,0.872321,0.748066,0.893562,0.827598,0.889774,0.844021,0.010322,0.000882,-0.013308
asthma,0.010119,0.01667,-0.016807,-0.01037,0.018668,0.018931,0.843817,0.922057,1.0,0.886307,0.883549,0.756712,0.906039,0.839128,0.901793,0.855136,0.010714,0.000276,-0.016759
inmsupr,0.008096,0.016359,-0.016424,-0.010979,0.018625,0.016412,0.79414,0.86687,0.886307,1.0,0.844534,0.800965,0.872862,0.804051,0.866491,0.821285,0.010554,-0.000914,-0.016299


In [4]:
 # removing rows with 'covid_res' = 3
# Element-wise "not equal" mask; the filtered frame is rebound to df.
df = df[df["covid_res"].ne(3)]

In [5]:
# Remove the columns that are not used as model inputs: the three date
# columns, the row id, and a few features left out of this analysis.
df = df.drop(
    columns=[
        'entry_date',
        'date_died',
        'date_symptoms',
        'id',
        'patient_type',
        'pregnancy',
        'inmsupr',
        'other_disease',
    ]
)
df.shape

(499692, 15)

In [6]:
# Count the missing values in every column.
df.isna().sum()

sex                    0
intubed                0
pneumonia              0
age                    0
diabetes               0
copd                   0
asthma                 0
hypertension           0
cardiovascular         0
obesity                0
renal_chronic          0
tobacco                0
contact_other_covid    0
covid_res              0
icu                    0
dtype: int64

In [7]:
# Recode the target so it becomes a 0/1 label: 1 stays 1, 2 becomes 0
# (3 -> 2 is kept from the original mapping; rows coded 3 were filtered out earlier).
# The result is assigned back explicitly: calling .replace(..., inplace=True) on
# df['covid_res'] is chained assignment, which warns on recent pandas and does not
# update df at all once Copy-on-Write is enabled.
df = df.assign(covid_res=df['covid_res'].replace({1: 1, 2: 0, 3: 2}))
df['covid_res'].value_counts().to_frame()

Unnamed: 0,covid_res
0,279035
1,220657


### decide the dependent and independent variables

In [8]:
# Feature matrix: the 14 predictor columns, in this fixed order.
x = df.loc[:, [
    'sex', 'intubed', 'pneumonia', 'age',
    'diabetes', 'copd', 'asthma', 'hypertension',
    'cardiovascular', 'obesity', 'renal_chronic', 'tobacco',
    'contact_other_covid', 'icu',
]]
# Target vector: the covid_res column.
y = df.loc[:, 'covid_res']


### split the data into train and test

In [9]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training, the remainder to testing; the fixed seed
# keeps the shuffled split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=253691,
)

## create and train the decision tree model

In [10]:
def create_dt_model(random_state=253691):
    """Build a decision tree classifier and fit it on the training split.

    Reads the notebook-level ``x_train`` and ``y_train`` produced by the
    train/test split cell.

    Parameters
    ----------
    random_state : int or None, default 253691
        Seed forwarded to ``DecisionTreeClassifier``. The default reuses the
        seed of the train/test split so that re-running the notebook fits the
        same tree; pass ``None`` to get an unseeded estimator.

    Returns
    -------
    DecisionTreeClassifier
        The fitted model.
    """
    from sklearn.tree import DecisionTreeClassifier

    # Seed the estimator: without it the fitted tree can differ between runs.
    model = DecisionTreeClassifier(random_state=random_state)

    model.fit(x_train, y_train)

    return model

## create and train the random forest model

In [11]:
def create_rf_model(n_estimators=50, random_state=253691):
    """Build a random forest classifier and fit it on the training split.

    Reads the notebook-level ``x_train`` and ``y_train`` produced by the
    train/test split cell.

    Parameters
    ----------
    n_estimators : int, default 50
        Number of trees in the forest.
    random_state : int or None, default 253691
        Seed forwarded to ``RandomForestClassifier``. The default reuses the
        seed of the train/test split so that re-running the notebook fits the
        same forest; pass ``None`` to get an unseeded estimator.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    from sklearn.ensemble import RandomForestClassifier

    # A forest is built from random resamples, so it must be seeded to be reproducible.
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
    )

    model.fit(x_train, y_train)

    return model

## create and train the XGBoost model 

In [12]:
def create_xgboost_model(**model_params):
    """Build an XGBoost classifier and fit it on the training split.

    Reads the notebook-level ``x_train`` and ``y_train`` produced by the
    train/test split cell.

    Parameters
    ----------
    **model_params
        Optional keyword arguments forwarded unchanged to ``XGBClassifier``
        (for example ``n_estimators=200``). When none are given the model is
        created with the library defaults; in particular no number of trees
        is set here.

    Returns
    -------
    XGBClassifier
        The fitted model.
    """
    from xgboost import XGBClassifier

    # Library defaults unless the caller overrides them.
    model = XGBClassifier(**model_params)

    model.fit(x_train, y_train)

    return model

## create and train KNN model

In [13]:
def create_knn_model():
    """Return a k-nearest-neighbours classifier (k = 2) fitted on the training split.

    Uses the notebook-level ``x_train`` and ``y_train``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier(n_neighbors=2)
    knn.fit(x_train, y_train)
    return knn

## create and train Logistic Regression Model

In [14]:
def create_lg_model():
    """Return a logistic regression classifier fitted on the training split.

    The solver is allowed up to 1000 iterations. Uses the notebook-level
    ``x_train`` and ``y_train``.
    """
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(x_train, y_train)
    return classifier

## create and train the AdaBoost model

In [15]:
def create_adaboost_model(random_state=253691, **model_params):
    """Build an AdaBoost classifier and fit it on the training split.

    Reads the notebook-level ``x_train`` and ``y_train`` produced by the
    train/test split cell.

    Parameters
    ----------
    random_state : int or None, default 253691
        Seed forwarded to ``AdaBoostClassifier``. The default reuses the seed
        of the train/test split so that re-running the notebook gives the
        same model; pass ``None`` to get an unseeded estimator.
    **model_params
        Optional keyword arguments forwarded unchanged to
        ``AdaBoostClassifier`` (for example ``n_estimators=200``). No number
        of estimators is set here, so the library default applies unless it
        is overridden.

    Returns
    -------
    AdaBoostClassifier
        The fitted model.
    """
    from sklearn.ensemble import AdaBoostClassifier

    # Library defaults plus a fixed seed, unless the caller overrides them.
    model = AdaBoostClassifier(random_state=random_state, **model_params)

    model.fit(x_train, y_train)

    return model

## create and train GradientBoost model

In [16]:
def create_gradientboost_model(random_state=253691, **model_params):
    """Build a gradient boosting classifier and fit it on the training split.

    Reads the notebook-level ``x_train`` and ``y_train`` produced by the
    train/test split cell.

    Parameters
    ----------
    random_state : int or None, default 253691
        Seed forwarded to ``GradientBoostingClassifier``. The default reuses
        the seed of the train/test split so that re-running the notebook
        gives the same model; pass ``None`` to get an unseeded estimator.
    **model_params
        Optional keyword arguments forwarded unchanged to
        ``GradientBoostingClassifier`` (for example ``n_estimators=200``).
        No number of estimators is set here, so the library default applies
        unless it is overridden.

    Returns
    -------
    GradientBoostingClassifier
        The fitted model.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    # Library defaults plus a fixed seed, unless the caller overrides them.
    model = GradientBoostingClassifier(random_state=random_state, **model_params)

    model.fit(x_train, y_train)

    return model

### evaluate the model

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_model(name, model):
    """Print evaluation metrics for ``model`` on the notebook-level test split.

    Predicts on ``x_test`` and compares against ``y_test``, printing in order:
    a header line, the confusion matrix, accuracy, precision, recall, F1 and
    the full classification report.

    Parameters
    ----------
    name : str
        Label shown in the header line.
    model :
        Fitted estimator exposing ``predict``.
    """
    predicted = model.predict(x_test)

    # Scalar metrics, listed in the order they are printed.
    scalar_metrics = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )

    print(f"-- {name} model evaluation --")
    print(confusion_matrix(y_test, predicted))
    for label, metric in scalar_metrics:
        print(f"{label} = {metric(y_test, predicted)}")
    print(classification_report(y_test, predicted))

In [18]:
# Fit the decision tree, then print its metrics on the test split.
model_dt = create_dt_model()
evaluate_model(name='Decision Tree', model=model_dt)

-- Decision Tree model evaluation --
[[46364  9289]
 [28436 15850]]
accuracy = 0.6225197370395942
precision = 0.630494450853256
recall = 0.3579009167682789
f1 = 0.4566078501980554
              precision    recall  f1-score   support

           0       0.62      0.83      0.71     55653
           1       0.63      0.36      0.46     44286

    accuracy                           0.62     99939
   macro avg       0.63      0.60      0.58     99939
weighted avg       0.62      0.62      0.60     99939



In [19]:
# Fit the random forest, then print its metrics on the test split.
model_rf = create_rf_model()
evaluate_model(name='Random Forest', model=model_rf)

-- Random Forest model evaluation --
[[45708  9945]
 [27503 16783]]
accuracy = 0.6252914277709403
precision = 0.6279182879377432
recall = 0.37896852278372395
f1 = 0.4726673613653646
              precision    recall  f1-score   support

           0       0.62      0.82      0.71     55653
           1       0.63      0.38      0.47     44286

    accuracy                           0.63     99939
   macro avg       0.63      0.60      0.59     99939
weighted avg       0.63      0.63      0.60     99939



In [20]:
# Fit the XGBoost classifier, then print its metrics on the test split.
model_xgb = create_xgboost_model()
evaluate_model(name='XGBoost', model=model_xgb)

-- XGBoost model evaluation --
[[46456  9197]
 [26664 17622]]
accuracy = 0.6411711143797717
precision = 0.6570714791752116
recall = 0.3979135618479881
f1 = 0.49566134589691296
              precision    recall  f1-score   support

           0       0.64      0.83      0.72     55653
           1       0.66      0.40      0.50     44286

    accuracy                           0.64     99939
   macro avg       0.65      0.62      0.61     99939
weighted avg       0.64      0.64      0.62     99939



In [21]:
# Fit the k-nearest-neighbours classifier, then print its metrics on the test split.
model_knn = create_knn_model()
evaluate_model(name='KNN', model=model_knn)

-- KNN model evaluation --
[[45084 10569]
 [31950 12336]]
accuracy = 0.574550475790232
precision = 0.5385723641126392
recall = 0.27855304159328004
f1 = 0.3671920346474975
              precision    recall  f1-score   support

           0       0.59      0.81      0.68     55653
           1       0.54      0.28      0.37     44286

    accuracy                           0.57     99939
   macro avg       0.56      0.54      0.52     99939
weighted avg       0.56      0.57      0.54     99939



In [22]:
# Fit the logistic regression classifier, then print its metrics on the test split.
model_lg = create_lg_model()
evaluate_model(name='Logistic Regression', model=model_lg)

-- Logistic Regression model evaluation --
[[47431  8222]
 [28677 15609]]
accuracy = 0.6307847787150161
precision = 0.6549872015442071
recall = 0.3524590163934426
f1 = 0.4582996902388537
              precision    recall  f1-score   support

           0       0.62      0.85      0.72     55653
           1       0.65      0.35      0.46     44286

    accuracy                           0.63     99939
   macro avg       0.64      0.60      0.59     99939
weighted avg       0.64      0.63      0.60     99939



In [23]:
# Fit the AdaBoost classifier, then print its metrics on the test split.
model_ada = create_adaboost_model()
evaluate_model(name='AdaBoost', model=model_ada)

-- AdaBoost model evaluation --
[[46627  9026]
 [27088 17198]]
accuracy = 0.638639570137784
precision = 0.6558114704087858
recall = 0.3883394300681931
f1 = 0.48781733087505325
              precision    recall  f1-score   support

           0       0.63      0.84      0.72     55653
           1       0.66      0.39      0.49     44286

    accuracy                           0.64     99939
   macro avg       0.64      0.61      0.60     99939
weighted avg       0.64      0.64      0.62     99939



In [None]:
import pickle as plk

# Persist the fitted XGBoost classifier to disk.
# The object to serialise is model_xgb (the name XGBModel is never defined in
# this notebook), and dump() returns None, so its result must not be assigned
# back to model_xgb.
with open('covid_model', 'wb') as file:
    plk.dump(model_xgb, file)