#### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

#### Load Dataset 

In [2]:
# Load the raw dataset. The path is relative, so 'Madedata1.csv' must be in the
# notebook's working directory.
df = pd.read_csv('Madedata1.csv')

In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(2499, 12)

In [4]:
# Preview the first five rows to see column names and value formats.
df.head()

Unnamed: 0,Country,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient,Infected
0,China,10,Male,102,1,0,0,0,1,Mild,No,0
1,Italy,20,Male,103,1,1,0,0,0,Moderate,Not known,1
2,Iran,55,Transgender,99,0,0,0,1,1,Severe,No,0
3,Republic of Korean,37,Female,100,0,1,1,0,0,Mild,Yes,1
4,France,45,Male,101,1,1,1,1,0,Moderate,Yes,1


#### Feature Selection (Remove unwanted columns) 

In [5]:
# 'Country' is treated as an unwanted column and removed from the feature set.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'Country' has already been dropped from df.
df = df.drop(columns=['Country'], errors='ignore')

In [6]:
# List the distinct Gender categories that will need encoding.
df['Gender'].unique()

array(['Male', 'Transgender', 'Female'], dtype=object)

In [7]:
# List the distinct Severity categories that will need encoding.
df['Severity'].unique()

array(['Mild', 'Moderate', 'Severe'], dtype=object)

In [8]:
# List the distinct contact-history values; look for inconsistent spellings/casing.
df['Contact_with_covid_patient'].unique()

array(['No', 'Not known', 'Yes', 'yes'], dtype=object)

In [9]:
# The raw column mixes 'Yes' and 'yes'; lowercase every value so that they
# collapse into a single category before encoding.
contact_col = 'Contact_with_covid_patient'
df[contact_col] = df[contact_col].str.lower()

In [10]:
# Confirm that lowercasing merged the duplicate 'Yes'/'yes' categories.
df['Contact_with_covid_patient'].unique()

array(['no', 'not known', 'yes'], dtype=object)

#### Data Preprocessing 

In [11]:
# Preview the frame after dropping Country and normalising the contact column.
df.head()

Unnamed: 0,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient,Infected
0,10,Male,102,1,0,0,0,1,Mild,no,0
1,20,Male,103,1,1,0,0,0,Moderate,not known,1
2,55,Transgender,99,0,0,0,1,1,Severe,no,0
3,37,Female,100,0,1,1,0,0,Mild,yes,1
4,45,Male,101,1,1,1,1,0,Moderate,yes,1


In [12]:
# pd.get_dummies(df['Gender'])

In [13]:
# Pull the three categorical columns out as NumPy arrays so they can be encoded
# separately, then remove them from df so that only numeric columns remain.
categorical_cols = ['Gender', 'Severity', 'Contact_with_covid_patient']
gender, severity, contact = (df[col].values for col in categorical_cols)

df = df.drop(columns=categorical_cols)

In [14]:
# Map each gender string to an integer code. The fitted encoder is kept in
# gender_label so it can be pickled further down.
gender_label = LabelEncoder().fit(gender)
gender = gender_label.transform(gender)

In [15]:
# Integer-code the severity and contact strings, keeping one fitted encoder per
# column so each can be pickled further down.
severity_label = LabelEncoder().fit(severity)
contact_label = LabelEncoder().fit(contact)

severity = severity_label.transform(severity)
contact = contact_label.transform(contact)

In [16]:
# One-hot encode the integer gender codes. The encoder needs a 2-D input, so the
# 1-D code vector is turned into a single column first. The result is a sparse matrix.
onehot_1 = OneHotEncoder()
gender = onehot_1.fit_transform(gender[:, np.newaxis])

In [17]:
# Inspect the encoder output (a sparse matrix with one column per gender category).
gender

<2499x3 sparse matrix of type '<class 'numpy.float64'>'
	with 2499 stored elements in Compressed Sparse Row format>

In [18]:
# Convert the sparse one-hot matrix to a dense ndarray so it can be stacked next
# to the numeric feature columns later.
# NOTE: not re-runnable on its own -- after one execution `gender` is an ndarray,
# which has no .toarray() method.
gender = gender.toarray()

In [19]:
# Inspect the dense one-hot matrix: exactly one 1.0 per row.
gender

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [20]:
# One-hot encode the severity and contact codes with a dedicated encoder each
# (both encoders are pickled further down), densifying the results immediately.
onehot_2, onehot_3 = OneHotEncoder(), OneHotEncoder()

severity = onehot_2.fit_transform(severity[:, np.newaxis]).toarray()
contact = onehot_3.fit_transform(contact[:, np.newaxis]).toarray()

In [21]:
# The last remaining column (Infected) is the target; everything before it is a
# numeric feature.
n_feature_cols = df.shape[1] - 1
X = df.iloc[:, :n_feature_cols].to_numpy()
y = df.iloc[:, n_feature_cols].to_numpy()

In [22]:
# Append the one-hot blocks to the right of the numeric features, in the order
# gender, severity, contact.
X = np.hstack((X, gender, severity, contact))

In [23]:
# Spot-check the first assembled feature row (numeric columns, then one-hot blocks).
X[0]

array([ 10., 102.,   1.,   0.,   0.,   0.,   1.,   0.,   1.,   0.,   1.,
         0.,   0.,   1.,   0.,   0.])

In [24]:
# Confirm the feature-matrix dimensions after concatenation.
X.shape

(2499, 16)

In [25]:
# Scaler with default settings; the fitted instance is pickled further down.
minmax = MinMaxScaler()

In [26]:
# Fit the scaler on X and replace X with its rescaled version.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set minima/maxima influence the scaling (mild data
# leakage). Consider fitting on the training rows only.
X = minmax.fit_transform(X)

In [27]:
# Spot-check the first feature row after scaling.
X[0]

array([0.        , 0.66666667, 1.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 1.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        ])

#### Split data into training and test sets

In [28]:
# Hold out 25% of the rows for evaluation.
# random_state pins the shuffle so the split -- and therefore every metric
# reported below -- is reproducible across kernel restarts. stratify=y keeps the
# class ratio of the target the same in the training and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [29]:
# Size of the training partition.
x_train.shape

(1874, 16)

In [30]:
# Size of the held-out test partition.
x_test.shape

(625, 16)

In [31]:
# Train a logistic-regression classifier with default hyper-parameters on the
# training partition; fit() returns the estimator itself.
logistic = LogisticRegression().fit(x_train, y_train)
logistic

LogisticRegression()

In [32]:
# Predict class labels for the held-out rows.
y_pred = logistic.predict(x_test)

In [33]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9376

In [34]:
# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

array([[294,  24],
       [ 15, 292]], dtype=int64)

In [35]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [36]:
# Precision for the positive class (label 1): TP / (TP + FP).
precision_score(y_test, y_pred)

0.9240506329113924

In [37]:
# Per-class precision / recall / F1 and support; print() is needed because the
# report is returned as a pre-formatted multi-line string.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.92      0.94       318
           1       0.92      0.95      0.94       307

    accuracy                           0.94       625
   macro avg       0.94      0.94      0.94       625
weighted avg       0.94      0.94      0.94       625



In [38]:
# Persist the trained classifier. The context manager guarantees the file handle
# is closed even if pickling raises part-way through.
with open('covid_model.pkl', 'wb') as file:
    pkl.dump(logistic, file)

In [39]:
# Persist the gender one-hot encoder. The context manager guarantees the file
# handle is closed even if pickling raises.
# NOTE: 'onehot_1.pkl' is written again, with the same object, by a later cell
# that saves all three one-hot encoders.
with open('onehot_1.pkl', 'wb') as file:
    pkl.dump(onehot_1, file)

In [40]:
# Persist the three fitted label encoders (gender, severity, contact) so the
# same string -> integer mapping can be re-applied outside this notebook.
# Each file is opened in a context manager so the handle is closed even if
# pickling raises.
for filename, encoder in (
    ('label_1.pkl', gender_label),
    ('label_2.pkl', severity_label),
    ('label_3.pkl', contact_label),
):
    with open(filename, 'wb') as file:
        pkl.dump(encoder, file)

In [41]:
# Persist the three fitted one-hot encoders (gender, severity, contact).
# Each file is opened in a context manager so the handle is closed even if
# pickling raises.
for filename, encoder in (
    ('onehot_1.pkl', onehot_1),
    ('onehot_2.pkl', onehot_2),
    ('onehot_3.pkl', onehot_3),
):
    with open(filename, 'wb') as file:
        pkl.dump(encoder, file)

In [42]:
# Persist the fitted scaler so new inputs can be rescaled with the same
# per-column minima/maxima. The context manager guarantees the file handle is
# closed even if pickling raises.
with open('minmax.pkl', 'wb') as file:
    pkl.dump(minmax, file)

In [43]:
# Inspect the learned weights: one coefficient per column of X, in column order.
logistic.coef_

array([[ 2.13698075,  0.56311746,  3.31292597, -0.36584933,  1.92671665,
         0.58419682, -0.34484268,  0.05844139,  0.02338979, -0.08273487,
         0.09172536,  0.14017321, -0.23280226, -3.56912403,  0.10586052,
         3.46235981]])