In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Read the dataset from a CSV file located relative to the working directory.
csv_path = 'covid_data.csv'
df = pd.read_csv(csv_path)

In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(2499, 12)

In [4]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Country,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient,Infected
0,China,10,Male,102,1,0,0,0,1,Mild,No,0
1,Italy,20,Male,103,1,1,0,0,0,Moderate,Not known,1
2,Iran,55,Transgender,99,0,0,0,1,1,Severe,No,0
3,Republic of Korean,37,Female,100,0,1,1,0,0,Mild,Yes,1
4,France,45,Male,101,1,1,1,1,0,Moderate,Yes,1


In [5]:
# Remove the Country column before modelling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError because
# 'Country' has already been removed.
df = df.drop(columns=['Country'], errors='ignore')

In [6]:
# Confirm the Country column is gone by previewing the first five rows.
df.head(n=5)

Unnamed: 0,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient,Infected
0,10,Male,102,1,0,0,0,1,Mild,No,0
1,20,Male,103,1,1,0,0,0,Moderate,Not known,1
2,55,Transgender,99,0,0,0,1,1,Severe,No,0
3,37,Female,100,0,1,1,0,0,Mild,Yes,1
4,45,Male,101,1,1,1,1,0,Moderate,Yes,1


In [8]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

Age                           0
Gender                        0
fever                         0
Bodypain                      0
Runny_nose                    0
Difficulty_in_breathing       0
Nasal_congestion              0
Sore_throat                   0
Severity                      0
Contact_with_covid_patient    0
Infected                      0
dtype: int64

In [9]:
# Frequency of each Gender category.
# Uses the Series method: the top-level pd.value_counts() is deprecated.
df['Gender'].value_counts()

Male           1257
Female         1208
Transgender      34
Name: Gender, dtype: int64

In [10]:
# Frequency of each Severity category.
# Uses the Series method: the top-level pd.value_counts() is deprecated.
df['Severity'].value_counts()

Mild        1591
Moderate     525
Severe       383
Name: Severity, dtype: int64

In [11]:
# Frequency of each raw Contact_with_covid_patient value (before any cleanup).
# Uses the Series method: the top-level pd.value_counts() is deprecated.
df['Contact_with_covid_patient'].value_counts()

No           1203
Yes           638
Not known     633
yes            25
Name: Contact_with_covid_patient, dtype: int64

In [12]:
# Lower-case every value in the contact column so that spellings differing only
# in letter case are counted as the same category.
contact_col = 'Contact_with_covid_patient'
df[contact_col] = df[contact_col].str.lower()

In [13]:
# Frequency of each Contact_with_covid_patient value in the column's current state.
# Uses the Series method: the top-level pd.value_counts() is deprecated.
df['Contact_with_covid_patient'].value_counts()

no           1203
yes           663
not known     633
Name: Contact_with_covid_patient, dtype: int64

In [14]:
# Map each Gender string to an integer code.
# fit() returns the encoder itself, so it can be bound directly.
genderLabel = LabelEncoder().fit(df['Gender'])
gender = genderLabel.transform(df['Gender'])

In [15]:
# Inspect the current value of `gender`.
gender

array([1, 1, 2, ..., 0, 0, 0])

In [17]:
# One-hot encode the integer gender codes.
# The codes are recomputed from the fitted label encoder instead of being read
# from `gender`: the original fed `gender` back into its own assignment, so
# running this cell twice silently one-hot encoded the previous one-hot output.
# Requires df['Gender'] to still be present (i.e. run before that column is dropped).
genderOneHot = OneHotEncoder()
gender = genderOneHot.fit_transform(
    genderLabel.transform(df['Gender']).reshape(-1, 1)
)

In [18]:
# Inspect the current value of `gender`.
gender

<2499x3 sparse matrix of type '<class 'numpy.float64'>'
	with 2499 stored elements in Compressed Sparse Row format>

In [19]:
# Convert the sparse one-hot matrix to a dense NumPy array.
# Guarded so that re-running the cell is a no-op: once `gender` is already a
# dense ndarray it has no .toarray() and the unguarded call raised AttributeError.
if hasattr(gender, 'toarray'):
    gender = gender.toarray()

In [20]:
# Inspect the current value of `gender`.
gender

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [21]:
# Two-step encoding of Severity: string labels -> integer codes -> one-hot columns.
severityLabel = LabelEncoder()
severity_codes = severityLabel.fit_transform(df['Severity'])

# OneHotEncoder expects a 2-D input, so present the codes as a single column.
severityOneHot = OneHotEncoder()
severity_sparse = severityOneHot.fit_transform(severity_codes[:, np.newaxis])
severity = severity_sparse.toarray()

In [22]:
# Two-step encoding of Contact_with_covid_patient:
# string labels -> integer codes -> one-hot columns.
contactLabel = LabelEncoder()
contact_codes = contactLabel.fit_transform(df['Contact_with_covid_patient'])

# OneHotEncoder expects a 2-D input, so present the codes as a single column.
contactOneHot = OneHotEncoder()
contact_sparse = contactOneHot.fit_transform(contact_codes[:, np.newaxis])
contact = contact_sparse.toarray()

In [23]:
# Inspect the dense one-hot array built for Severity.
severity

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [24]:
# Inspect the dense one-hot array built for Contact_with_covid_patient.
contact

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [25]:
# Remove the original categorical columns now that their one-hot encodings are
# held in `gender`, `severity` and `contact`.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: a second run no longer raises KeyError for columns that are
# already gone.
df = df.drop(
    columns=['Gender', 'Severity', 'Contact_with_covid_patient'],
    errors='ignore',
)

In [26]:
# Preview the first five rows after the categorical columns were removed.
df.head(n=5)

Unnamed: 0,Age,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Infected
0,10,102,1,0,0,0,1,0
1,20,103,1,1,0,0,0,1
2,55,99,0,0,0,1,1,0
3,37,100,0,1,1,0,0,1
4,45,101,1,1,1,1,0,1


In [27]:
# Separate the feature matrix from the target.
# The target is selected by name rather than by position, so the split stays
# correct even if the column order of `df` changes; to_numpy() is the
# recommended replacement for .values.
X = df.drop(columns=['Infected']).to_numpy()
y = df['Infected'].to_numpy()

In [28]:
# Build the final feature matrix: the remaining df columns (all except the
# Infected target) followed by the one-hot blocks for gender, severity and contact.
# The base columns are taken from `df` rather than from `X` itself: the original
# `X = np.c_[X, ...]` appended the one-hot columns again on every re-run of the
# cell, silently widening X.
X = np.c_[df.drop(columns=['Infected']).to_numpy(), gender, severity, contact]

In [29]:
# Inspect the first row of the assembled feature matrix.
X[0, :]

array([ 10., 102.,   1.,   0.,   0.,   0.,   1.,   0.,   1.,   0.,   1.,
         0.,   0.,   1.,   0.,   0.])

In [30]:
# Hold out 25% of the rows for evaluation.
# A fixed seed makes the split, and therefore every metric computed from it,
# reproducible across kernel restarts; without it each run shuffles differently.
SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED
)

In [36]:
# Train a logistic-regression classifier on the training split.
# max_iter is raised to 1000 iterations for the solver.
logistic = LogisticRegression(max_iter=1000)
logistic.fit(X=x_train, y=y_train)

LogisticRegression(max_iter=1000)

In [37]:
# Predict class labels for the held-out rows.
y_pred = logistic.predict(X=x_test)

In [38]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9472

In [40]:
# Confusion matrix on the held-out rows (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[298,  23],
       [ 10, 294]], dtype=int64)

In [41]:
# Number of correctly classified test rows = sum of the confusion-matrix diagonal.
# Computed from the live predictions instead of hand-typed counts, which go
# stale whenever the split or the model changes.
int(np.trace(confusion_matrix(y_test, y_pred)))

592

In [42]:
# Number of misclassified test rows = all entries minus the diagonal of the
# confusion matrix. Computed from the live predictions instead of hand-typed
# counts, which go stale whenever the split or the model changes.
cm = confusion_matrix(y_test, y_pred)
int(cm.sum() - np.trace(cm))

33

In [44]:
# Accuracy re-derived by hand from the confusion matrix: correct / total.
# Computed from the live predictions instead of hand-typed counts, which go
# stale whenever the split or the model changes.
cm = confusion_matrix(y_test, y_pred)
float(np.trace(cm) / cm.sum())

0.9472

In [46]:
# Per-class precision / recall / F1 on the held-out rows.
# The report is a preformatted string, so print() is needed to render its line breaks.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       321
           1       0.93      0.97      0.95       304

    accuracy                           0.95       625
   macro avg       0.95      0.95      0.95       625
weighted avg       0.95      0.95      0.95       625



In [47]:
# Inspect the coefficients learned by the fitted `logistic` model.
logistic.coef_

array([[ 0.03435601,  0.06821719,  3.44485883, -0.2059226 ,  2.01086447,
         0.60091169, -0.38943909,  0.1130023 , -0.05563998, -0.11820126,
         0.18196277,  0.35112609, -0.59392779, -3.5141343 ,  0.28103929,
         3.17225608]])

In [51]:
# import statsmodels.api as sm

In [52]:
# NOTE(review): disabled statsmodels experiment, kept for reference. As written it
# would fit the GLM on the held-out split (y_test / x_test), not the training
# split, and it needs the commented-out `import statsmodels.api as sm` restored.
# model = sm.GLM(y_test, sm.add_constant(x_test), family=sm.families.Binomial())
# results = model.fit()
# results.summary()