In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the raw records from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="covid_data.csv")

In [3]:
# Show the frame's dimensions as (rows, columns).
df.shape

(2499, 12)

In [4]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,Country,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient,Infected
0,China,10,Male,102,1,0,0,0,1,Mild,No,0
1,Italy,20,Male,103,1,1,0,0,0,Moderate,Not known,1
2,Iran,55,Transgender,99,0,0,0,1,1,Severe,No,0
3,Republic of Korean,37,Female,100,0,1,1,0,0,Mild,Yes,1
4,France,45,Male,101,1,1,1,1,0,Moderate,Yes,1


In [5]:
# Count how often each Gender value occurs. The Series method is used because
# the top-level pd.value_counts() helper is deprecated in recent pandas releases.
df['Gender'].value_counts()

Male           1257
Female         1208
Transgender      34
Name: Gender, dtype: int64

In [6]:
# Count how often each Severity value occurs. The Series method is used because
# the top-level pd.value_counts() helper is deprecated in recent pandas releases.
df['Severity'].value_counts()

Mild        1591
Moderate     525
Severe       383
Name: Severity, dtype: int64

In [7]:
# Count the raw contact answers to spot inconsistent spellings/casing.
# The Series method is used because the top-level pd.value_counts() helper is
# deprecated in recent pandas releases.
df['Contact_with_covid_patient'].value_counts()

No           1203
Yes           638
Not known     633
yes            25
Name: Contact_with_covid_patient, dtype: int64

In [8]:
# Normalise the casing of the contact answers so that variants such as
# 'Yes' and 'yes' are counted as a single category.
df = df.assign(
    Contact_with_covid_patient=df['Contact_with_covid_patient'].str.lower()
)

In [9]:
# Re-count the contact answers after lower-casing to confirm the categories merged.
# The Series method is used because the top-level pd.value_counts() helper is
# deprecated in recent pandas releases.
df['Contact_with_covid_patient'].value_counts()

no           1203
yes           663
not known     633
Name: Contact_with_covid_patient, dtype: int64

In [10]:
# Remove the Country column. Rebinding instead of dropping in place, together
# with errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
df = df.drop(columns=['Country'], errors='ignore')

In [11]:
# One encoder per categorical column, each kept under its own name so the
# fitted encoder stays available after this cell.
gender_label = LabelEncoder()
severe_label = LabelEncoder()
contact_label = LabelEncoder()

# Replace the three categorical columns with their integer codes in one step.
df = df.assign(
    Gender=gender_label.fit_transform(df['Gender']),
    Severity=severe_label.fit_transform(df['Severity']),
    Contact_with_covid_patient=contact_label.fit_transform(
        df['Contact_with_covid_patient']
    ),
)

In [12]:
# Preview the first five rows to check the integer-encoded columns.
df.head(n=5)

Unnamed: 0,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient,Infected
0,10,1,102,1,0,0,0,1,0,0,0
1,20,1,103,1,1,0,0,0,1,1,1
2,55,2,99,0,0,0,1,1,2,0,0
3,37,0,100,0,1,1,0,0,0,2,1
4,45,1,101,1,1,1,1,0,1,2,1


In [13]:
# Count the encoded Gender codes; the totals should match the pre-encoding counts.
# The Series method is used because the top-level pd.value_counts() helper is
# deprecated in recent pandas releases.
df['Gender'].value_counts()

1    1257
0    1208
2      34
Name: Gender, dtype: int64

In [17]:
# One-hot encode the integer Gender codes. dtype is pinned to uint8 (the
# default before pandas 2.0) so the indicators are 0/1 on every pandas
# version; pandas >= 2.0 would otherwise produce booleans.
gender_df = pd.get_dummies(df['Gender'], dtype=np.uint8)
# Columns come out ordered by code (0, 1, 2); label them with the category
# each code stands for.
gender_df.columns = ['Female', 'Male', 'Trans']

In [18]:
# Preview the first five rows of the Gender indicator columns.
gender_df.head(n=5)

Unnamed: 0,Female,Male,Trans
0,0,1,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,1,0


In [19]:
# One-hot encode the integer Severity codes. dtype is pinned to uint8 (the
# default before pandas 2.0) so the indicators are 0/1 on every pandas
# version; pandas >= 2.0 would otherwise produce booleans.
severity_df = pd.get_dummies(df['Severity'], dtype=np.uint8)
# Columns come out ordered by code (0, 1, 2); label them with the category
# each code stands for.
severity_df.columns = ['Mild', 'Moderate', 'Severe']

# Same treatment for the contact-history codes.
contact_df = pd.get_dummies(df['Contact_with_covid_patient'], dtype=np.uint8)
contact_df.columns = ['No', 'Not Known', 'Yes']

In [20]:
# Preview the first five rows of the contact-history indicator columns.
contact_df.head(n=5)

Unnamed: 0,No,Not Known,Yes
0,1,0,0
1,0,1,0
2,1,0,0
3,0,0,1
4,0,0,1


In [21]:
# Pull the target column out of the frame; pop() returns it and removes it
# from df in the same step.
y = df.pop('Infected')

# The integer-coded categorical columns are dropped as well, leaving only the
# remaining feature columns in df.
df.drop(
    columns=['Gender', 'Severity', 'Contact_with_covid_patient'],
    inplace=True,
)

In [26]:
# Assemble the feature matrix by placing the remaining df columns and the
# three indicator frames side by side (aligned on the row index).
X = pd.concat(
    [df, gender_df, severity_df, contact_df],
    axis='columns',
)

In [27]:
# Preview the first five rows of the assembled feature matrix.
X.head(n=5)

Unnamed: 0,Age,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Female,Male,Trans,Mild,Moderate,Severe,No,Not Known,Yes
0,10,102,1,0,0,0,1,0,1,0,1,0,0,1,0,0
1,20,103,1,1,0,0,0,0,1,0,0,1,0,0,1,0
2,55,99,0,0,0,1,1,0,0,1,0,0,1,1,0,0
3,37,100,0,1,1,0,0,1,0,0,1,0,0,0,0,1
4,45,101,1,1,1,1,0,0,1,0,0,1,0,0,0,1
