# Data Preprocessing

## Importing the libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [3]:
# Load the raw dataset. Every column except the last is a feature; the last
# column is the target.
df = pd.read_csv('covid_data.csv')
# Take an explicit copy of the feature columns: x is modified in place later
# (x.iloc[...] = ...), and writing into a slice of df triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous whether df changes too.
x = df.iloc[:, :-1].copy()
y = df.iloc[:, -1]

In [5]:
# Show the feature columns (every column of the CSV except the last).
print(x)

     age body_temperature  ... breathing_issue Blood O2 Level in Percentage
0   10.0           Normal  ...              no                         97.0
1   12.0           Normal  ...              no                         97.0
2   15.0           Normal  ...              no                         94.0
3   10.0           Normal  ...              no                         97.0
4   13.0         Moderate  ...              no                         94.0
..   ...              ...  ...             ...                          ...
65  86.0             High  ...             yes                         76.0
66  61.0         Moderate  ...             yes                         90.0
67  94.0             High  ...             yes                         64.0
68  81.0             High  ...             yes                         75.0
69  76.0             High  ...             yes                         80.0

[70 rows x 5 columns]


In [6]:
# Show the target column (the last column of the CSV).
print(y)

0      No
1      No
2      No
3      No
4      No
     ... 
65    Yes
66     No
67    Yes
68    Yes
69    Yes
Name: Needed Hospitalization, Length: 70, dtype: object





## Handling Missing Data

In [6]:
from sklearn.impute import SimpleImputer

# Replace NaN entries with the mean of their own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Columns 0 and 4 are imputed one at a time, in that order, so the imputer
# ends up fitted on column 4.
for first_col in (0, 4):
    col_window = slice(first_col, first_col + 1)
    x.iloc[:, col_window] = imputer.fit_transform(x.iloc[:, col_window])

In [15]:
# Show the features after mean-imputing columns 0 and 4.
print(x)

     age body_temperature chronic_disease breathing_issue  \
0   10.0           Normal              no              no   
1   12.0           Normal              no              no   
2   15.0           Normal              no              no   
3   10.0           Normal              no              no   
4   13.0         Moderate              no              no   
..   ...              ...             ...             ...   
65  86.0             High              no             yes   
66  61.0         Moderate              no             yes   
67  94.0             High             yes             yes   
68  81.0             High             yes             yes   
69  76.0             High             yes             yes   

    Blood O2 Level in Percentage  
0                           97.0  
1                           97.0  
2                           94.0  
3                           97.0  
4                           94.0  
..                           ...  
65                    

## Encoding Categorical Data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the column at position 1. The encoded columns come first in
# the result, followed by all remaining columns passed through unchanged.
# NOTE(review): passthrough columns are not encoded, so any string-valued
# columns among them stay as strings in the resulting array - confirm that
# this is intended before fitting a model on x.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [1])],
    remainder="passthrough",
)
encoded_features = ct.fit_transform(x)
x = np.array(encoded_features)

In [20]:
# Show the feature array after one-hot encoding column 1.
print(x)

[[0.0 0.0 1.0 10.0 'no' 'no' 97.0]
 [0.0 0.0 1.0 12.0 'no' 'no' 97.0]
 [0.0 0.0 1.0 15.0 'no' 'no' 94.0]
 [0.0 0.0 1.0 10.0 'no' 'no' 97.0]
 [0.0 1.0 0.0 13.0 'no' 'no' 94.0]
 [0.0 1.0 0.0 12.0 'no' 'no' 97.0]
 [0.0 1.0 0.0 13.0 'no' 'no' 93.0]
 [0.0 1.0 0.0 15.0 'no' 'no' 92.0]
 [0.0 1.0 0.0 18.0 'no' 'no' 66.0]
 [0.0 0.0 1.0 19.0 'no' 'no' 92.0]
 [0.0 0.0 1.0 20.0 'no' 'no' 93.0]
 [0.0 0.0 1.0 17.0 'no' 'no' 93.0]
 [0.0 0.0 1.0 16.0 'no' 'no' 92.0]
 [0.0 0.0 1.0 18.0 'no' 'no' 93.0]
 [0.0 0.0 1.0 20.0 'no' 'no' 92.0]
 [0.0 1.0 0.0 25.0 'no' 'no' 93.0]
 [0.0 1.0 0.0 24.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 26.0 'no' 'no' 94.0]
 [0.0 0.0 1.0 28.0 'no' 'no' 99.0]
 [0.0 0.0 1.0 29.0 'no' 'no' 93.0]
 [0.0 1.0 0.0 30.0 'no' 'no' 62.0]
 [0.0 0.0 1.0 19.0 'no' 'no' 89.0]
 [0.0 0.0 1.0 25.0 'no' 'yes' 86.0]
 [0.0 0.0 1.0 26.0 'no' 'no' 82.07246376811594]
 [0.0 0.0 1.0 28.0 'no' 'no' 89.0]
 [0.0 1.0 0.0 30.0 'yes' 'no' 86.0]
 [0.0 1.0 0.0 35.0 'no' 'no' 89.0]
 [0.0 1.0 0.0 32.0 'no' 'yes' 84.0]
 [0.

### Encoding independent variables

### Encoding dependent variables

In [23]:
from sklearn.preprocessing import LabelEncoder

# Map the target's class labels to integer codes; the fitted encoder is kept
# in `le`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [24]:
# Show the label-encoded target array.
print(y)

[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1]


## Splitting the data into training and test sets


In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Show the training features produced by the split.
print(x_train)

[[1.0 0.0 0.0 86.0 'yes' 'yes' 71.0]
 [0.0 1.0 0.0 35.0 'no' 'yes' 92.0]
 [1.0 0.0 0.0 75.0 'no' 'yes' 72.0]
 [1.0 0.0 0.0 42.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 74.0 'yes' 'yes' 75.0]
 [1.0 0.0 0.0 62.0 'yes' 'yes' 68.0]
 [0.0 1.0 0.0 24.0 'no' 'no' 92.0]
 [0.0 1.0 0.0 48.0 'no' 'yes' 93.0]
 [1.0 0.0 0.0 58.0 'no' 'no' 70.0]
 [0.0 1.0 0.0 45.130434782608695 'yes' 'no' 90.0]
 [0.0 1.0 0.0 15.0 'no' 'no' 92.0]
 [0.0 0.0 1.0 70.0 'no' 'yes' 88.0]
 [1.0 0.0 0.0 52.0 'yes' 'yes' 80.0]
 [1.0 0.0 0.0 60.0 'yes' 'yes' 68.0]
 [1.0 0.0 0.0 54.0 'yes' 'yes' 70.0]
 [0.0 0.0 1.0 29.0 'no' 'no' 93.0]
 [1.0 0.0 0.0 78.0 'yes' 'yes' 71.0]
 [1.0 0.0 0.0 36.0 'yes' 'no' 90.0]
 [0.0 1.0 0.0 30.0 'yes' 'no' 86.0]
 [1.0 0.0 0.0 50.0 'yes' 'yes' 77.0]
 [0.0 0.0 1.0 18.0 'no' 'no' 93.0]
 [1.0 0.0 0.0 69.0 'no' 'yes' 53.0]
 [0.0 0.0 1.0 10.0 'no' 'no' 97.0]
 [1.0 0.0 0.0 26.0 'no' 'no' 94.0]
 [0.0 1.0 0.0 18.0 'no' 'no' 66.0]
 [1.0 0.0 0.0 62.0 'yes' 'yes' 69.0]
 [0.0 1.0 0.0 13.0 'no' 'no' 93.0]
 [1.0 0.0 0.0 4

In [27]:
# Show the training labels produced by the split.
print(y_train)

[1 0 1 0 1 1 0 0 1 0 0 0 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0 0 1 1 1 0 0 1 0 1 0
 1 1 0 0 0 1 1 0 1 1 0 1 0 0 0 1 0 0 1]


In [28]:
# Show the test features produced by the split.
print(x_test)

[[0.0 0.0 1.0 25.0 'no' 'yes' 86.0]
 [0.0 0.0 1.0 10.0 'no' 'no' 97.0]
 [1.0 0.0 0.0 68.0 'yes' 'no' 67.0]
 [0.0 1.0 0.0 13.0 'no' 'no' 94.0]
 [1.0 0.0 0.0 59.0 'yes' 'no' 68.0]
 [0.0 0.0 1.0 28.0 'no' 'no' 99.0]
 [0.0 0.0 1.0 20.0 'no' 'no' 93.0]
 [1.0 0.0 0.0 46.0 'yes' 'no' 91.0]
 [1.0 0.0 0.0 53.0 'yes' 'no' 55.0]
 [0.0 0.0 1.0 16.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 38.0 'no' 'no' 75.0]
 [0.0 0.0 1.0 19.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 94.0 'yes' 'yes' 64.0]
 [0.0 1.0 0.0 12.0 'no' 'no' 97.0]]


In [29]:
# Show the test labels produced by the split.
print(y_test)

[0 0 1 0 1 0 0 0 1 0 1 0 1 0]


## Feature Scaling

In [30]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric columns (index 6 onward, then index 3).
# The scaler is fitted on the TRAINING data only and the same fitted
# mean/std is then applied to the test data. Re-fitting on the test set
# (fit_transform on x_test) would put train and test on different scales and
# leak test-set statistics into preprocessing.
sc = StandardScaler()
for cols in (slice(6, None), slice(3, 4)):
    x_train[:, cols] = sc.fit_transform(x_train[:, cols])
    x_test[:, cols] = sc.transform(x_test[:, cols])
# After the loop `sc` remains fitted on training column 3, as before.

In [31]:
# Show the training features after scaling column 3 and columns 6 onward.
print(x_train)

[[1.0 0.0 0.0 1.660595032652111 'yes' 'yes' -0.9941858494300261]
 [0.0 1.0 0.0 -0.5372483818439074 'no' 'yes' 0.9574354145081887]
 [1.0 0.0 0.0 1.1865503746235582 'no' 'yes' -0.9012515035282064]
 [1.0 0.0 0.0 -0.23558359946210095 'no' 'no' 0.9574354145081887]
 [1.0 0.0 0.0 1.1434554057118715 'yes' 'yes' -0.6224484658227472]
 [1.0 0.0 0.0 0.6263157787716318 'yes' 'yes' -1.2729888871354853]
 [0.0 1.0 0.0 -1.0112930398724604 'no' 'no' 0.9574354145081887]
 [0.0 1.0 0.0 0.022986214008018892 'no' 'yes' 1.0503697604100084]
 [1.0 0.0 0.0 0.4539359031248853 'no' 'no' -1.087120195331846]
 [0.0 1.0 0.0 -0.10067760982551668 'yes' 'no' 0.7715667227045492]
 [0.0 1.0 0.0 -1.39914776007764 'no' 'no' 0.9574354145081887]
 [0.0 0.0 1.0 0.9710755300651249 'no' 'yes' 0.5856980309009097]
 [1.0 0.0 0.0 0.19536608965476546 'yes' 'yes' -0.15777673631364833]
 [1.0 0.0 0.0 0.5401258409482586 'yes' 'yes' -1.2729888871354853]
 [1.0 0.0 0.0 0.28155602747813874 'yes' 'yes' -1.087120195331846]
 [0.0 0.0 1.0 -0.795818