# Data Preprocessing

## Importing the libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the dataset

In [3]:
# Read the spreadsheet into a DataFrame and preview its first rows.
DATA_FILE = 'Covid_Data_new.xlsx'
df = pd.read_excel(DATA_FILE)
df.head()

Unnamed: 0,age,body_temperature,chronic_disease,breathing_issue,Blood O2 Level in Percentage,Needed Hospitalization
0,10.0,Normal,no,no,97.0,No
1,12.0,Normal,no,no,97.0,No
2,15.0,Normal,no,no,94.0,No
3,10.0,Normal,no,no,97.0,No
4,13.0,Moderate,no,no,94.0,No


In [4]:
# Show column dtypes and non-null counts to reveal which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           69 non-null     float64
 1   body_temperature              70 non-null     object 
 2   chronic_disease               70 non-null     object 
 3   breathing_issue               70 non-null     object 
 4   Blood O2 Level in Percentage  69 non-null     float64
 5   Needed Hospitalization        70 non-null     object 
dtypes: float64(2), object(4)
memory usage: 3.4+ KB





## Handling Missing Data

In [5]:
# Drop every row that contains at least one missing value.
# Reassigning the result (instead of inplace=True) leaves the previously
# loaded frame object untouched and keeps the step chainable.
df = df.dropna()
# Re-check the non-null counts to confirm no gaps remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68 entries, 0 to 69
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           68 non-null     float64
 1   body_temperature              68 non-null     object 
 2   chronic_disease               68 non-null     object 
 3   breathing_issue               68 non-null     object 
 4   Blood O2 Level in Percentage  68 non-null     float64
 5   Needed Hospitalization        68 non-null     object 
dtypes: float64(2), object(4)
memory usage: 3.7+ KB


## Encoding Categorical Data and Scaling Numeric Columns

### Encoding independent variables

In [63]:
# Feature matrix: every column except the last one, which holds the target.
X = df.iloc[:, :-1]
X

Unnamed: 0,age,body_temperature,chronic_disease,breathing_issue,Blood O2 Level in Percentage
0,10.0,Normal,no,no,97.0
1,12.0,Normal,no,no,97.0
2,15.0,Normal,no,no,94.0
3,10.0,Normal,no,no,97.0
4,13.0,Moderate,no,no,94.0
...,...,...,...,...,...
65,86.0,High,no,yes,76.0
66,61.0,Moderate,no,yes,90.0
67,94.0,High,yes,yes,64.0
68,81.0,High,yes,yes,75.0


In [72]:
# Count how often each body_temperature category occurs in the features.
X['body_temperature'].value_counts()

body_temperature
High        35
Moderate    17
Normal      16
Name: count, dtype: int64

In [84]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler

# body_temperature is ordinal, so its category order is spelled out explicitly.
temperature_encoder = OrdinalEncoder(
    categories=[['Normal', 'Moderate', 'High']], dtype='int32')

# Per-column preprocessing:
#   tnf1 - ordinal-encode body_temperature with the fixed order above
#   tnf2 - ordinal-encode the two other categorical columns (categories are
#          inferred from the data at fit time)
#   tnf3 - standardise the numeric columns
# Any column not listed in a transformer is passed through unchanged.
trans = ColumnTransformer(
    transformers=[
        ('tnf1', temperature_encoder, ['body_temperature']),
        ('tnf2', OrdinalEncoder(), ['chronic_disease', 'breathing_issue']),
        ('tnf3', StandardScaler(), ['age', 'Blood O2 Level in Percentage']),
    ],
    remainder='passthrough',
)

In [85]:
# Fit the column transformer on the feature frame and transform it in one step.
# Output columns follow the transformer order: body_temperature,
# chronic_disease, breathing_issue, then the scaled age and blood O2 level.
X_encoded = trans.fit_transform(X)
# Preview the first five transformed rows.
X_encoded[:5]

array([[ 0.        ,  0.        ,  0.        , -1.46870994,  1.28996193],
       [ 0.        ,  0.        ,  0.        , -1.38575954,  1.28996193],
       [ 0.        ,  0.        ,  0.        , -1.26133395,  1.03272612],
       [ 0.        ,  0.        ,  0.        , -1.46870994,  1.28996193],
       [ 1.        ,  0.        ,  0.        , -1.34428435,  1.03272612]])

### Encoding the dependent variable

In [79]:
# The target is the last column of the frame.
y = df.iloc[:, -1]
# Learn the label-to-integer mapping first, then apply it to the target.
target_enc = LabelEncoder().fit(y)
y_encoded = target_enc.transform(y)
y_encoded

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1])

## Splitting the data into training and test sets


In [86]:
from sklearn.model_selection import train_test_split

# Split the *raw* features first and fit the preprocessing afterwards.
# Fitting the scaler/encoders on all rows before splitting lets statistics of
# the test rows leak into the training features (data leakage).
# The seed and test_size are unchanged, so the same rows end up in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.33, random_state=42)

# Learn scaling parameters and category mappings from the training rows only
# (this refits `trans`), then apply them unchanged to the held-out rows.
X_train = trans.fit_transform(X_train_raw)
X_test = trans.transform(X_test_raw)