# Data Preprocessing

## Importing the libraries

In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Importing Dataset

In [12]:
# Read the raw spreadsheet into a DataFrame (path is relative to the working directory).
dataset = pd.read_excel("Covid_Data_new.xlsx")

In [35]:
# Positional split of the frame: every column except the last becomes the
# feature matrix X, and the last column becomes the target vector y.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values




## Handling Missing Data

In [36]:
# Mean imputer: each NaN in a fitted column is replaced by that column's mean.
# SimpleImputer is imported in the notebook's top import cell.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

In [37]:
# In our data, columns 0 and 4 contain missing numeric values.

In [38]:
# Fill the missing values in columns 0 and 4, one column at a time.
# The col:col + 1 slice keeps each column two-dimensional, and the result is
# written back into X in place. After the loop `imputer` is fitted on column 4.
for col in (0, 4):
    X[:, col:col + 1] = imputer.fit_transform(X[:, col:col + 1])

In [39]:
# Spot-check row 28 after imputation; its column-0 value is presumably one of
# the entries that was missing and has now been filled with the column mean.
print(X[28])

[45.130434782608695 'Moderate' 'yes' 'no' 90.0]


In [40]:
# Spot-check row 23 after imputation; its column-4 value is presumably one of
# the entries that was missing and has now been filled with the column mean.
print(X[23])


[26.0 'Normal' 'no' 'no' 82.07246376811594]


## Encoding Categorical Data

### Encoding independent variables

In [41]:
# One-hot encode the categorical column at index 1. drop="first" omits the
# dummy column for the first category; every other column passes through unchanged.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(drop="first"), [1])],
    remainder="passthrough",
)

In [42]:
# Fit the column transformer and apply it to X. The one-hot columns are placed
# first, followed by the passthrough columns, so the original column indices shift.
# NOTE: this rebinds X, so re-running the cell would encode already-encoded data.
X = np.array(ct.fit_transform(X))

In [43]:
# Replace the string labels in column 3 with integer codes.
# `le` stays fitted on column 3's labels; the following cell reuses it for column 4.
le = LabelEncoder()
le.fit(X[:, 3])
X[:, 3] = le.transform(X[:, 3])

In [45]:
# Encode column 4 with the encoder already fitted on column 3, so both columns
# share the same label -> integer mapping.
# NOTE(review): transform raises ValueError if column 4 contains a label that
# column 3 did not, and this cell only works while `le` is still the encoder
# fitted on column 3 (it is rebound later for the target).
X[:,4] = le.transform(X[:,4])

In [46]:
# Display the feature matrix after one-hot and label encoding.
X

array([[0.0, 1.0, 10.0, 0, 0, 97.0],
       [0.0, 1.0, 12.0, 0, 0, 97.0],
       [0.0, 1.0, 15.0, 0, 0, 94.0],
       [0.0, 1.0, 10.0, 0, 0, 97.0],
       [1.0, 0.0, 13.0, 0, 0, 94.0],
       [1.0, 0.0, 12.0, 0, 0, 97.0],
       [1.0, 0.0, 13.0, 0, 0, 93.0],
       [1.0, 0.0, 15.0, 0, 0, 92.0],
       [1.0, 0.0, 18.0, 0, 0, 66.0],
       [0.0, 1.0, 19.0, 0, 0, 92.0],
       [0.0, 1.0, 20.0, 0, 0, 93.0],
       [0.0, 1.0, 17.0, 0, 0, 93.0],
       [0.0, 1.0, 16.0, 0, 0, 92.0],
       [0.0, 1.0, 18.0, 0, 0, 93.0],
       [0.0, 1.0, 20.0, 0, 0, 92.0],
       [1.0, 0.0, 25.0, 0, 0, 93.0],
       [1.0, 0.0, 24.0, 0, 0, 92.0],
       [0.0, 0.0, 26.0, 0, 0, 94.0],
       [0.0, 1.0, 28.0, 0, 0, 99.0],
       [0.0, 1.0, 29.0, 0, 0, 93.0],
       [1.0, 0.0, 30.0, 0, 0, 62.0],
       [0.0, 1.0, 19.0, 0, 0, 89.0],
       [0.0, 1.0, 25.0, 0, 1, 86.0],
       [0.0, 1.0, 26.0, 0, 0, 82.07246376811594],
       [0.0, 1.0, 28.0, 0, 0, 89.0],
       [1.0, 0.0, 30.0, 1, 0, 86.0],
       [1.0, 0.0, 35.0, 0

### Encoding the dependent variable

In [47]:
# Encode the target labels as integers with a fresh encoder.
# This rebinds `le`, so it no longer holds the mapping used for the feature columns.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [48]:
# Display the integer-encoded target vector.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1])

## Feature Scaling

In [51]:
# Standardise column 5 to zero mean and unit variance, in place; the other
# columns are left as they are. StandardScaler is imported in the top import cell.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling. Consider fitting
# it on the training rows only and applying transform() to the test rows.
sc = StandardScaler()
X[:, 5:6] = sc.fit_transform(X[:, 5:6])

In [52]:
# Display the feature matrix after scaling; only the last column has changed.
X

array([[0.0, 1.0, 10.0, 0, 0, 1.2941982275215735],
       [0.0, 1.0, 12.0, 0, 0, 1.2941982275215735],
       [0.0, 1.0, 15.0, 0, 0, 1.0341020788837427],
       [0.0, 1.0, 10.0, 0, 0, 1.2941982275215735],
       [1.0, 0.0, 13.0, 0, 0, 1.0341020788837427],
       [1.0, 0.0, 12.0, 0, 0, 1.2941982275215735],
       [1.0, 0.0, 13.0, 0, 0, 0.9474033626711326],
       [1.0, 0.0, 15.0, 0, 0, 0.8607046464585223],
       [1.0, 0.0, 18.0, 0, 0, -1.393461975069344],
       [0.0, 1.0, 19.0, 0, 0, 0.8607046464585223],
       [0.0, 1.0, 20.0, 0, 0, 0.9474033626711326],
       [0.0, 1.0, 17.0, 0, 0, 0.9474033626711326],
       [0.0, 1.0, 16.0, 0, 0, 0.8607046464585223],
       [0.0, 1.0, 18.0, 0, 0, 0.9474033626711326],
       [0.0, 1.0, 20.0, 0, 0, 0.8607046464585223],
       [1.0, 0.0, 25.0, 0, 0, 0.9474033626711326],
       [1.0, 0.0, 24.0, 0, 0, 0.8607046464585223],
       [0.0, 0.0, 26.0, 0, 0, 1.0341020788837427],
       [0.0, 1.0, 28.0, 0, 0, 1.4675956599467939],
       [0.0, 1.0, 29.0, 0, 0, 0

## Splitting the data into training and test sets


In [53]:
# Hold out 30% of the rows as the test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
# train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21
)

In [55]:
# Sanity check: (rows, columns) of the training features after the 70/30 split.
X_train.shape

(49, 6)

In [56]:
# Sanity check: the training target must have one entry per training row.
y_train.shape

(49,)