In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load the toy COVID dataset. A forward slash is used as the separator because
# "\c" in a regular string literal is an invalid escape sequence (a
# SyntaxWarning on recent Python versions) and a backslash is only a path
# separator on Windows; "/" works on every platform, Windows included.
df = pd.read_csv("Data-sets/covid_toy.csv")

In [3]:
# Preview the first five rows to check the column names and value formats.
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Summary statistics for the numeric columns (age, fever).
# In the recorded output fever's count (90) is lower than age's (100),
# i.e. the fever column contains missing values.
df.describe()

Unnamed: 0,age,fever
count,100.0,90.0
mean,44.22,100.844444
std,24.878931,2.054926
min,5.0,98.0
25%,20.0,99.0
50%,45.0,101.0
75%,66.5,102.75
max,84.0,104.0


In [5]:
# Preprocessing plan (applied with a ColumnTransformer):
# 1. gender, city -> one-hot encoding (nominal categories)
# 2. cough        -> ordinal encoding (Mild < Strong)
# 3. has_covid    -> label encoding (target column)
# 4. age          -> standardization (StandardScaler)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Split the frame positionally: the first five columns are the features,
# the last column is the target.
x, y = df.iloc[:, :5], df.iloc[:, -1]

In [8]:
# Display the feature frame (pandas truncates the middle rows in the output).
x

Unnamed: 0,age,gender,fever,cough,city
0,60,Male,103.0,Mild,Kolkata
1,27,Male,100.0,Mild,Delhi
2,42,Male,101.0,Mild,Delhi
3,31,Female,98.0,Mild,Kolkata
4,65,Female,101.0,Mild,Mumbai
...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore
96,51,Female,101.0,Strong,Kolkata
97,20,Female,101.0,Mild,Bangalore
98,5,Female,98.0,Strong,Mumbai


In [9]:
# Display the target Series (has_covid, Yes/No labels).
y

0      No
1     Yes
2      No
3      No
4      No
     ... 
95     No
96    Yes
97     No
98     No
99    Yes
Name: has_covid, Length: 100, dtype: object

In [10]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so re-running the notebook yields the same train/test rows
# (and therefore the same fitted scaler statistics downstream).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [11]:
# Inspect the training features returned by the split.
x_train

Unnamed: 0,age,gender,fever,cough,city
95,12,Female,104.0,Mild,Bangalore
33,26,Female,98.0,Mild,Kolkata
92,82,Female,102.0,Strong,Kolkata
24,13,Female,100.0,Strong,Kolkata
35,82,Female,102.0,Strong,Bangalore
...,...,...,...,...,...
38,49,Female,101.0,Mild,Delhi
34,74,Male,102.0,Mild,Mumbai
11,65,Female,98.0,Mild,Mumbai
93,27,Male,100.0,Mild,Kolkata


In [12]:
# Inspect the training labels; their index lines up with x_train's index.
y_train

95     No
33     No
92     No
24     No
35     No
     ... 
38    Yes
34    Yes
11    Yes
93    Yes
50    Yes
Name: has_covid, Length: 80, dtype: object

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [14]:
# List the distinct cough levels; these are the categories passed to the ordinal encoder.
df.loc[:, 'cough'].unique()

array(['Mild', 'Strong'], dtype=object)

In [15]:
# List the distinct cities; this column is one-hot encoded later.
df.loc[:, 'city'].unique()

array(['Kolkata', 'Delhi', 'Mumbai', 'Bangalore'], dtype=object)

In [16]:
# Preprocessing plan (applied with a ColumnTransformer):
# 1. gender, city -> one-hot encoding (nominal categories)
# 2. cough        -> ordinal encoding (Mild < Strong)
# 3. has_covid    -> label encoding (target column)
# 4. age          -> standardization (StandardScaler)

In [36]:
# Show the training features again (same frame as displayed after the split).
x_train

Unnamed: 0,age,gender,fever,cough,city
95,12,Female,104.0,Mild,Bangalore
33,26,Female,98.0,Mild,Kolkata
92,82,Female,102.0,Strong,Kolkata
24,13,Female,100.0,Strong,Kolkata
35,82,Female,102.0,Strong,Bangalore
...,...,...,...,...,...
38,49,Female,101.0,Mild,Delhi
34,74,Male,102.0,Mild,Mumbai
11,65,Female,98.0,Mild,Mumbai
93,27,Male,100.0,Mild,Kolkata


In [18]:
# ct = ColumnTransformer(transformers=[
#     ('cough_ordinal_encoder',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
#     ('ohe',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
    
#     ],remainder='passthrough'
# )

In [27]:
# Build the feature preprocessor. Output columns follow the transformer order
# below, with the untouched remainder columns appended last:
#   gender (1 col) | city (3 cols) | cough (1 col) | age (1 col) | fever (1 col)
ct = ColumnTransformer(
    transformers=[
        # Nominal features: one-hot encode, dropping the first category of each
        # feature so no redundant indicator column is produced.
        # The encoder keeps its default output format; dense output is requested
        # on the ColumnTransformer instead, because the encoder's own keyword for
        # it is version-dependent (`sparse` was renamed to `sparse_output` in
        # scikit-learn 1.2 and removed in 1.4).
        ('ohe', OneHotEncoder(drop='first'), ['gender', 'city']),
        # The target `has_covid` is not a column of x, and LabelEncoder is meant
        # for target vectors, so it is not used as a transformer here.
        # Ordered feature: 'Mild' -> 0, 'Strong' -> 1.
        ('ne', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # Standardize age (zero mean, unit variance on the fitted data).
        ('scaler', StandardScaler(), ['age']),
    ],
    # `fever` falls through unchanged. NOTE(review): df.describe() reports only
    # 90 non-null fever values out of 100, so missing values pass through
    # as-is -- TODO: consider imputing before modelling.
    remainder='passthrough',
    # Always return a dense ndarray, whatever the individual transformers emit.
    sparse_threshold=0,
)

In [28]:
# Fit the transformers on the training features only; the test split is not used here.
ct.fit(x_train)



In [29]:
# Transform the training features and check the output shape (rows, encoded columns).
ct.transform(x_train).shape

(80, 7)

In [33]:
# Keep the transformed training features (displayed below as a NumPy array) for inspection.
ans=ct.transform(x_train)

In [34]:
# Display the raw transformed array.
ans

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -1.37705578e+00,
         1.04000000e+02],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -8.04135099e-01,
         9.80000000e+01],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  1.48754763e+00,
         1.02000000e+02],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00, -1.33613287e+00,
         1.00000000e+02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  1.48754763e+00,
         1.02000000e+02],
       [ 1.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -7.63212193e-01,
         1.00000000e+02],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  8.32781133e-01,
         9.9000000

In [32]:
# Wrap the transformed array in a DataFrame for a tabular view
# (columns are labelled positionally, 0..n-1).
pd.DataFrame(data=ans)

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.0,0.0,0.0,-1.377056,104.0
1,0.0,0.0,1.0,0.0,0.0,-0.804135,98.0
2,0.0,0.0,1.0,0.0,1.0,1.487548,102.0
3,0.0,0.0,1.0,0.0,1.0,-1.336133,100.0
4,0.0,0.0,0.0,0.0,1.0,1.487548,102.0
...,...,...,...,...,...,...,...
75,0.0,1.0,0.0,0.0,0.0,0.137092,101.0
76,1.0,0.0,0.0,1.0,0.0,1.160164,102.0
77,0.0,0.0,0.0,1.0,0.0,0.791858,98.0
78,1.0,0.0,1.0,0.0,0.0,-0.763212,100.0
