In [1]:
import pandas as pd
import seaborn as sns

In [2]:
df = pd.read_csv("Data-sets\covid_toy.csv")

In [3]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
df.describe()

Unnamed: 0,age,fever
count,100.0,90.0
mean,44.22,100.844444
std,24.878931,2.054926
min,5.0,98.0
25%,20.0,99.0
50%,45.0,101.0
75%,66.5,102.75
max,84.0,104.0


In [5]:
# We will apply column transformer
# 1.Gender,city -> Nominal Encoder
# 2.Cough  -> Ordinal Encoder(OHE)
# 3.has_covid -> Label Encoder
# 4.age -> STANDRADIZATION

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x=df.iloc[:,0:5]
y=df.iloc[:,5:]

In [8]:
x

Unnamed: 0,age,gender,fever,cough,city
0,60,Male,103.0,Mild,Kolkata
1,27,Male,100.0,Mild,Delhi
2,42,Male,101.0,Mild,Delhi
3,31,Female,98.0,Mild,Kolkata
4,65,Female,101.0,Mild,Mumbai
...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore
96,51,Female,101.0,Strong,Kolkata
97,20,Female,101.0,Mild,Bangalore
98,5,Female,98.0,Strong,Mumbai


In [9]:
y

Unnamed: 0,has_covid
0,No
1,Yes
2,No
3,No
4,No
...,...
95,No
96,Yes
97,No
98,No


In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [11]:
x_train

Unnamed: 0,age,gender,fever,cough,city
26,19,Female,100.0,Mild,Kolkata
21,73,Male,98.0,Mild,Bangalore
24,13,Female,100.0,Strong,Kolkata
37,55,Male,100.0,Mild,Kolkata
12,25,Female,99.0,Strong,Kolkata
...,...,...,...,...,...
76,80,Male,100.0,Mild,Bangalore
87,47,Male,101.0,Strong,Bangalore
43,22,Female,99.0,Mild,Bangalore
29,34,Female,,Strong,Mumbai


In [12]:
y_train

Unnamed: 0,has_covid
26,Yes
21,Yes
24,No
37,No
12,No
...,...
76,Yes
87,No
43,Yes
29,Yes


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [14]:
df['cough'].unique()

array(['Mild', 'Strong'], dtype=object)

In [15]:
df['city'].unique()

array(['Kolkata', 'Delhi', 'Mumbai', 'Bangalore'], dtype=object)

In [16]:
# We will apply column transformer
# 1.Gender,city -> Nominal Encoder
# 2.Cough  -> Ordinal Encoder(OHE)
# 3.has_covid -> Label Encoder
# 4.age -> normalization

In [17]:
x_train

Unnamed: 0,age,gender,fever,cough,city
26,19,Female,100.0,Mild,Kolkata
21,73,Male,98.0,Mild,Bangalore
24,13,Female,100.0,Strong,Kolkata
37,55,Male,100.0,Mild,Kolkata
12,25,Female,99.0,Strong,Kolkata
...,...,...,...,...,...
76,80,Male,100.0,Mild,Bangalore
87,47,Male,101.0,Strong,Bangalore
43,22,Female,99.0,Mild,Bangalore
29,34,Female,,Strong,Mumbai


### Column transfor for X_train and X_test

In [18]:
# ct = ColumnTransformer(transformers=[
#     ('cough_ordinal_encoder',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
#     ('ohe',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
    
#     ],remainder='passthrough'
# )

In [19]:
ct = ColumnTransformer(transformers=[
    ('ohe',OneHotEncoder(sparse=False,drop='first'),['gender','city']),
    # ('le',LabelEncoder(),['has_covid']),
    ('ne',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('scaler',StandardScaler(),['age'])
],remainder='passthrough')

In [20]:
ct.fit(x_train)



In [21]:
ct.transform(x_train).shape

(80, 7)

In [22]:
ans_xtrain=ct.transform(x_train)
ans_xtest=ct.transform(x_test)

In [23]:
ans_xtrain

array([[ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -1.01980335e+00,
         1.00000000e+02],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.11673907e+00,
         9.80000000e+01],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00, -1.25719695e+00,
         1.00000000e+02],
       [ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  4.04558262e-01,
         1.00000000e+02],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00, -7.82409745e-01,
         9.90000000e+01],
       [ 1.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -1.09794541e-01,
         1.01000000e+02],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.07717347e+00,
         9.9000000

In [24]:
ans_xtest

array([[  1.        ,   0.        ,   0.        ,   1.        ,
          0.        ,  -0.10979454, 104.        ],
       [  0.        ,   1.        ,   0.        ,   0.        ,
          0.        ,   0.76064866, 101.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   1.47282947, 102.        ],
       [  1.        ,   0.        ,   0.        ,   0.        ,
          0.        ,  -1.37589375, 100.        ],
       [  0.        ,   0.        ,   0.        ,   1.        ,
          0.        ,   0.80021426,  98.        ],
       [  0.        ,   1.        ,   0.        ,   0.        ,
          1.        ,  -0.18892574,  98.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
          1.        ,  -0.82197535, 102.        ],
       [  0.        ,   1.        ,   0.        ,   0.        ,
          0.        ,   0.16716466, 101.        ],
       [  1.        ,   0.        ,   1.        ,   0.        ,
          0.    

In [25]:
x_train_df=pd.DataFrame(ans_xtrain,columns=[['gender_m','city_delhi',"city_kolkata",'city_mumbai','cough','age','fever']])
x_test_df=pd.DataFrame(ans_xtest,columns=[['gender_m','city_delhi',"city_kolkata",'city_mumbai','cough','age','fever']])

In [26]:
x_train

Unnamed: 0,age,gender,fever,cough,city
26,19,Female,100.0,Mild,Kolkata
21,73,Male,98.0,Mild,Bangalore
24,13,Female,100.0,Strong,Kolkata
37,55,Male,100.0,Mild,Kolkata
12,25,Female,99.0,Strong,Kolkata
...,...,...,...,...,...
76,80,Male,100.0,Mild,Bangalore
87,47,Male,101.0,Strong,Bangalore
43,22,Female,99.0,Mild,Bangalore
29,34,Female,,Strong,Mumbai


In [27]:
x_train_df

Unnamed: 0,gender_m,city_delhi,city_kolkata,city_mumbai,cough,age,fever
0,0.0,0.0,1.0,0.0,0.0,-1.019803,100.0
1,1.0,0.0,0.0,0.0,0.0,1.116739,98.0
2,0.0,0.0,1.0,0.0,1.0,-1.257197,100.0
3,1.0,0.0,1.0,0.0,0.0,0.404558,100.0
4,0.0,0.0,1.0,0.0,1.0,-0.782410,99.0
...,...,...,...,...,...,...,...
75,1.0,0.0,0.0,0.0,0.0,1.393698,100.0
76,1.0,0.0,0.0,0.0,1.0,0.088033,101.0
77,0.0,0.0,0.0,0.0,0.0,-0.901107,99.0
78,0.0,0.0,0.0,1.0,1.0,-0.426319,


In [28]:
x_train_df.describe()

Unnamed: 0,gender_m,city_delhi,city_kolkata,city_mumbai,cough,age,fever
count,80.0,80.0,80.0,80.0,80.0,80.0,71.0
mean,0.4125,0.2125,0.3375,0.1625,0.375,4.4408920000000007e-17,100.774648
std,0.49539,0.411658,0.47584,0.371236,0.487177,1.006309,2.064649
min,0.0,0.0,0.0,0.0,0.0,-1.573722,98.0
25%,0.0,0.0,0.0,0.0,0.0,-0.9208893,99.0
50%,0.0,0.0,0.0,0.0,0.0,0.00890226,101.0
75%,1.0,0.0,1.0,0.0,1.0,0.9584767,103.0
max,1.0,1.0,1.0,1.0,1.0,1.551961,104.0


In [29]:
x_test

Unnamed: 0,age,gender,fever,cough,city
64,42,Male,104.0,Mild,Mumbai
9,64,Female,101.0,Mild,Delhi
35,82,Female,102.0,Strong,Bangalore
63,10,Male,100.0,Mild,Bangalore
11,65,Female,98.0,Mild,Mumbai
17,40,Female,98.0,Strong,Delhi
60,24,Female,102.0,Strong,Bangalore
38,49,Female,101.0,Mild,Delhi
75,5,Male,102.0,Mild,Kolkata
97,20,Female,101.0,Mild,Bangalore


In [30]:
x_test_df

Unnamed: 0,gender_m,city_delhi,city_kolkata,city_mumbai,cough,age,fever
0,1.0,0.0,0.0,1.0,0.0,-0.109795,104.0
1,0.0,1.0,0.0,0.0,0.0,0.760649,101.0
2,0.0,0.0,0.0,0.0,1.0,1.472829,102.0
3,1.0,0.0,0.0,0.0,0.0,-1.375894,100.0
4,0.0,0.0,0.0,1.0,0.0,0.800214,98.0
5,0.0,1.0,0.0,0.0,1.0,-0.188926,98.0
6,0.0,0.0,0.0,0.0,1.0,-0.821975,102.0
7,0.0,1.0,0.0,0.0,0.0,0.167165,101.0
8,1.0,0.0,1.0,0.0,0.0,-1.573722,102.0
9,0.0,0.0,0.0,0.0,0.0,-0.980238,101.0
