In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the toy COVID dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('datasets/covid_toy.csv')

In [3]:
# Preview the first three rows to see the column layout and value formats.
df.head(3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [4]:
# (rows, columns) of the raw dataset.
df.shape

(100, 6)

In [5]:
# Dtypes and non-null counts per column; this is where missing values (e.g. in `fever`) show up.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [6]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows for testing. `has_covid` is the target; every other
# column is a feature. `random_state` pins the shuffle so that Restart & Run All
# reproduces the same split instead of a new random one on every execution.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='has_covid'),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Non-null counts in the training split; gaps in `fever` are what the imputer has to fill.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 70 to 69
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     80 non-null     int64  
 1   gender  80 non-null     object 
 2   fever   74 non-null     float64
 3   cough   80 non-null     object 
 4   city    80 non-null     object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.8+ KB


In [9]:
# Same check for the test split, which can also contain missing `fever` values.
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 93 to 26
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     20 non-null     int64  
 1   gender  20 non-null     object 
 2   fever   16 non-null     float64
 3   cough   20 non-null     object 
 4   city    20 non-null     object 
dtypes: float64(1), int64(1), object(3)
memory usage: 960.0+ bytes


In [10]:
# Peek at the training features; the index keeps the original row labels after shuffling.
x_train.head(2)

Unnamed: 0,age,gender,fever,cough,city
70,68,Female,101.0,Strong,Delhi
3,31,Female,98.0,Mild,Kolkata


In [11]:
# Sanity check: the training split should hold 80% of the rows and all feature columns.
x_train.shape

(80, 5)

In [12]:
# Sanity check: the test split should hold the remaining 20% of the rows.
x_test.shape

(20, 5)

In [13]:
# Fill missing `fever` readings with the column mean ('mean' is SimpleImputer's default strategy).
si = SimpleImputer(strategy='mean')

# Learn the fill value from the training split only, then apply that same
# value to both splits so nothing from the test rows influences it.
si.fit(x_train[['fever']])
x_train_fever = si.transform(x_train[['fever']])
x_test_fever = si.transform(x_test[['fever']])

In [14]:
# The imputer returns a 2-D array with one column, ready for horizontal concatenation.
x_train_fever.shape

(80, 1)

In [15]:
# `cough` is ordinal, so the category order is given explicitly: 'Mild' -> 0, 'Strong' -> 1.
ode = OrdinalEncoder(categories=[['Mild','Strong']])
# Fit on the training split, then reuse the fitted encoder on the test split.
x_train_cough = ode.fit_transform(x_train[['cough']])
x_test_cough = ode.transform(x_test[['cough']])

In [17]:
# Both encoded splits should be single-column 2-D arrays (train shape first, then test).
print(x_train_cough.shape, x_test_cough.shape, sep='\n')

(80, 1)
(20, 1)


In [18]:
# One-hot encode the nominal columns. drop='first' removes one dummy per feature
# so the remaining indicator columns are not perfectly collinear.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output` and later
# releases removed the old keyword; switch it when upgrading. It is kept as-is
# here to match the environment this notebook was written against.
ohe = OneHotEncoder(drop='first',sparse=False)

# Learn the category set from the training split only.
x_train_gender_city = ohe.fit_transform(x_train[['gender','city']])
# Reuse the fitted encoder on the test split; do NOT re-fit here. Re-fitting would
# derive the categories from the test rows alone, so a city absent from the small
# test split would change the number and meaning of the output columns and the
# test matrix would no longer line up with the training matrix.
x_test_gender_city = ohe.transform(x_test[['gender','city']])

print(x_train_gender_city.shape)
print(x_test_gender_city.shape)

In [23]:
# `age` needs no preprocessing: drop every column handled by an encoder/imputer
# and keep whatever is left as a 2-D NumPy array.
x_train_age = x_train.drop(columns=['gender', 'city', 'fever', 'cough']).values
x_test_age = x_test.drop(columns=['gender', 'city', 'fever', 'cough']).values

In [24]:
# Inspect the untouched age values of the test split (2-D, one column).
x_test_age

array([[27],
       [82],
       [81],
       [34],
       [51],
       [80],
       [38],
       [51],
       [12],
       [64],
       [15],
       [19],
       [81],
       [49],
       [11],
       [54],
       [23],
       [74],
       [82],
       [19]])

In [28]:
# Stitch the separately processed pieces back together column-wise.
# Resulting column order: age, cough, fever, then the gender/city one-hot columns.
x_train_final = np.concatenate(
    [x_train_age, x_train_cough, x_train_fever, x_train_gender_city],
    axis=1,
)
x_test_final = np.concatenate(
    [x_test_age, x_test_cough, x_test_fever, x_test_gender_city],
    axis=1,
)

In [29]:
# Train and test matrices must have the same number of columns (train shape first, then test).
print(x_train_final.shape, x_test_final.shape, sep='\n')

(80, 7)
(20, 7)


 # Using ColumnTransformer

In [30]:
from sklearn.compose import ColumnTransformer

In [32]:
# Bundle the three preprocessing steps into one transformer, each applied to its own columns:
#   tnf1 - impute missing `fever` values
#   tnf2 - ordinal-encode `cough` ('Mild' -> 0, 'Strong' -> 1)
#   tnf3 - one-hot encode `gender` and `city`, dropping the first category of each
# remainder='passthrough' keeps the unlisted column (`age`) unchanged.
# Output columns follow the transformer order, with passthrough columns appended
# last: fever, cough, gender/city one-hots, age. That differs from the column
# order of the manual concatenation earlier in this notebook.
tnf = ColumnTransformer(
    [
        ('tnf1', SimpleImputer(), ['fever']),
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        ('tnf3', OneHotEncoder(drop='first', sparse=False), ['gender', 'city']),
    ],
    remainder='passthrough',
)

In [35]:
# Fit every sub-transformer on the training split only, then apply the fitted
# transformer unchanged to the test split.
x_train_final2 = tnf.fit_transform(x_train)
x_test_final2 = tnf.transform(x_test)

In [37]:
# Shapes should match the manually assembled matrices (train shape first, then test).
print(x_train_final2.shape, x_test_final2.shape, sep='\n')

(80, 7)
(20, 7)
