In [240]:
import pandas as pd
import numpy as np

In [241]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [242]:
df= pd.read_csv('covid_toy.csv')
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [243]:
df.shape

(100, 6)

In [244]:
df.gender


0       Male
1       Male
2       Male
3     Female
4     Female
       ...  
95    Female
96    Female
97    Female
98    Female
99    Female
Name: gender, Length: 100, dtype: object

In [245]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

#missing data on fever

Splitting the data into test and train

In [246]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns= ['has_covid']), df['has_covid'], test_size= 0.2)

In [247]:
X_train

Unnamed: 0,age,gender,fever,cough,city
5,84,Female,,Mild,Bangalore
49,44,Male,104.0,Mild,Mumbai
32,34,Female,101.0,Strong,Delhi
10,75,Female,,Mild,Delhi
63,10,Male,100.0,Mild,Bangalore
...,...,...,...,...,...
30,15,Male,101.0,Mild,Delhi
42,27,Male,100.0,Mild,Delhi
46,19,Female,101.0,Mild,Mumbai
11,65,Female,98.0,Mild,Mumbai


Normal Procedure for Analysis

Fill the null Values in *fever* column

#adding sSimpleImputer in the fever column

In [248]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_fever= imp_mean.fit_transform(X_train[['fever']])

X_test_fever= imp_mean.fit_transform(X_test[['fever']])





In [249]:
X_test_fever

array([[100.        ],
       [ 98.        ],
       [100.83333333],
       [101.        ],
       [102.        ],
       [101.        ],
       [ 98.        ],
       [ 98.        ],
       [100.        ],
       [100.83333333],
       [102.        ],
       [103.        ],
       [102.        ],
       [101.        ],
       [100.        ],
       [102.        ],
       [104.        ],
       [103.        ],
       [100.        ],
       [100.        ]])

#Ordinal Operation for gender column

In [250]:
ohe = OneHotEncoder(drop='first', sparse_output= False)
X_train_city_gender = ohe.fit_transform(X_train[['gender', 'city']])
X_test_city_gender= ohe.fit_transform(X_test[['gender','city']])


In [251]:
X_train_city_gender

array([[0., 0., 0., 0.],
       [1., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 1., 0.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],


#Ordinal Encoding for "Cough column"

In [252]:
encoder = OrdinalEncoder(categories=[['Mild', 'Strong']])
X_train['cough'] = encoder.fit_transform(X_train[['cough']])

# Use transform (not fit_transform) on test data
X_test['cough'] = encoder.transform(X_test[['cough']])

Now we need to join all the operational coulmns togather 

Only " age colum " is undistrubed. so need to separate  this column from the main 

In [253]:
X_train_age= X_train.drop(columns=['gender', 'fever','cough','city'])

In [254]:
X_test_age= X_test.drop(columns=['gender', 'fever','cough','city'])
X_test_age.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 37 to 51
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   age     20 non-null     int64
dtypes: int64(1)
memory usage: 320.0 bytes


In [259]:
X_train_final_array = np.concatenate((X_train_age, X_train_fever, X_train_city_gender), axis=1)

In [260]:
X_train_final_array

array([[ 84.        , 100.84722222,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ 44.        , 104.        ,   1.        ,   0.        ,
          0.        ,   1.        ],
       [ 34.        , 101.        ,   0.        ,   1.        ,
          0.        ,   0.        ],
       [ 75.        , 100.84722222,   0.        ,   1.        ,
          0.        ,   0.        ],
       [ 10.        , 100.        ,   1.        ,   0.        ,
          0.        ,   0.        ],
       [ 16.        , 103.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ 49.        ,  99.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ 16.        , 104.        ,   1.        ,   0.        ,
          1.        ,   0.        ],
       [ 20.        , 101.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ 54.        , 104.        ,   0.        ,   0.        ,
          1.        ,   0. 

In [257]:
X_test_final_array = np.concatenate((X_test_age, X_test_fever, X_test_city_gender), axis=1)

In [258]:
X_test_final_array

array([[ 55.        , 100.        ,   1.        ,   0.        ,
          1.        ,   0.        ],
       [ 23.        ,  98.        ,   1.        ,   0.        ,
          0.        ,   1.        ],
       [ 23.        , 100.83333333,   1.        ,   0.        ,
          0.        ,   1.        ],
       [ 81.        , 101.        ,   0.        ,   0.        ,
          0.        ,   1.        ],
       [  5.        , 102.        ,   1.        ,   0.        ,
          1.        ,   0.        ],
       [ 47.        , 101.        ,   1.        ,   0.        ,
          0.        ,   0.        ],
       [ 34.        ,  98.        ,   1.        ,   0.        ,
          1.        ,   0.        ],
       [ 31.        ,  98.        ,   0.        ,   0.        ,
          1.        ,   0.        ],
       [ 47.        , 100.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ 79.        , 100.83333333,   1.        ,   0.        ,
          1.        ,   0. 

In [263]:
X_test = pd.DataFrame(X_test_final_array)
X_test

Unnamed: 0,0,1,2,3,4,5
0,55.0,100.0,1.0,0.0,1.0,0.0
1,23.0,98.0,1.0,0.0,0.0,1.0
2,23.0,100.833333,1.0,0.0,0.0,1.0
3,81.0,101.0,0.0,0.0,0.0,1.0
4,5.0,102.0,1.0,0.0,1.0,0.0
5,47.0,101.0,1.0,0.0,0.0,0.0
6,34.0,98.0,1.0,0.0,1.0,0.0
7,31.0,98.0,0.0,0.0,1.0,0.0
8,47.0,100.0,0.0,0.0,0.0,0.0
9,79.0,100.833333,1.0,0.0,1.0,0.0
