In [4]:
import numpy as np
import pandas as pd

In [7]:
# Load the toy COVID dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="covid_toy.csv")

In [10]:
# Preview the first five rows to see the columns and how values are formatted.
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [14]:
# Category frequencies for the two nominal columns. The blank-line separator keeps
# the second Series' header from being glued onto the first Series' dtype footer.
print(df["gender"].value_counts(), df["city"].value_counts(), sep="\n\n")

gender
Female    59
Male      41
Name: count, dtype: int64 city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64


In [15]:
# Frequency of each cough level.
cough_counts = df["cough"].value_counts()
print(cough_counts)

cough
Mild      62
Strong    38
Name: count, dtype: int64


In [17]:
# Number of missing values in each column.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; "has_covid" is the target column.
# A fixed random_state makes the split, and every output derived from it,
# identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["has_covid"]),
    df["has_covid"],
    test_size=0.2,
    random_state=42,
)

## Aam Zindgi (the ordinary way): transforming each column by hand

In [24]:
# Fill the missing fever readings with a SimpleImputer left at its default settings.
si = SimpleImputer()

# Fit on the training split only, then reuse the fitted imputer for the test split.
X_train_fever = si.fit(X_train[["fever"]]).transform(X_train[["fever"]])
X_test_fever = si.transform(X_test[["fever"]])

X_train_fever.shape

(80, 1)

In [31]:
# Ordinal-encode the cough column; the category order is given explicitly
# as "Mild" first, then "Strong".
oe = OrdinalEncoder(categories=[["Mild", "Strong"]])

# Fit on the training split only, then apply the same mapping to the test split.
X_train_cough = oe.fit(X_train[["cough"]]).transform(X_train[["cough"]])
X_test_cough = oe.transform(X_test[["cough"]])

X_train_cough.shape


(80, 1)

In [38]:
# One-hot encode the gender and city columns. The first category of each
# column is dropped, and the result is a dense integer array.
ohe = OneHotEncoder(drop="first", sparse_output=False, dtype=int)

# Fit on the training split only, then encode the test split with the same categories.
X_train_gender_city = ohe.fit(X_train[["gender", "city"]]).transform(
    X_train[["gender", "city"]]
)
X_test_gender_city = ohe.transform(X_test[["gender", "city"]])

X_train_gender_city.shape

(80, 4)

In [46]:
# Age is used as-is. Selecting it with a list keeps it two-dimensional so it
# can be concatenated column-wise with the other transformed pieces.
X_train_age = X_train.loc[:, ["age"]].values
X_test_age = X_test.loc[:, ["age"]].values

# X_train_age

In [47]:
# Stitch the separately transformed pieces back together column-wise, in the
# order: age, fever, one-hot gender/city, ordinal cough.
train_parts = (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
test_parts = (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)

X_train_trf = np.concatenate(train_parts, axis=1)
X_test_trf = np.concatenate(test_parts, axis=1)

## Mentos Zindgi (the smart way): one ColumnTransformer for every column

In [50]:
from sklearn.compose import ColumnTransformer


In [63]:
# A single ColumnTransformer bundles the per-column steps: impute fever,
# ordinal-encode cough, one-hot encode gender and city. Columns not listed
# are passed through unchanged (remainder="passthrough").
transformer = ColumnTransformer(
    transformers=[
        ("tnf1", SimpleImputer(), ["fever"]),
        ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
        (
            "tnf3",
            OneHotEncoder(drop="first", sparse_output=False, dtype=int),
            ["gender", "city"],
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=True,
)

In [67]:
# Learn the preprocessing from the training split only, then apply the fitted
# transformer to both splits.
X_train_trf = transformer.fit_transform(X_train)
X_test_trf = transformer.transform(X_test)

In [77]:
# With verbose_feature_names_out=True the names look like "<step>__<feature>".
# Keep only the feature part. Splitting on the first "__" only means a column
# whose own name contains "__" is not truncated.
cols = transformer.get_feature_names_out().tolist()
col_names = [name.split("__", 1)[1] for name in cols]

In [80]:
# Wrap the transformed arrays in DataFrames labelled with the cleaned feature names.
X_train_trf = pd.DataFrame(X_train_trf, columns=col_names)
X_test_trf = pd.DataFrame(X_test_trf, columns=col_names)


In [82]:
# Cast every float column to int in both splits. The column list is taken from
# the training frame, but each split is converted from its OWN values; the test
# frame must never be overwritten with training data.
# NOTE: astype(int) discards the fractional part, so non-integer values
# (for example an imputed fever) are truncated.
for col in X_train_trf.columns:
    if X_train_trf[col].dtype == "float":
        X_train_trf[col] = X_train_trf[col].astype(int)
        X_test_trf[col] = X_test_trf[col].astype(int)
        
    

Unnamed: 0,fever,cough,gender_Male,city_Delhi,city_Kolkata,city_Mumbai,age
0,102,1,0,0,0,0,82
1,98,0,0,0,0,1,65
2,102,1,1,1,0,0,20
3,103,1,1,0,0,0,46
4,101,1,0,0,1,0,51
...,...,...,...,...,...,...,...
75,99,0,1,1,0,0,65
76,98,0,1,0,1,0,24
77,100,1,1,0,1,0,79
78,98,1,1,0,0,1,23
