# Ordinal Encoding

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the customers dataset (path is relative to the notebook's working directory).
df = pd.read_csv('./data/customers.csv')
# Preview five random rows; the fixed seed keeps the preview identical on every re-run.
df.sample(5, random_state=42)

Unnamed: 0,age,gender,review,education,purchased
14,15,Male,Poor,PG,Yes
41,23,Male,Good,PG,Yes
19,97,Male,Poor,PG,Yes
21,32,Male,Average,PG,No
3,72,Female,Good,PG,No


In [3]:
# Keep only the two categorical features and the target. Selecting by name
# (rather than by position with iloc[:, 2:]) makes this cell safe to re-run:
# a second run picks the same three columns instead of slicing more away.
df = df[['review', 'education', 'purchased']]
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [4]:
from sklearn.model_selection import train_test_split

# Split features and target. random_state pins the shuffle so the rows in each
# partition -- and every output derived from them -- are the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("purchased", axis=1),
    df["purchased"],
    random_state=42,
)
X_train.head()

Unnamed: 0,review,education
43,Poor,PG
34,Average,School
10,Good,UG
8,Average,UG
12,Poor,School


In [5]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order for each feature column, in the same order as the
# columns of X_train. A value's position in its list is the integer it is
# encoded to.
review_order = ['Poor', 'Average', 'Good']
education_order = ['School', 'UG', 'PG']

oe = OrdinalEncoder(categories=[review_order, education_order])
oe.fit(X_train)

In [6]:
# Encode into new variables instead of rebinding X_train: the cell stays
# re-runnable (transform always receives the original string columns) and the
# raw frame remains available. The held-out features are encoded with the same
# fitted encoder so train and test share one mapping.
X_train_enc = oe.transform(X_train)
X_test_enc = oe.transform(X_test)
X_train_enc[:5]

array([[0., 2.],
       [1., 0.],
       [2., 1.],
       [1., 1.],
       [0., 0.]])

In [7]:
# Category order held by the fitted encoder, one array per input column.
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

# Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

# Learn the target's label set. fit() returns the encoder itself, so
# construction and fitting are chained into one statement.
le = LabelEncoder().fit(y_train)

# Labels the encoder learned from y_train.
le.classes_

array(['No', 'Yes'], dtype=object)

In [9]:
# Replace the string training labels with integer codes from the fitted encoder.
# NOTE(review): y_train is rebound in place, so re-running this cell passes the
# already-encoded integers back to the encoder -- presumably an error; confirm.
y_train = le.transform(y_train)

In [10]:
# Encode the test labels with the encoder that was fitted on y_train.
# NOTE(review): y_test is rebound in place, so re-running this cell passes the
# already-encoded integers back to the encoder -- presumably an error; confirm.
y_test = le.transform(y_test)

In [11]:
# Inspect the encoded training labels.
y_train

array([0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0])

In [12]:
# Inspect the encoded test labels.
y_test

array([0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1])

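# One Hot Encoding

One-hot encoding is demonstrated in the Column Transformer section below, where `OneHotEncoder` is applied to the `gender` and `city` columns.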

# Column Transformer

In [13]:
import numpy as np
import pandas as pd

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [15]:
# Toy COVID dataset used for the ColumnTransformer walkthrough
# (path is relative to the notebook's working directory).
covid_csv = './data/covid-toy.csv'
df = pd.read_csv(covid_csv)
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [16]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and the transformed matrices built from it, are the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [18]:
from sklearn.compose import ColumnTransformer

In [19]:
# One preprocessing step per column group.
fever_imputer = SimpleImputer()  # fills the missing 'fever' values (default strategy)
cough_encoder = OrdinalEncoder(categories=[['Mild', 'Strong']])  # ordered: Mild, then Strong
nominal_encoder = OneHotEncoder(sparse_output=False, drop='first')  # dense output, first level dropped

# Columns not named below are passed through unchanged
# (remainder='passthrough').
transformer = ColumnTransformer(
    transformers=[
        ('tnf1', fever_imputer, ['fever']),
        ('tnf2', cough_encoder, ['cough']),
        ('tnf3', nominal_encoder, ['gender', 'city']),
    ],
    remainder='passthrough',
)

In [24]:
# Fit the column transformer on the training features and transform them in one step.
X_train_trans = transformer.fit_transform(X_train)

In [25]:
# Transform the test features with the transformer fitted on the training set
# (transform only, no refitting).
X_test_trans = transformer.transform(X_test)

In [26]:
# Display the transformed training matrix.
X_train_trans

array([[101.        ,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  19.        ],
       [101.        ,   1.        ,   0.        ,   1.        ,
          0.        ,   0.        ,  68.        ],
       [103.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  16.        ],
       [104.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   1.        ,  44.        ],
       [103.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  48.        ],
       [ 98.        ,   1.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  71.        ],
       [ 99.        ,   1.        ,   0.        ,   1.        ,
          0.        ,   0.        ,  59.        ],
       [ 98.        ,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  83.        ],
       [103.        ,   0.        ,   0.        ,   0.        ,
          1.    