In [1]:
import pandas as pd


# Small sample frame: a numeric Fare, two categorical columns (Embarked, Gender)
# and an Age column whose last entry is missing.
df = pd.DataFrame(
    data={
        'Fare': [25, 48, 71, 85, 90, 120],
        'Embarked': ['S', 'C', 'S', 'S', 'C', 'Q'],
        'Gender': ['male', 'female', 'female', 'female', 'male', 'male'],
        'Age': [22, 34, 54, 29, 55, None],  # None is shown as NaN in the frame
    }
)
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,male,22.0
1,48,C,female,34.0
2,71,S,female,54.0
3,85,S,female,29.0
4,90,C,male,55.0
5,120,Q,male,


### Encoder and Imputers

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
# Encoder that replaces each distinct label with an integer code; it is fitted in the next cell.
lab_enc = LabelEncoder()

In [4]:
# Learn the distinct Embarked labels and swap each one for its integer code.
# For this data the codes come out as C -> 0, Q -> 1, S -> 2.
df2 = lab_enc.fit_transform(df.loc[:, 'Embarked'])

# Wrap the encoded array in a Series so it displays with an index.
pd.Series(data=df2)

0    2
1    0
2    2
3    2
4    0
5    1
dtype: int32

In [5]:
# Show the label-encoded Embarked column in the context of the full frame.
# assign() returns a new frame instead of overwriting `df`, so the original
# string categories stay available to later cells and this cell can be re-run
# without depending on what was executed before it.
df.assign(Embarked=df2)

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,2,male,22.0
1,48,0,female,34.0
2,71,2,female,54.0
3,85,2,female,29.0
4,90,0,male,55.0
5,120,1,male,


In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

In [7]:
# One encoder for the categorical columns and one imputer for the missing
# numeric value, both created with their default settings.
ohe, si = OneHotEncoder(), SimpleImputer()

In [8]:
import pandas as pd


# Rebuild the sample frame with the original string categories so the
# column transformer below starts from unencoded data.
df = pd.DataFrame(
    data={
        'Fare': [25, 48, 71, 85, 90, 120],
        'Embarked': ['S', 'C', 'S', 'S', 'C', 'Q'],
        'Gender': ['male', 'female', 'female', 'female', 'male', 'male'],
        'Age': [22, 34, 54, 29, 55, None],  # last Age is missing on purpose
    }
)
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,male,22.0
1,48,C,female,34.0
2,71,S,female,54.0
3,85,S,female,29.0
4,90,C,male,55.0
5,120,Q,male,


In [9]:
# Route each group of columns to its own transformer and combine the results.
ct = make_column_transformer(
    (ohe, ['Embarked', 'Gender']),  # one-hot encode the categorical columns
    (si, ['Age']),                  # fill in the missing Age value
    remainder='passthrough',        # keep every column not listed above (here: Fare)
)

In [10]:
# Fit every transformer and apply it in one pass. Columns of the result, in order:
# one-hot Embarked (C, Q, S), one-hot Gender (female, male), imputed Age, and
# finally the passthrough Fare column. The missing Age comes out as 38.8, the
# mean of the five known ages.
ct.fit_transform(df)

array([[  0. ,   0. ,   1. ,   0. ,   1. ,  22. ,  25. ],
       [  1. ,   0. ,   0. ,   1. ,   0. ,  34. ,  48. ],
       [  0. ,   0. ,   1. ,   1. ,   0. ,  54. ,  71. ],
       [  0. ,   0. ,   1. ,   1. ,   0. ,  29. ,  85. ],
       [  1. ,   0. ,   0. ,   0. ,   1. ,  55. ,  90. ],
       [  0. ,   1. ,   0. ,   0. ,   1. ,  38.8, 120. ]])

In [11]:
# Re-display the input: fit_transform returned a new array, so df still holds
# the original strings and the missing Age.
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,male,22.0
1,48,C,female,34.0
2,71,S,female,54.0
3,85,S,female,29.0
4,90,C,male,55.0
5,120,Q,male,


## Ordinal Encoder

In [12]:
from sklearn.preprocessing import OrdinalEncoder

In [13]:
import pandas as pd


# Sample frame for ordinal encoding: Class and Size have a natural order,
# Shape does not.
df = pd.DataFrame(
    data={
        'Shape': ['square', 'oval', 'square', 'circle'],
        'Class': ['third', 'first', 'second', 'first'],
        'Size': ['M', 'S', 'XL', 'M'],
    }
)
df

Unnamed: 0,Shape,Class,Size
0,square,third,M
1,oval,first,S
2,square,second,XL
3,circle,first,M


In [14]:
# Spell out the category order for each column so the integer codes follow
# the intended ranking.
ord_enc = OrdinalEncoder(
    categories=[
        ['first', 'second', 'third'],  # Class: first -> 0, second -> 1, third -> 2
        ['S', 'M', 'XL'],              # Size:  S -> 0, M -> 1, XL -> 2
    ]
)
# Encode only the two ordered columns; the result is a float array.
df1 = ord_enc.fit_transform(df.loc[:, ['Class', 'Size']])

In [15]:
# Encoded result: column 0 is Class, column 1 is Size, with codes taken from
# the category order passed to the encoder.
df1

array([[2., 1.],
       [0., 0.],
       [1., 2.],
       [0., 1.]])

In [16]:
# The source frame is unchanged; the encoded values live only in df1.
df

Unnamed: 0,Shape,Class,Size
0,square,third,M
1,oval,first,S
2,square,second,XL
3,circle,first,M


# Binary Encoder

In [17]:
import pandas as pd


# Single categorical column with nine distinct values (A-I) and a few repeats.
df = pd.DataFrame({'Cat_data': list('ABCDEFGHIAAD')})
df

Unnamed: 0,Cat_data
0,A
1,B
2,C
3,D
4,E
5,F
6,G
7,H
8,I
9,A


In [18]:
from category_encoders import BinaryEncoder

In [19]:
# Binary encoder with default settings; it is fitted on the whole frame in the next cell.
bi_enc = BinaryEncoder()

In [20]:
# Fit on the single-column frame and encode it. Each category is written as a
# binary code spread across the Cat_data_* columns (5 columns in this output,
# versus 9 columns for the one-hot encoding shown below).
df_bi = bi_enc.fit_transform(df)
df_bi

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Cat_data_0,Cat_data_1,Cat_data_2,Cat_data_3,Cat_data_4
0,0,0,0,0,1
1,0,0,0,1,0
2,0,0,0,1,1
3,0,0,1,0,0
4,0,0,1,0,1
5,0,0,1,1,0
6,0,0,1,1,1
7,0,1,0,0,0
8,0,1,0,0,1
9,0,0,0,0,1


# Comparing with OneHotEncoder

In [21]:
# One-hot encode the same column for comparison: 9 categories -> 9 indicator columns.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so passing `sparse=False` fails on current releases. Keep the
# default sparse result and densify it explicitly, which works on old and new
# versions alike.
ohe = OneHotEncoder()
ohe.fit_transform(df[['Cat_data']]).toarray()

array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.]])

# KNN Imputer

In [22]:
# Sample frame for the KNN imputer: Age is missing in the last row.
df = pd.DataFrame(
    data={
        'Fare': [25, 48, 71, 85, 90, 120],
        'Embarked': ['S', 'C', 'S', 'S', 'C', 'Q'],
        'Gender': ['male', 'female', 'female', 'female', 'male', 'male'],
        'Age': [22, 34, 54, 29, 55, None],  # value to be imputed
    }
)
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,male,22.0
1,48,C,female,34.0
2,71,S,female,54.0
3,85,S,female,29.0
4,90,C,male,55.0
5,120,Q,male,


In [23]:
# KNNImputer fills a missing value by looking at the rows that are most similar in the other columns
# and averaging the missing column over those nearest neighbours.
# In this case the missing Age is imputed from the rows whose Fare is closest to the row with the NaN.
from sklearn.impute import KNNImputer

In [25]:
# Impute from the 3 nearest rows, using only the numeric Fare and Age columns.
# The missing Age comes out as 46, the mean Age of the three rows with the
# closest Fare (90, 85 and 71).
knn_ipm = KNNImputer(n_neighbors=3)
knn_ipm.fit_transform(df.loc[:, ['Fare', 'Age']])

array([[ 25.,  22.],
       [ 48.,  34.],
       [ 71.,  54.],
       [ 85.,  29.],
       [ 90.,  55.],
       [120.,  46.]])

# Iterative Imputer

### This method treats the other columns (those that do not have nulls) as features and the column containing nulls as the label. It trains on the rows where the label is known, then predicts the NaN values and imputes them. It is just like a regression problem in which the null column is the label.

In [26]:

# Before using Iterative Imputer, we need to enable it using below code
from sklearn.experimental import enable_iterative_imputer

# Import IterativeImputer
from sklearn.impute import IterativeImputer

In [27]:

# Sample frame for the iterative imputer: Age is missing in the last row.
df = pd.DataFrame(
    data={
        'Fare': [25, 48, 71, 85, 90, 120],
        'Embarked': ['S', 'C', 'S', 'S', 'C', 'Q'],
        'Gender': ['male', 'female', 'female', 'female', 'male', 'male'],
        'Age': [22, 34, 54, 29, 55, None],  # value to be predicted
    }
)
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,male,22.0
1,48,C,female,34.0
2,71,S,female,54.0
3,85,S,female,29.0
4,90,C,male,55.0
5,120,Q,male,


In [28]:
# Iterative imputer with default settings, applied to the numeric Fare and Age
# columns only. The missing Age is estimated from Fare (about 52.04 in this run).
iter_impute = IterativeImputer()
iter_impute.fit_transform(df.loc[:, ['Fare', 'Age']])

array([[ 25.        ,  22.        ],
       [ 48.        ,  34.        ],
       [ 71.        ,  54.        ],
       [ 85.        ,  29.        ],
       [ 90.        ,  55.        ],
       [120.        ,  52.03920049]])