In [None]:
!pip install category_encoders

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder

In [None]:
# Column names for the header-less abalone data file.
# Note: names must not carry stray whitespace (a trailing tab in "Length"
# would force callers to write df["Length\t"] to access the column).
col_names = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole_weight',
    'Shucked_weight',
    'Viscera_weight',
    'Shell_weight',
    'Rings',
]

In [None]:
# The data file has no header row, so the column names are supplied explicitly.
df = pd.read_csv(
    "abalone.data",
    header=None,
    names=col_names,
)

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Sex,Length\t,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
# Count missing values per column.
df.isnull().sum()

Sex               0
Length\t          0
Diameter          0
Height            0
Whole_weight      0
Shucked_weight    0
Viscera_weight    0
Shell_weight      0
Rings             0
dtype: int64

In [None]:
# Print column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length	         4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole_weight    4177 non-null   float64
 5   Shucked_weight  4177 non-null   float64
 6   Viscera_weight  4177 non-null   float64
 7   Shell_weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


First, I used LabelEncoder.

In [None]:
# Replace the string "Sex" column with LabelEncoder's integer codes,
# leaving the original frame untouched.
le = LabelEncoder()
df_label_encoder = df.assign(Sex=le.fit_transform(df["Sex"]))
df_label_encoder.head()

Unnamed: 0,Sex,Length\t,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


The next one is OrdinalEncoder

In [None]:
# Encode "Sex" with category_encoders' OrdinalEncoder and store the result
# in a copy so the raw frame stays unchanged.
enc = OrdinalEncoder()
sex_ordinal = enc.fit_transform(df["Sex"])

df_ordinal_encoder = df.copy()
df_ordinal_encoder["Sex"] = sex_ordinal
df_ordinal_encoder.head()


Unnamed: 0,Sex,Length\t,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


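Note the difference in how the codes are assigned: LabelEncoder numbers the classes in alphabetical order (F = 0, I = 1, M = 2), whereas OrdinalEncoder from category_encoders numbers them in order of first appearance in the data (M = 1, F = 2, I = 3).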

Then I used OneHotEncoder.

In [None]:
# One-hot encode "Sex" into a dense array and append the indicator columns
# to a new frame; the original "Sex" column is kept alongside them.
ohenc = OneHotEncoder(sparse_output=False)
arr_ohenc = ohenc.fit_transform(df[['Sex']])

onehot_frame = pd.DataFrame(
    arr_ohenc,
    columns=['Sex_1', 'Sex_2', 'Sex_3'],
    index=df.index,
)
df_ohenc = pd.concat([df, onehot_frame], axis=1)
df_ohenc.head()

Unnamed: 0,Sex,Length\t,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Sex_1,Sex_2,Sex_3
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0.0,0.0,1.0
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0.0,0.0,1.0
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1.0,0.0,0.0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0.0,0.0,1.0
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0.0,1.0,0.0


In [None]:
# Cast the float indicator columns to integers for a cleaner 0/1 display.
onehot_int_types = {column: int for column in ('Sex_1', 'Sex_2', 'Sex_3')}
df_ohenc = df_ohenc.astype(onehot_int_types)
df_ohenc.head()

Unnamed: 0,Sex,Length\t,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Sex_1,Sex_2,Sex_3
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0


Now let's use SumEncoder

In [None]:
# Sum (deviation) coding of "Sex"; the encoder output is attached to a copy
# of the raw frame under the three column names listed below.
# NOTE(review): this assumes the encoder returns exactly three columns
# (an intercept plus two contrast columns), as in the displayed output —
# confirm against the installed category_encoders version.
sumEnc = SumEncoder()
arr_sumEnc = sumEnc.fit_transform(df["Sex"])

sum_columns = ['intercept', 'Sex_1', 'Sex_2']
sumEnc_df = df.copy()
sumEnc_df[sum_columns] = arr_sumEnc
sumEnc_df.head()



Unnamed: 0,Sex,Length\t,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,intercept,Sex_1,Sex_2
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,1,1.0,0.0
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,1,1.0,0.0
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0.0,1.0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,1,1.0,0.0
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,1,-1.0,-1.0


And the last one is TargetEncoder

In [None]:
# Target-encode "Sex" using "Height" as the target column and write the
# encoded values into a copy of the raw frame.
# NOTE(review): "Height" is itself a feature column; confirm it is the
# intended target for this demonstration.
te = TargetEncoder()
sex_target_encoded = te.fit_transform(df["Sex"], df["Height"])

df_target_encoder = df.copy()
df_target_encoder["Sex"] = sex_target_encoded
df_target_encoder.head()

Unnamed: 0,Sex,Length\t,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,0.151381,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.151381,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.158011,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.151381,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.107996,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
