# Data encoding

In [3]:
import pandas as pd
import seaborn as sns

# Load Titanic dataset
df = sns.load_dataset('titanic')

# Select relevant categorical columns
df = df[['sex', 'embarked', 'class', 'who', 'survived']]
df.dropna(inplace=True)

## One-Hot Encoding

In [6]:
# Using pandas
df_ohe = pd.get_dummies(df, columns=['embarked'], prefix='embarked', drop_first=True)
df_ohe

Unnamed: 0,sex,class,who,survived,embarked_Q,embarked_S
0,male,Third,man,0,False,True
1,female,First,woman,1,False,False
2,female,Third,woman,1,False,True
3,female,First,woman,1,False,True
4,male,Third,man,0,False,True
...,...,...,...,...,...,...
886,male,Second,man,0,False,True
887,female,First,woman,1,False,True
888,female,Third,woman,0,False,True
889,male,First,man,1,False,False


In [13]:
# Using Sklearn
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False, drop="first")
encoded = ohe.fit_transform(df[['embarked']])
df_ohe_sklearn = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(['embarked']))
df_ohe_sklearn

Unnamed: 0,embarked_Q,embarked_S
0,0.0,1.0
1,0.0,0.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
884,0.0,1.0
885,0.0,1.0
886,0.0,1.0
887,0.0,0.0


## Label Encoding

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['sex_le'] = le.fit_transform(df['sex'])  # male=1, female=0
df

Unnamed: 0,sex,embarked,class,who,survived,sex_le
0,male,S,Third,man,0,1
1,female,C,First,woman,1,0
2,female,S,Third,woman,1,0
3,female,S,First,woman,1,0
4,male,S,Third,man,0,1
...,...,...,...,...,...,...
886,male,S,Second,man,0,1
887,female,S,First,woman,1,0
888,female,S,Third,woman,0,0
889,male,C,First,man,1,1


In [16]:
le = LabelEncoder()
df['embarked_le'] = le.fit_transform(df['embarked'])  # male=1, female=0
df

Unnamed: 0,sex,embarked,class,who,survived,sex_le,embarked_le
0,male,S,Third,man,0,1,2
1,female,C,First,woman,1,0,0
2,female,S,Third,woman,1,0,2
3,female,S,First,woman,1,0,2
4,male,S,Third,man,0,1,2
...,...,...,...,...,...,...,...
886,male,S,Second,man,0,1,2
887,female,S,First,woman,1,0,2
888,female,S,Third,woman,0,0,2
889,male,C,First,man,1,1,0


## Ordinal Encoding

In [18]:
from sklearn.preprocessing import OrdinalEncoder

# Define order: First < Second < Third
ord_enc = OrdinalEncoder(categories=[['First', 'Second', 'Third']])
df['class_ord'] = ord_enc.fit_transform(df[['class']])
df

Unnamed: 0,sex,embarked,class,who,survived,sex_le,embarked_le,class_ord
0,male,S,Third,man,0,1,2,2.0
1,female,C,First,woman,1,0,0,0.0
2,female,S,Third,woman,1,0,2,2.0
3,female,S,First,woman,1,0,2,0.0
4,male,S,Third,man,0,1,2,2.0
...,...,...,...,...,...,...,...,...
886,male,S,Second,man,0,1,2,1.0
887,female,S,First,woman,1,0,2,0.0
888,female,S,Third,woman,0,0,2,2.0
889,male,C,First,man,1,1,0,0.0


## Target Encoding

In [19]:
target_map = df.groupby('who')['survived'].mean()
df['who_target'] = df['who'].map(target_map)
df

Unnamed: 0,sex,embarked,class,who,survived,sex_le,embarked_le,class_ord,who_target
0,male,S,Third,man,0,1,2,2.0,0.163873
1,female,C,First,woman,1,0,0,0.0,0.754647
2,female,S,Third,woman,1,0,2,2.0,0.754647
3,female,S,First,woman,1,0,2,0.0,0.754647
4,male,S,Third,man,0,1,2,2.0,0.163873
...,...,...,...,...,...,...,...,...,...
886,male,S,Second,man,0,1,2,1.0,0.163873
887,female,S,First,woman,1,0,2,0.0,0.754647
888,female,S,Third,woman,0,0,2,2.0,0.754647
889,male,C,First,man,1,1,0,0.0,0.163873


## Frequency Encoding

In [20]:
freq_map = df['embarked'].value_counts()
df['embarked_freq'] = df['embarked'].map(freq_map)
df

Unnamed: 0,sex,embarked,class,who,survived,sex_le,embarked_le,class_ord,who_target,embarked_freq
0,male,S,Third,man,0,1,2,2.0,0.163873,644
1,female,C,First,woman,1,0,0,0.0,0.754647,168
2,female,S,Third,woman,1,0,2,2.0,0.754647,644
3,female,S,First,woman,1,0,2,0.0,0.754647,644
4,male,S,Third,man,0,1,2,2.0,0.163873,644
...,...,...,...,...,...,...,...,...,...,...
886,male,S,Second,man,0,1,2,1.0,0.163873,644
887,female,S,First,woman,1,0,2,0.0,0.754647,644
888,female,S,Third,woman,0,0,2,2.0,0.754647,644
889,male,C,First,man,1,1,0,0.0,0.163873,168


## Binary Encoding

In [21]:
!pip install category_encoders

import category_encoders as ce

be = ce.BinaryEncoder()
df_be = be.fit_transform(df['who'])
df_be

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
Installing collected packages: category_encoders
Successfully installed category_encoders-2.8.1


Unnamed: 0,who_0,who_1
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1
