# Ordinal encoding
Conda environment: `time-series`    
05 May 2024

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

In [10]:
# Notebook-wide configuration

# Plotting: seaborn "paper" context with enlarged fonts, right/top axes
# spines switched off, and a wide default figure size.
sns.set_context("paper", font_scale=1.5)
plotsize = (22, 5)
plt.rcParams.update(
    {
        "axes.spines.right": False,
        "axes.spines.top": False,
        "figure.figsize": plotsize,
    }
)

# Pandas display: floats with thousands separators and 4 decimals,
# and up to 200 columns shown.
pd.set_option("display.float_format", "{:,.4f}".format)
pd.set_option("display.max_columns", 200)

# Have scikit-learn transformers output pandas DataFrames instead of NumPy arrays
set_config(transform_output="pandas")

## 1) Input data

In [5]:
# Read the CSV, parsing "week" as dates and using it as the index.
df = pd.read_csv(
    "../datasets/online_retail_dataset_countries.csv",
    index_col="week",
    parse_dates=["week"],
)

# Count the distinct values in the "country" column
num_countries = df["country"].nunique()
print(f"Number of countries: {num_countries}")

# Show the loaded frame as the cell output
df

Number of countries: 6


Unnamed: 0_level_0,country,quantity,revenue
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-12-06,Belgium,143,439.1000
2009-12-13,Belgium,10,8.5000
2009-12-20,Belgium,0,0.0000
2009-12-27,Belgium,0,0.0000
2010-01-03,Belgium,0,0.0000
...,...,...,...
2011-11-13,United Kingdom,135234,229378.0100
2011-11-20,United Kingdom,129454,221870.2900
2011-11-27,United Kingdom,133998,210741.7600
2011-12-04,United Kingdom,123041,220213.9900


## 2) Feature Engineering

In [11]:
# Ordinal encoder with its default settings
o_enc = OrdinalEncoder()

# Wrap the encoder in a ColumnTransformer so that it is applied only to
# "country"; remainder="passthrough" keeps every other column in the output
# instead of dropping it.
ct = ColumnTransformer(
    transformers=[("o_enc", o_enc, ["country"])],
    remainder="passthrough",
)

# Show the (unfitted) transformer as the cell output
ct

In [13]:
# Fit the column transformer on df and transform it in a single step.
ordinal_encoding = ct.fit_transform(X=df)

# Show the transformed data as the cell output
ordinal_encoding

Unnamed: 0_level_0,o_enc__country,remainder__quantity,remainder__revenue
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-12-06,0.0000,143,439.1000
2009-12-13,0.0000,10,8.5000
2009-12-20,0.0000,0,0.0000
2009-12-27,0.0000,0,0.0000
2010-01-03,0.0000,0,0.0000
...,...,...,...
2011-11-13,5.0000,135234,229378.0100
2011-11-20,5.0000,129454,221870.2900
2011-11-27,5.0000,133998,210741.7600
2011-12-04,5.0000,123041,220213.9900


In [14]:
# Distinct values present in the encoded "country" column
ordinal_encoding.loc[:, "o_enc__country"].unique()

array([0., 1., 2., 3., 4., 5.])