In [1]:
import abc
from typing import Mapping, Tuple, List

import numpy as np
import polars as pl
import polars.selectors as cs

### Загружаем данные

In [2]:
# Read the Melbourne housing CSV into a polars DataFrame.
# The path is relative to the notebook's working directory.
melb_housing_data = pl.read_csv("../data/melburn_housing/melb_data.csv")
# Bare last expression so the notebook renders the frame.
melb_housing_data

Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
str,str,i64,str,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64
"""Abbotsford""","""85 Turner St""",2,"""h""",1.48e6,"""S""","""Biggin""","""3/12/2016""",2.5,3067.0,2.0,1.0,1.0,202.0,,,"""Yarra""",-37.7996,144.9984,"""Northern Metro…",4019.0
"""Abbotsford""","""25 Bloomburg S…",2,"""h""",1.035e6,"""S""","""Biggin""","""4/02/2016""",2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,"""Yarra""",-37.8079,144.9934,"""Northern Metro…",4019.0
"""Abbotsford""","""5 Charles St""",3,"""h""",1.465e6,"""SP""","""Biggin""","""4/03/2017""",2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,"""Yarra""",-37.8093,144.9944,"""Northern Metro…",4019.0
"""Abbotsford""","""40 Federation …",3,"""h""",850000.0,"""PI""","""Biggin""","""4/03/2017""",2.5,3067.0,3.0,2.0,1.0,94.0,,,"""Yarra""",-37.7969,144.9969,"""Northern Metro…",4019.0
"""Abbotsford""","""55a Park St""",4,"""h""",1.6e6,"""VB""","""Nelson""","""4/06/2016""",2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,"""Yarra""",-37.8072,144.9941,"""Northern Metro…",4019.0
"""Abbotsford""","""129 Charles St…",2,"""h""",941000.0,"""S""","""Jellis""","""7/05/2016""",2.5,3067.0,2.0,1.0,0.0,181.0,,,"""Yarra""",-37.8041,144.9953,"""Northern Metro…",4019.0
"""Abbotsford""","""124 Yarra St""",3,"""h""",1.876e6,"""S""","""Nelson""","""7/05/2016""",2.5,3067.0,4.0,2.0,0.0,245.0,210.0,1910.0,"""Yarra""",-37.8024,144.9993,"""Northern Metro…",4019.0
"""Abbotsford""","""98 Charles St""",2,"""h""",1.636e6,"""S""","""Nelson""","""8/10/2016""",2.5,3067.0,2.0,1.0,2.0,256.0,107.0,1890.0,"""Yarra""",-37.806,144.9954,"""Northern Metro…",4019.0
"""Abbotsford""","""6/241 Nicholso…",1,"""u""",300000.0,"""S""","""Biggin""","""8/10/2016""",2.5,3067.0,1.0,1.0,1.0,0.0,,,"""Yarra""",-37.8008,144.9973,"""Northern Metro…",4019.0
"""Abbotsford""","""10 Valiant St""",2,"""h""",1.097e6,"""S""","""Biggin""","""8/10/2016""",2.5,3067.0,3.0,1.0,2.0,220.0,75.0,1900.0,"""Yarra""",-37.801,144.9989,"""Northern Metro…",4019.0


In [4]:
class Transformer(abc.ABC):
    """Abstract interface for DataFrame-to-DataFrame transformations.

    Concrete subclasses must implement ``fit_transform`` and ``transform``.
    ``inverse_transform`` is optional: the default implementation raises
    ``NotImplementedError``.
    """

    @abc.abstractmethod
    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Abstract hook: subclasses fit on ``df`` and return the transformed frame."""

    @abc.abstractmethod
    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Abstract hook: subclasses return the transformed version of ``df``."""

    def inverse_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Reverse ``transform``; unsupported unless a subclass overrides it.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError


class Pipeline(Transformer):
    """Sequence of transformers applied one after another.

    ``fit_transform`` and ``transform`` feed the output of each stage into the
    next one, in the order the stages were given. ``inverse_transform`` undoes
    the stages in the opposite order, so that the last applied transformation
    is reverted first.
    """

    def __init__(self, transformers: List[Transformer]) -> None:
        """Store the stages.

        Args:
            transformers: stages to run, in application order.
        """
        super().__init__()
        self._transforms = transformers

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit every stage on the output of the previous one and return the final frame."""
        for t in self._transforms:
            print(f"Run fit-transform of {t.__class__.__name__}")
            df = t.fit_transform(df)
        return df

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Apply every (already fitted) stage in order and return the final frame."""
        for t in self._transforms:
            print(f"Run transform of {t.__class__.__name__}")
            df = t.transform(df)
        return df

    def inverse_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Revert the pipeline by calling ``inverse_transform`` on each stage.

        Stages are visited last-to-first: the frame produced by the pipeline is
        the output of the final stage, so that stage has to be undone first.

        Raises:
            NotImplementedError: if a stage does not support inversion.
        """
        # This method used to be a second `def transform`, which shadowed the
        # real `transform` above and left the pipeline without an inverse.
        for t in reversed(self._transforms):
            print(f"Run inverse-transform of {t.__class__.__name__}")
            df = t.inverse_transform(df)
        return df

### Ordinal encoding

In [6]:
# Preview the three string columns used as categorical inputs in the encoding examples.
melb_housing_data[["Type", "Method", "Regionname"]]

Type,Method,Regionname
str,str,str
"""h""","""S""","""Northern Metro…"
"""h""","""S""","""Northern Metro…"
"""h""","""SP""","""Northern Metro…"
"""h""","""PI""","""Northern Metro…"
"""h""","""VB""","""Northern Metro…"
"""h""","""S""","""Northern Metro…"
"""h""","""S""","""Northern Metro…"
"""h""","""S""","""Northern Metro…"
"""u""","""S""","""Northern Metro…"
"""h""","""S""","""Northern Metro…"


In [5]:
class OrdinalEncoder(Transformer):
    """Replace categorical columns with integer codes.

    For every configured column, ``fit`` builds a two-column lookup frame
    (``<column>_index`` as Int32, ``<column>`` with the original value) from
    the sorted unique values. ``transform`` swaps values for codes through a
    left join on that lookup, and ``inverse_transform`` swaps them back.
    Values without a match in the lookup come out as null (left-join semantics).
    """

    def __init__(self, columns: List[str]) -> None:
        """Configure the encoder.

        Args:
            columns: names of the columns to encode.
        """
        super().__init__()
        self._columns = columns
        # Filled by fit(). Created here so `mapping` is readable before fitting
        # instead of failing with an AttributeError.
        self._mappers: Mapping[str, pl.DataFrame] = {}

    @property
    def mapping(self) -> Mapping[str, pl.DataFrame]:
        """Per-column lookup frames built by ``fit`` (empty before fitting)."""
        return self._mappers

    def _require_fitted(self) -> None:
        """Raise a descriptive error when a configured column has no lookup yet."""
        missing = [c for c in self._columns if c not in self._mappers]
        if missing:
            raise RuntimeError(
                f"{self.__class__.__name__} is not fitted for columns {missing}; "
                "call fit() or fit_transform() first"
            )

    def fit(self, df: pl.DataFrame) -> "OrdinalEncoder":
        """Build the value -> code lookup for every configured column.

        Codes are the positions of the values in sorted order.

        Returns:
            The encoder itself, to allow chaining.
        """
        mappers = {}
        for c in self._columns:
            mappers[c] = (
                df[[c]].unique()
                .sort(c)
                .with_row_count(f"{c}_index")
                # Narrow the row counter to Int32 so the codes have a signed dtype.
                .with_columns(pl.col(f"{c}_index").cast(pl.Int32))
            )
        self._mappers = mappers
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with the configured columns replaced by their codes.

        Column order of the input frame is preserved.

        Raises:
            RuntimeError: if the encoder has not been fitted.
        """
        self._require_fitted()
        # The join/drop/rename below appends each encoded column at the end of
        # the frame; remember the layout so it can be restored afterwards.
        column_order = df.columns
        for c in self._columns:
            df = (
                df
                .join(self._mappers[c], on=c, how="left")
                .drop(c)
                .rename({f"{c}_index": c})
            )
        return df.select(column_order)

    def inverse_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with the integer codes replaced by the original values.

        Column order of the input frame is preserved.

        Raises:
            RuntimeError: if the encoder has not been fitted.
        """
        self._require_fitted()
        column_order = df.columns
        for c in self._columns:
            df = (
                df
                .rename({c: f"{c}_index"})
                .join(self._mappers[c], on=f"{c}_index", how="left")
                .drop(f"{c}_index")
            )
        return df.select(column_order)

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit on ``df`` and return its encoded version."""
        return self.fit(df).transform(df)


# Encoder for the three categorical columns; it is reused by later cells
# (round-trip check and mapping display).
ordinal_encoder = OrdinalEncoder(
    [
        "Type",
        "Method",
        "Regionname",
    ],
)
# Fit on the categorical columns only and display the encoded result.
ordinal_encoder.fit_transform(melb_housing_data[["Type", "Method", "Regionname"]])

Type,Method,Regionname
i32,i32,i32
0,1,2
0,1,2
0,3,2
0,0,2
0,4,2
0,1,2
0,1,2
0,1,2
2,1,2
0,1,2


Проверяем, что последовательность transform -> inverse_transform выдаёт исходный датафрейм

In [7]:
# Round trip: encode the full frame with the already fitted encoder, then decode it again.
melb_housing_data_recovered = ordinal_encoder.inverse_transform(ordinal_encoder.transform(melb_housing_data))

# Compare only the encoded columns; selecting them by name makes the check
# independent of where the columns ended up in the recovered frame.
melb_housing_data_recovered.select(["Type", "Method", "Regionname"]).frame_equal(
    melb_housing_data.select(["Type", "Method", "Regionname"])
)

True

In [8]:
# Display the per-column lookup frames (code -> original value) built by fit().
ordinal_encoder.mapping

{'Type': shape: (3, 2)
 ┌────────────┬──────┐
 │ Type_index ┆ Type │
 │ ---        ┆ ---  │
 │ i32        ┆ str  │
 ╞════════════╪══════╡
 │ 0          ┆ h    │
 │ 1          ┆ t    │
 │ 2          ┆ u    │
 └────────────┴──────┘,
 'Method': shape: (5, 2)
 ┌──────────────┬────────┐
 │ Method_index ┆ Method │
 │ ---          ┆ ---    │
 │ i32          ┆ str    │
 ╞══════════════╪════════╡
 │ 0            ┆ PI     │
 │ 1            ┆ S      │
 │ 2            ┆ SA     │
 │ 3            ┆ SP     │
 │ 4            ┆ VB     │
 └──────────────┴────────┘,
 'Regionname': shape: (8, 2)
 ┌──────────────────┬────────────────────────────┐
 │ Regionname_index ┆ Regionname                 │
 │ ---              ┆ ---                        │
 │ i32              ┆ str                        │
 ╞══════════════════╪════════════════════════════╡
 │ 0                ┆ Eastern Metropolitan       │
 │ 1                ┆ Eastern Victoria           │
 │ 2                ┆ Northern Metropolitan      │
 │ 3     

### One-hot encoding

In [41]:
class OneHotEncoder(Transformer):
    """Append one-hot indicator columns for the configured categorical columns.

    The categories are learned by an internal ``OrdinalEncoder``. For a column
    ``c`` with codes ``0..k-1`` the output gains UInt8 columns ``c_0`` ...
    ``c_{k-1}``; the original columns are kept.
    """

    def __init__(self, columns: List[str]) -> None:
        """Configure the encoder.

        Args:
            columns: names of the columns to one-hot encode.
        """
        super().__init__()
        self._columns = columns
        self._ordinal = OrdinalEncoder(columns)

    def fit(self, df: pl.DataFrame) -> "OneHotEncoder":
        """Learn the categories of every configured column and return ``self``."""
        self._ordinal.fit(df[self._columns])
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with the indicator columns appended on the right.

        The set of indicator columns is taken from the fitted categories, not
        from the values that happen to occur in ``df``, so every call yields the
        same schema. Rows whose value has no code (null after ordinal encoding)
        get zeros in all indicators of that column.
        """
        ordinal_encoded_df = self._ordinal.transform(df[self._columns])
        # Build one indicator per fitted code. DataFrame.to_dummies() would
        # instead create columns only for the codes present in this particular
        # frame (plus a "<column>_null" column for unmatched values).
        indicator_exprs = [
            (pl.col(c) == code)
            .fill_null(False)
            .cast(pl.UInt8)
            .alias(f"{c}_{code}")
            for c in self._columns
            for code in self._ordinal.mapping[c][f"{c}_index"].to_list()
        ]
        one_hot_encoded_df = ordinal_encoded_df.select(indicator_exprs)
        return pl.concat([df, one_hot_encoded_df], how="horizontal")

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit on ``df`` and return it with the indicator columns appended."""
        return self.fit(df).transform(df)
    
# One-hot encode the same three categorical columns; the output keeps the
# original columns and appends the indicator columns.
one_hot_encoder = OneHotEncoder([
    "Type",
    "Method",
    "Regionname",
])
one_hot_encoder.fit_transform(melb_housing_data[["Type", "Method", "Regionname"]])

Type,Method,Regionname,Type_0,Type_1,Type_2,Method_0,Method_1,Method_2,Method_3,Method_4,Regionname_0,Regionname_1,Regionname_2,Regionname_3,Regionname_4,Regionname_5,Regionname_6,Regionname_7
str,str,str,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8
"""h""","""S""","""Northern Metro…",1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
"""h""","""S""","""Northern Metro…",1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
"""h""","""SP""","""Northern Metro…",1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
"""h""","""PI""","""Northern Metro…",1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
"""h""","""VB""","""Northern Metro…",1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
"""h""","""S""","""Northern Metro…",1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
"""h""","""S""","""Northern Metro…",1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
"""h""","""S""","""Northern Metro…",1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
"""u""","""S""","""Northern Metro…",0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
"""h""","""S""","""Northern Metro…",1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0


### Binary Encoding

In [9]:
class BinaryEncoder(Transformer):
    """Replace categorical columns with the bits of their ordinal code.

    The categories are learned by an internal ``OrdinalEncoder``. A column
    ``c`` with ``k`` categories is replaced by ``width`` UInt8 columns
    ``c_0`` ... ``c_{width-1}`` (most significant bit first), where ``width``
    is the number of bits needed to write the largest code ``k - 1``.
    """

    def __init__(self, columns: List[str]) -> None:
        """Configure the encoder.

        Args:
            columns: names of the columns to binary-encode.
        """
        super().__init__()
        self._columns = columns
        self._ordinal = OrdinalEncoder(columns)

    def fit(self, df: pl.DataFrame) -> "BinaryEncoder":
        """Build, per column, a lookup frame from original value to bit columns.

        Returns:
            The encoder itself, to allow chaining.
        """
        self._ordinal.fit(df[self._columns])

        mappers = {}
        for c in self._columns:
            index_col = f"{c}_index"
            ordinal_map = self._ordinal.mapping[c]
            # Bits needed for the largest code (n - 1). Rounding log2(n) to the
            # nearest integer under-allocates (n = 5 needs 3 bits, not 2) and
            # yields 0 for a single category, so use bit_length with a floor of 1.
            width = max(1, (len(ordinal_map) - 1).bit_length())
            # Extract bit i (most significant first) with integer arithmetic,
            # which avoids a row-wise Python callback.
            bit_columns = [
                ((pl.col(index_col) // 2 ** (width - 1 - i)) % 2)
                .cast(pl.UInt8)
                .alias(f"{c}_{i}")
                for i in range(width)
            ]
            # Layout: bit columns first, then the original value used as join key.
            mappers[c] = ordinal_map.select([*bit_columns, pl.col(c)])
        self._mappers = mappers
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with each configured column replaced by its bit columns.

        The bit columns are appended on the right by a left join, so values
        without a match in the lookup get null bits.
        """
        for c in self._columns:
            df = (
                df
                .join(self._mappers[c], on=c, how="left")
                .drop(c)
            )
        return df

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit on ``df`` and return its binary-encoded version."""
        return self.fit(df).transform(df)
    
# Binary-encode a single categorical column and display the resulting bit columns.
binary_encoder = BinaryEncoder(["Regionname"])
binary_encoder.fit_transform(melb_housing_data[["Regionname"]])

Regionname_0,Regionname_1,Regionname_2
u8,u8,u8
0,1,0
0,1,0
0,1,0
0,1,0
0,1,0
0,1,0
0,1,0
0,1,0
0,1,0
0,1,0
