In [1]:
import abc
from typing import Mapping, Tuple, List

import numpy as np
import polars as pl
import polars.selectors as cs

In [2]:
# Load the semicolon-separated CSV, drop the last column and every row whose
# values are all null, then parse Date/Time and convert the remaining columns
# to Float32.
# NOTE(review): the displayed output contains -200.0 readings (e.g. NOx(GT));
# presumably a missing-value sentinel in this dataset - confirm and mask it
# before scaling/binning if so.
air_quality_data = (
    pl.read_csv("../data/air_quality/air_quality.csv", separator=";")
    .drop(cs.last())
    .filter(
        ~pl.all_horizontal(pl.all().is_null())
    )
    .with_columns(
        pl.col("Date").str.strptime(pl.Date, "%d/%m/%Y"),
        # "%S" is the seconds field. The lowercase "%s" used before is the
        # UNIX-timestamp specifier, which only appeared to work because every
        # seconds field in the displayed data is "00".
        pl.col("Time").str.strptime(pl.Time, "%H.%M.%S"),
        # Remaining string columns use a decimal comma; swap it for a dot
        # so the values can be cast to floats.
        pl.col(pl.Utf8).exclude("Date", "Time").str.replace(",", ".").cast(pl.Float32),
        pl.col(pl.Int64).cast(pl.Float32)
    )
)

air_quality_data

Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
date,time,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
2004-03-10,18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.900002,0.7578
2004-03-10,19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.700001,0.7255
2004-03-10,20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-03-10,21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-03-10,22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.599998,0.7888
2004-03-10,23:00:00,1.2,1197.0,38.0,4.7,750.0,89.0,1337.0,96.0,1393.0,949.0,11.2,59.200001,0.7848
2004-03-11,00:00:00,1.2,1185.0,31.0,3.6,690.0,62.0,1462.0,77.0,1333.0,733.0,11.3,56.799999,0.7603
2004-03-11,01:00:00,1.0,1136.0,31.0,3.3,672.0,62.0,1453.0,76.0,1333.0,730.0,10.7,60.0,0.7702
2004-03-11,02:00:00,0.9,1094.0,24.0,2.3,609.0,45.0,1579.0,60.0,1276.0,620.0,10.7,59.700001,0.7648
2004-03-11,03:00:00,0.6,1010.0,19.0,1.7,561.0,-200.0,1705.0,-200.0,1235.0,501.0,10.3,60.200001,0.7517


In [3]:
class Transformer(abc.ABC):
    """Abstract interface for a DataFrame-to-DataFrame preprocessing step.

    Concrete subclasses must provide both ``fit_transform`` and
    ``transform``; the class cannot be instantiated otherwise.
    """

    @abc.abstractmethod
    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit the step on ``df`` and return the transformed frame.

        The subclasses in this notebook implement it as ``fit`` followed by
        ``transform`` on the same frame.
        """

    @abc.abstractmethod
    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return the transformed version of ``df``."""


class Pipeline(Transformer):
    """Run a list of transformers one after another.

    The output frame of each step is the input of the next one, in the
    order the steps were given. A progress line is printed before each step.
    """

    def __init__(self, transformers: List[Transformer]) -> None:
        """Store the ordered steps.

        Args:
            transformers: Steps to apply, first to last. The list is kept
                by reference, not copied.
        """
        super().__init__()
        self._transforms = transformers

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Call ``fit_transform`` on every step in order and return the result."""
        result = df
        for step in self._transforms:
            step_name = step.__class__.__name__
            print(f"Run fit-transform of {step_name}")
            result = step.fit_transform(result)
        return result

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Call ``transform`` on every step in order and return the result."""
        result = df
        for step in self._transforms:
            step_name = step.__class__.__name__
            print(f"Run transform of {step_name}")
            result = step.transform(result)
        return result

In [5]:
class MinMaxScaler(Transformer):
    """Min-max scale selected columns using the range observed during ``fit``.

    For every configured column ``c`` a new column ``c + suffix`` is added
    holding ``(value - min) / (max - min)``, where ``min`` and ``max`` come
    from the frame passed to ``fit``. The original columns are kept.
    """

    def __init__(self, columns: List[str], suffix: str = "_norm") -> None:
        """
        Args:
            columns: Names of the columns to scale.
            suffix: Appended to each column name to build the output name.
        """
        super().__init__()
        self._columns = columns
        self._suffix = suffix

    def fit(self, df: pl.DataFrame) -> "MinMaxScaler":
        """Record the minimum and maximum of every configured column.

        Returns:
            ``self``, so the call can be chained.

        Raises:
            ValueError: If a configured column has no non-null values, so
                no minimum/maximum can be determined.
        """
        df_min = df.select(pl.col(self._columns).min())
        df_max = df.select(pl.col(self._columns).max())
        min_values = {k: v[0] for k, v in df_min.to_dict().items()}
        max_values = {k: v[0] for k, v in df_max.to_dict().items()}

        # An empty or all-null column aggregates to None; fail here with a
        # clear message instead of a TypeError later inside transform().
        unusable = [
            c for c in self._columns
            if min_values[c] is None or max_values[c] is None
        ]
        if unusable:
            raise ValueError(
                f"Cannot fit MinMaxScaler: no non-null values in columns {unusable}"
            )

        self._min_values = min_values
        self._max_values = max_values
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Add the scaled ``<column><suffix>`` columns to ``df``.

        Columns whose fitted minimum equals the fitted maximum are scaled
        with a range of 1.0, so fitted values map to 0.0 instead of NaN.
        Must be called after ``fit``.
        """
        exprs = []
        for c in self._columns:
            low = self._min_values[c]
            span = self._max_values[c] - low
            if span == 0:
                # Constant column: avoid 0/0 (NaN) and x/0 (inf).
                span = 1.0
            exprs.append(((pl.col(c) - low) / span).alias(c + self._suffix))
        return df.with_columns(exprs)

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit on ``df`` and return ``df`` with the scaled columns added."""
        return self.fit(df).transform(df)


# Scale the T, RH and AH columns into new "*_norm" columns and display the result.
min_max_scaler = MinMaxScaler(columns=["T", "RH", "AH"], suffix="_norm")
min_max_scaler.fit_transform(air_quality_data)

Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,T_norm,RH_norm,AH_norm
date,time,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
2004-03-10,18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.900002,0.7578,0.873262,0.862141,0.992715
2004-03-10,19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.700001,0.7255,0.872036,0.857984,0.992555
2004-03-10,20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,0.866312,0.879806,0.992678
2004-03-10,21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,0.862633,0.900589,0.992858
2004-03-10,22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.599998,0.7888,0.863451,0.899203,0.992869
2004-03-10,23:00:00,1.2,1197.0,38.0,4.7,750.0,89.0,1337.0,96.0,1393.0,949.0,11.2,59.200001,0.7848,0.863451,0.897818,0.992849
2004-03-11,00:00:00,1.2,1185.0,31.0,3.6,690.0,62.0,1462.0,77.0,1333.0,733.0,11.3,56.799999,0.7603,0.863859,0.889505,0.992728
2004-03-11,01:00:00,1.0,1136.0,31.0,3.3,672.0,62.0,1453.0,76.0,1333.0,730.0,10.7,60.0,0.7702,0.861406,0.900589,0.992777
2004-03-11,02:00:00,0.9,1094.0,24.0,2.3,609.0,45.0,1579.0,60.0,1276.0,620.0,10.7,59.700001,0.7648,0.861406,0.89955,0.99275
2004-03-11,03:00:00,0.6,1010.0,19.0,1.7,561.0,-200.0,1705.0,-200.0,1235.0,501.0,10.3,60.200001,0.7517,0.859771,0.901282,0.992685


In [6]:
class UniformBinning(Transformer):
    """Discretise selected columns using NumPy's automatic histogram edges.

    ``fit`` computes bin edges per column with
    ``np.histogram_bin_edges(..., bins="auto")``; ``transform`` adds a column
    ``<column><suffix>`` holding the ``np.digitize`` index of every value
    against those edges. The original columns are kept.
    """

    def __init__(self, columns: List[str], suffix: str = "_binned") -> None:
        """
        Args:
            columns: Names of the columns to bin.
            suffix: Appended to each column name to build the output name.
        """
        super().__init__()
        self._columns = columns
        self._suffix = suffix

    @property
    def n_bins(self) -> Mapping[str, int]:
        """Length of the fitted edge array per column.

        This counts bin *edges*, which is one more than the number of
        intervals between them. Available only after ``fit``.
        """
        return {c: len(self._bins[c]) for c in self._columns}

    @property
    def bins(self) -> Mapping[str, np.ndarray]:
        """Fitted bin edges per column. Available only after ``fit``."""
        return self._bins

    def fit(self, df: pl.DataFrame) -> "UniformBinning":
        """Compute and store the bin edges of every configured column.

        Returns:
            ``self``, so the call can be chained.
        """
        self._bins = {
            c: self._bin_feature(df[c].to_numpy()) for c in self._columns
        }
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Add the ``<column><suffix>`` bin-index columns to ``df``.

        Indices come straight from ``np.digitize`` against the fitted edges,
        so missing (NaN) inputs are handled the way ``np.digitize`` does.
        Must be called after ``fit``.
        """
        return df.with_columns([
            # An explicit Series keeps the row-wise alignment obvious.
            pl.Series(
                c + self._suffix,
                np.digitize(df[c].to_numpy(), self._bins[c]),
            )
            for c in self._columns
        ])

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit on ``df`` and return ``df`` with the bin-index columns added."""
        return self.fit(df).transform(df)

    def _bin_feature(self, feature_values: np.ndarray) -> np.ndarray:
        """Return the automatic histogram bin edges for one column.

        Non-finite entries (NaN from nulls, +/-inf) are ignored, because
        ``np.histogram_bin_edges`` raises when the data range is not finite.
        """
        if np.issubdtype(feature_values.dtype, np.floating):
            feature_values = feature_values[np.isfinite(feature_values)]
        return np.histogram_bin_edges(feature_values, bins="auto")
    

# Bin three of the sensor columns into new "*_binned" columns and display the result.
uniform_binning = UniformBinning(
    columns=["CO(GT)", "PT08.S1(CO)", "NMHC(GT)"],
    suffix="_binned",
)
uniform_binning.fit_transform(air_quality_data)

Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,CO(GT)_binned,PT08.S1(CO)_binned,NMHC(GT)_binned
date,time,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i64,i64
2004-03-10,18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.900002,0.7578,1068,56,4
2004-03-10,19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.700001,0.7255,1065,53,4
2004-03-10,20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,1066,57,4
2004-03-10,21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,1066,56,4
2004-03-10,22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.599998,0.7888,1063,52,3
2004-03-10,23:00:00,1.2,1197.0,38.0,4.7,750.0,89.0,1337.0,96.0,1393.0,949.0,11.2,59.200001,0.7848,1061,50,3
2004-03-11,00:00:00,1.2,1185.0,31.0,3.6,690.0,62.0,1462.0,77.0,1333.0,733.0,11.3,56.799999,0.7603,1061,49,3
2004-03-11,01:00:00,1.0,1136.0,31.0,3.3,672.0,62.0,1453.0,76.0,1333.0,730.0,10.7,60.0,0.7702,1060,48,3
2004-03-11,02:00:00,0.9,1094.0,24.0,2.3,609.0,45.0,1579.0,60.0,1276.0,620.0,10.7,59.700001,0.7648,1060,46,3
2004-03-11,03:00:00,0.6,1010.0,19.0,1.7,561.0,-200.0,1705.0,-200.0,1235.0,501.0,10.3,60.200001,0.7517,1058,43,3


In [7]:
# n_bins reports the length of each fitted edge array, i.e. the number of
# bin edges (not intervals) per column.
print(uniform_binning.n_bins)

{'CO(GT)': 1118, 'PT08.S1(CO)': 80, 'NMHC(GT)': 16}


In [8]:
class WinsorizingTransform(Transformer):
    """Pull extreme values of selected columns towards the bulk of the data.

    ``fit`` computes, per column, a lower and an upper quantile plus a fill
    value for each side. ``transform`` adds a column ``<column><suffix>`` in
    which values at or below the lower quantile are replaced by the smallest
    fitted value strictly above it, and values at or above the upper quantile
    by the largest fitted value strictly below it. The original columns are
    kept.
    """

    def __init__(
        self,
        columns: List[str],
        # NOTE(review): "_binned" looks copy-pasted from UniformBinning; it is
        # kept unchanged so callers relying on the default are not affected.
        suffix: str = "_binned",
        limits: List[float] = [0.05, 0.05],
    ) -> None:
        """
        Args:
            columns: Names of the columns to winsorize.
            suffix: Appended to each column name to build the output name.
            limits: ``[lower, upper]`` fractions; the quantiles used are
                ``lower`` and ``1 - upper``.
        """
        super().__init__()
        self._columns = columns
        self._suffix = suffix
        # Copy into a tuple so neither the shared default list nor a
        # caller-owned list can be mutated through this instance.
        self._limits = tuple(limits)

    def fit(self, df: pl.DataFrame) -> "WinsorizingTransform":
        """Compute and store the quantile borders of every configured column.

        Returns:
            ``self``, so the call can be chained.

        Raises:
            ValueError: If a configured column has no non-NaN values.
        """
        self._borders = {
            c: self._feature_borders(df[c].to_numpy()) for c in self._columns
        }
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Add the ``<column><suffix>`` winsorized columns to ``df``.

        NaN entries are left as NaN. Must be called after ``fit``.
        """
        return df.with_columns([
            pl.Series(
                c + self._suffix,
                self._winsorize(df[c].to_numpy(), c),
            )
            for c in self._columns
        ])

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit on ``df`` and return ``df`` with the winsorized columns added."""
        return self.fit(df).transform(df)

    def _feature_borders(self, feature_values: np.ndarray) -> Tuple[Tuple[float, float], Tuple[float, float]]:
        """Return ``((left, left_fill), (right, right_fill))`` for one column.

        ``left``/``right`` are the lower/upper quantiles; ``left_fill`` is the
        smallest value strictly above ``left`` and ``right_fill`` the largest
        value strictly below ``right``. NaN entries are ignored.

        Raises:
            ValueError: If there are no non-NaN values.
        """
        if np.issubdtype(feature_values.dtype, np.floating):
            # A single NaN would turn both quantiles into NaN.
            feature_values = feature_values[~np.isnan(feature_values)]
        if feature_values.size == 0:
            raise ValueError(
                "Cannot compute winsorizing borders: column has no non-NaN values"
            )

        left = np.quantile(feature_values, q=self._limits[0])
        right = np.quantile(feature_values, q=(1.0 - self._limits[1]))

        above_left = feature_values[feature_values > left]
        below_right = feature_values[feature_values < right]
        # When nothing lies strictly inside a border (e.g. a constant
        # column) the quantile coincides with the data extreme; fall back to
        # that extreme instead of reducing over an empty array.
        left_fill = above_left.min() if above_left.size else feature_values.max()
        right_fill = below_right.max() if below_right.size else feature_values.min()
        return (left, left_fill), (right, right_fill)

    def _winsorize(self, x: np.ndarray, name: str) -> np.ndarray:
        """Apply the fitted borders of column ``name`` to the values ``x``.

        The lower border is applied first, then the upper one. Comparisons
        are written so that NaN matches neither condition and stays NaN.
        """
        (left, left_fill), (right, right_fill) = self._borders[name]
        x = np.where(x <= left, left_fill, x)
        x = np.where(x >= right, right_fill, x)
        return x
    

# Winsorize three of the sensor columns into new "*_winsorized" columns and display the result.
winsorizing_transform = WinsorizingTransform(
    columns=["CO(GT)", "PT08.S1(CO)", "NMHC(GT)"],
    suffix="_winsorized",
)
winsorizing_transform.fit_transform(air_quality_data)

Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,CO(GT)_winsorized,PT08.S1(CO)_winsorized,NMHC(GT)_winsorized
date,time,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
2004-03-10,18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.900002,0.7578,2.6,1360.0,144.0
2004-03-10,19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.700001,0.7255,2.0,1292.0,112.0
2004-03-10,20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,2.2,1402.0,88.0
2004-03-10,21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,2.2,1376.0,80.0
2004-03-10,22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.599998,0.7888,1.6,1272.0,51.0
2004-03-10,23:00:00,1.2,1197.0,38.0,4.7,750.0,89.0,1337.0,96.0,1393.0,949.0,11.2,59.200001,0.7848,1.2,1197.0,38.0
2004-03-11,00:00:00,1.2,1185.0,31.0,3.6,690.0,62.0,1462.0,77.0,1333.0,733.0,11.3,56.799999,0.7603,1.2,1185.0,31.0
2004-03-11,01:00:00,1.0,1136.0,31.0,3.3,672.0,62.0,1453.0,76.0,1333.0,730.0,10.7,60.0,0.7702,1.0,1136.0,31.0
2004-03-11,02:00:00,0.9,1094.0,24.0,2.3,609.0,45.0,1579.0,60.0,1276.0,620.0,10.7,59.700001,0.7648,0.9,1094.0,24.0
2004-03-11,03:00:00,0.6,1010.0,19.0,1.7,561.0,-200.0,1705.0,-200.0,1235.0,501.0,10.3,60.200001,0.7517,0.6,1010.0,19.0
