# Example of Data Preprocessing

In [1]:
import numpy as np
import polars as pl
from sklearn.datasets import fetch_openml

from pytred import DataHub
from pytred.decorators import polars_table

## Make Dataset
This example uses the Titanic dataset.  
A column named "record_id" is added to serve as the key for joining tables.

In [2]:
# Fetch the Titanic data from OpenML (https://www.openml.org/d/40945)
# as a pandas feature frame `X` and a target series `y`.
X, y = fetch_openml(name="titanic", version=1, return_X_y=True, as_frame=True)

# Convert the features to polars, then append the target cast to int
# and a running row number that is used as the join key later on.
df_titanic = pl.from_pandas(X).with_columns(
    survived=y.values.astype(int),
    record_id=np.arange(X.shape[0]),
)

display(df_titanic)

pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived,record_id
i64,str,cat,f64,i64,i64,str,f64,str,cat,str,f64,str,i64,i64
1,"""Allen, Miss. E…","""female""",29.0,0,0,"""24160""",211.3375,"""B5""","""S""","""2""",,"""St Louis, MO""",1,0
1,"""Allison, Maste…","""male""",0.9167,1,2,"""113781""",151.55,"""C22 C26""","""S""","""11""",,"""Montreal, PQ /…",1,1
1,"""Allison, Miss.…","""female""",2.0,1,2,"""113781""",151.55,"""C22 C26""","""S""",,,"""Montreal, PQ /…",0,2
1,"""Allison, Mr. H…","""male""",30.0,1,2,"""113781""",151.55,"""C22 C26""","""S""",,135.0,"""Montreal, PQ /…",0,3
1,"""Allison, Mrs. …","""female""",25.0,1,2,"""113781""",151.55,"""C22 C26""","""S""",,,"""Montreal, PQ /…",0,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3,"""Zabour, Miss. …","""female""",14.5,1,0,"""2665""",14.4542,,"""C""",,328.0,,0,1304
3,"""Zabour, Miss. …","""female""",,1,0,"""2665""",14.4542,,"""C""",,,,0,1305
3,"""Zakarian, Mr. …","""male""",26.5,0,0,"""2656""",7.225,,"""C""",,304.0,,0,1306
3,"""Zakarian, Mr. …","""male""",27.0,0,0,"""2670""",7.225,,"""C""",,,,0,1307


## Preprocessing list
Implement a DataHub class that performs the following preprocessing tasks:
1. Replace "male" with 1 and "female" with 0.
2. Fill in missing values of age with the mean.
3. Compute the sum of sibsp and parch.
4. One-hot encode embarked.

### replace "male" with 1 and "female" with 0

```
The argument named "titanic" of a function decorated with `@polars_table` receives the DataFrame that is passed as the keyword argument "titanic" to MyDataHub.
```

In [3]:
class MyDataHub(DataHub):
    """DataHub with a single table that encodes the "sex" column as 0/1."""

    @polars_table(0, "record_id", join="left")
    def replace_sex(self, titanic):
        """
        Build a table with "sex" and its numeric encoding "sex_replaced".

        Parameters
        -----
        titanic: pl.DataFrame
            The pl.DataFrame that was passed as keyword argument "titanic"
            when the MyDataHub instance was created.

        Returns
        -----
        pl.DataFrame
            Columns "record_id", "sex" and "sex_replaced".
        """
        # 1 where sex is "male", 0 for everything else
        is_male = pl.col("sex") == "male"
        sex_flag = pl.when(is_male).then(1).otherwise(0).alias("sex_replaced")

        return titanic.select("record_id", "sex", sex_flag)

In [4]:
# root_df carries only "record_id" and "survived"; the full frame is
# handed over under the name "titanic" so that replace_sex() receives it.
my_datahub = MyDataHub(
    root_df=df_titanic.select(["record_id", "survived"]),
    titanic=df_titanic,
)

# Calling the hub returns the assembled table.
output = my_datahub()
display(output.head())

# Every distinct (sex, sex_replaced) pair, to verify the mapping.
print("Check replacing result")
display(output.select(["sex", "sex_replaced"]).unique())

record_id,survived,sex,sex_replaced
i64,i64,cat,i32
0,1,"""female""",0
1,1,"""male""",1
2,0,"""female""",0
3,0,"""male""",1
4,0,"""female""",0


Check replacing result


sex,sex_replaced
cat,i32
"""female""",0
"""male""",1


### fill in missing values of age with the mean

In [5]:
class MyDataHub(DataHub):
    """DataHub with a single table that fills missing "age" values with the mean age."""

    @polars_table(1, "record_id", join="left")
    def fill_age(self, titanic):
        """
        Build a table with "age" and a null-free copy "filled_age".

        Parameters
        -----
        titanic: pl.DataFrame
            The pl.DataFrame that was passed as keyword argument "titanic"
            when the MyDataHub instance was created.

        Returns
        -----
        pl.DataFrame
            Columns "record_id", "age" (unchanged) and "filled_age"
            (nulls of "age" replaced by the column mean).
        """
        # calculate average; Series.mean() already yields a scalar, so there
        # is no need to go through a one-cell DataFrame and a numpy array
        age_mean = titanic.get_column("age").mean()

        # this print() is for debug
        print(f"Value to fill in missing values: {age_mean}")

        # fill in missing values of age with the mean
        # (the same mean as printed above, computed as a polars expression)
        filled_age = titanic.select(
            "record_id",
            "age",
            filled_age=pl.col("age").fill_null(pl.col("age").mean()),
        )
        return filled_age


In [6]:
# root_df carries only "record_id" and "survived"; the full frame is
# handed over under the name "titanic" so that fill_age() receives it.
my_datahub = MyDataHub(
    root_df=df_titanic.select(["record_id", "survived"]),
    titanic=df_titanic,
)

# Calling the hub returns the assembled table.
output = my_datahub()
display(output.head())

# Rows whose original age is null: list the distinct values they were given.
print("Check filling result")
display(output.filter(pl.col("age").is_null()).select(["filled_age"]).unique())

Value to fill in missing values: 29.8811345124283


record_id,survived,age,filled_age
i64,i64,f64,f64
0,1,29.0,29.0
1,1,0.9167,0.9167
2,0,2.0,2.0
3,0,30.0,30.0
4,0,25.0,25.0


Check filling result


filled_age
f64
29.881135


### sum of sibsp and parch

In [7]:
class MyDataHub(DataHub):
    """DataHub with a single table that adds "sibsp" and "parch" together."""

    @polars_table(2, "record_id", join="left")
    def cnt_family(self, titanic):
        """
        Build a table with "sibsp", "parch" and their sum "cnt_family".

        Parameters
        -----
        titanic: pl.DataFrame
            The pl.DataFrame that was passed as keyword argument "titanic"
            when the MyDataHub instance was created.

        Returns
        -----
        pl.DataFrame
            Columns "record_id", "sibsp", "parch" and "cnt_family".
        """
        # row-wise sibsp + parch
        family_total = (pl.col("sibsp") + pl.col("parch")).alias("cnt_family")

        return titanic.select("record_id", "sibsp", "parch", family_total)


In [8]:
# root_df carries only "record_id" and "survived"; the full frame is
# handed over under the name "titanic" so that cnt_family() receives it.
my_datahub = MyDataHub(
    root_df=df_titanic.select(["record_id", "survived"]),
    titanic=df_titanic,
)

# Calling the hub returns the assembled table.
output = my_datahub()
display(output.head())

# Number of rows where cnt_family differs from sibsp + parch.
print("Check cnt_family column")
display(
    output.filter(
        pl.col("cnt_family").ne(pl.col("sibsp") + pl.col("parch"))
    ).height
)

record_id,survived,sibsp,parch,cnt_family
i64,i64,i64,i64,i64
0,1,0,0,0
1,1,1,2,3
2,0,1,2,3
3,0,1,2,3
4,0,1,2,3


Check cnt_family column


0

### onehot encoding of embarked


In [9]:
class MyDataHub(DataHub):
    """DataHub with a single table that one-hot encodes the "embarked" column."""

    @polars_table(3, "record_id", join="left")
    def onehot_embarked(self, titanic):
        """
        Build a table with "embarked" and one 0/1 indicator column per port.

        Parameters
        -----
        titanic: pl.DataFrame
            The pl.DataFrame that was passed as keyword argument "titanic"
            when the MyDataHub instance was created.

        Returns
        -----
        pl.DataFrame
            Columns "record_id", "embarked", "embarked_C", "embarked_Q"
            and "embarked_S".
        """
        # One indicator per port code: 1 where "embarked" equals the code,
        # 0 otherwise. Dict order fixes the output column order C, Q, S.
        indicators = {
            f"embarked_{port}": pl.when(pl.col("embarked") == port).then(1).otherwise(0)
            for port in ("C", "Q", "S")
        }

        return titanic.select("record_id", "embarked", **indicators)


In [10]:
# root_df carries only "record_id" and "survived"; the full frame is
# handed over under the name "titanic" so that onehot_embarked() receives it.
my_datahub = MyDataHub(
    root_df=df_titanic.select(["record_id", "survived"]),
    titanic=df_titanic,
)

# Calling the hub returns the assembled table.
output = my_datahub()
display(output.head())

# Every distinct combination of "embarked" and its indicator columns.
print("Check encoding result")
encoded_columns = ["embarked", "embarked_C", "embarked_Q", "embarked_S"]
display(output.select(encoded_columns).unique())

record_id,survived,embarked,embarked_C,embarked_Q,embarked_S
i64,i64,cat,i32,i32,i32
0,1,"""S""",0,0,1
1,1,"""S""",0,0,1
2,0,"""S""",0,0,1
3,0,"""S""",0,0,1
4,0,"""S""",0,0,1


Check encoding result


embarked,embarked_C,embarked_Q,embarked_S
cat,i32,i32,i32
"""S""",0,0,1
"""C""",1,0,0
,0,0,0
"""Q""",0,1,0


## Completed DataHub Class

In [11]:
class MyDataHub(DataHub):
    """
    DataHub that combines all four preprocessing steps of this example.

    Each method below receives the pl.DataFrame passed as keyword argument
    "titanic" at initialization and returns a table keyed by "record_id".
    """

    @polars_table(0, "record_id", join="left")
    def replace_sex(self, titanic):
        """
        Encode "sex" as a number.

        Returns
        -----
        pl.DataFrame
            Columns "record_id", "sex" and "sex_replaced"
            (1 where sex is "male", 0 otherwise).
        """
        # replace "male" with 1 and "female" with 0
        replace_sex = titanic.select(
            "record_id",
            "sex",
            sex_replaced=pl.when(pl.col("sex") == "male").then(1).otherwise(0),
        )
        return replace_sex

    @polars_table(1, "record_id", join="left")
    def fill_age(self, titanic):
        """
        Fill missing values of "age" with the column mean.

        Returns
        -----
        pl.DataFrame
            Columns "record_id", "age" (unchanged) and "filled_age".
        """
        # fill in missing values of age with the mean; the mean is computed
        # as a polars expression, so no round trip through numpy is needed
        filled_age = titanic.select(
            "record_id",
            "age",
            filled_age=pl.col("age").fill_null(pl.col("age").mean()),
        )
        return filled_age

    @polars_table(2, "record_id", join="left")
    def cnt_family(self, titanic):
        """
        Add "sibsp" and "parch" together.

        Returns
        -----
        pl.DataFrame
            Columns "record_id", "sibsp", "parch" and "cnt_family".
        """
        # sum of sibsp and parch
        cnt_family = titanic.select(
            "record_id",
            "sibsp",
            "parch",
            cnt_family=pl.col("sibsp") + pl.col("parch"),
        )
        return cnt_family

    @polars_table(3, "record_id", join="left")
    def onehot_embarked(self, titanic):
        """
        One-hot encode "embarked" for the values "C", "Q" and "S".

        Returns
        -----
        pl.DataFrame
            Columns "record_id", "embarked", "embarked_C", "embarked_Q"
            and "embarked_S"; each indicator is 1 on a match, 0 otherwise.
        """
        # onehot encoding of embarked
        onehot_embarked = titanic.select(
            "record_id",
            "embarked",
            embarked_C=pl.when(pl.col("embarked") == "C").then(1).otherwise(0),
            embarked_Q=pl.when(pl.col("embarked") == "Q").then(1).otherwise(0),
            embarked_S=pl.when(pl.col("embarked") == "S").then(1).otherwise(0),
        )

        return onehot_embarked


In [12]:
# root_df carries only "record_id" and "survived"; the full frame is
# handed over under the name "titanic" for the table methods to consume.
my_datahub = MyDataHub(
    root_df=df_titanic.select(["record_id", "survived"]),
    titanic=df_titanic,
)

# Calling the hub returns the assembled table with all preprocessing columns.
output = my_datahub()

display(output.head())


record_id,survived,sex,sex_replaced,age,filled_age,sibsp,parch,cnt_family,embarked,embarked_C,embarked_Q,embarked_S
i64,i64,cat,i32,f64,f64,i64,i64,i64,cat,i32,i32,i32
0,1,"""female""",0,29.0,29.0,0,0,0,"""S""",0,0,1
1,1,"""male""",1,0.9167,0.9167,1,2,3,"""S""",0,0,1
2,0,"""female""",0,2.0,2.0,1,2,3,"""S""",0,0,1
3,0,"""male""",1,30.0,30.0,1,2,3,"""S""",0,0,1
4,0,"""female""",0,25.0,25.0,1,2,3,"""S""",0,0,1
