In [1]:
import os

# Switch the working directory to the parent folder so the relative
# "data/..." paths used further down resolve from there.
# NOTE: the move is relative, so re-running this cell climbs one more level.
os.chdir(os.pardir)

# Pipeline using SKPM event feature extraction

Let us build a basic pipeline for

- extracting timestamp-related features (*SKPM*)
- one-hot encoding activities (*sklearn*)
- predicting which resource will execute the next event

In this example we are using the `BPI20 - RequestForPayment` event log, which has already been split into train and test sets.

In [2]:
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
# StandardScaler is imported from its documented location,
# sklearn.preprocessing, rather than from sklearn.discriminant_analysis,
# where the name is not part of the documented API.
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from skpm.encoding.trace import TraceAggregator
from skpm.event_feature_extraction import TimestampExtractor

# Ask scikit-learn transformers to return pandas DataFrames so that column
# names (e.g. "case_id") remain available to later pipeline steps.
set_config(transform_output="pandas")


def read_log(path):
    """Load an event log from a CSV file and normalise its columns.

    The XES-style column names are renamed to short names, only the four
    columns used in this notebook are kept (in a fixed order), and the case
    identifier is cast to a pandas categorical.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``case_id``, ``activity``, ``timestamp`` and
        ``resource``.

    Raises
    ------
    KeyError
        If any of the four expected columns is missing after renaming.
    """
    short_names = {
        "case:concept:name": "case_id",
        "time:timestamp": "timestamp",
        "concept:name": "activity",
        "org:resource": "resource",
    }
    kept = ["case_id", "activity", "timestamp", "resource"]

    events = pd.read_csv(path).rename(columns=short_names)[kept]
    return events.astype({"case_id": "category"})

# Load the pre-split event logs.
train = read_log("data/train.csv")
test = read_log("data/test.csv")

# Separate the features from the target: the "resource" column is the label,
# and every other column is kept as an input feature.
X_train, y_train = train.drop(columns="resource"), train["resource"]
X_test, y_test = test.drop(columns="resource"), test["resource"]


## Define the time feature extraction pipeline

In [3]:
# Two-stage pipeline for the temporal columns: derive the timestamp features
# with SKPM, then standardise them.
time_transformer = Pipeline(
    [
        (
            "time",
            TimestampExtractor(
                case_col="case_id",
                time_col="timestamp",
                features="all",
            ),
        ),
        ("scale", StandardScaler()),
    ]
)

## One-hot encode for activities

In [4]:
# Single-step pipeline that one-hot encodes categorical columns.
# Dense output is requested, and categories unseen during fit are ignored
# instead of raising at transform time.
cat_transformer = Pipeline(
    [
        (
            "encoder",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        ),
    ]
)

## Putting everything together as a single preprocessing step

In [5]:
# Route each group of columns to its own transformer:
#   - "activity"               -> one-hot encoding
#   - "case_id" + "timestamp"  -> timestamp feature extraction + scaling
#   - "case_id"                -> forwarded untouched for the later aggregation step
# Any column not listed is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ("oh", cat_transformer, ["activity"]),
        ("time", time_transformer, ["case_id", "timestamp"]),
        ("case_id", "passthrough", ["case_id"]),
    ],
    remainder="passthrough",
)
# set_output configures the estimator in place; the original relied on it
# returning the same estimator, so calling it separately is equivalent.
preprocessor.set_output(transform="pandas")

## Classification pipeline

In [6]:
# End-to-end model: preprocess the event columns, aggregate them with
# TraceAggregator (mean, keyed by "case_id"), then classify with a random
# forest.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("aggregator", TraceAggregator(case_col="case_id", method="mean")),
        # A fixed random_state makes the forest (and therefore the reported
        # score) reproducible across kernel restarts; n_jobs=-1 uses all cores.
        ("classifier", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
).set_output(transform="pandas")

## Running pipeline

**NOTE:** By default, the sklearn `ColumnTransformer` prepends a prefix (the name of the transformer) to each output feature, marking which step produced it.

In [7]:
# Tip: to inspect the prefixed feature names produced by the preprocessing
# step, run `preprocessor.fit_transform(X_train).columns`.

# Fit on the training split, then print the score on the test split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
print(clf.fit(X_train, y_train).score(X_test, y_test))

0.6167013992259601
