In [None]:
import os

# Move the working directory one level up from where the kernel started.
# The starting directory is remembered on first execution so that re-running
# this cell always lands in the same place instead of climbing one more
# level on every run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

# Pipeline using SKPM event feature extraction

Let us build a basic pipeline for

- extracting timestamp-related and resource-related features (*SKPM*)
- one-hot encoding activities (*sklearn*)
- predicting the remaining time

In this example we are using the `BPI20 - RequestForPayment` event log, which has already been split into train and test sets.

In [None]:
import pandas as pd
from sklearn import set_config

# Ask scikit-learn to make transformers return pandas DataFrames by default.
set_config(transform_output="pandas")
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from skpm.encoding.trace import TraceAggregator
from skpm.event_feature_extraction import TimestampExtractor, ResourcePoolExtractor


def read_log(path):
    """Load an event log from CSV and normalise it for the pipelines below.

    The XES-style column names are renamed to short ones (``case_id``,
    ``activity``, ``timestamp``, ``resource``, ``remaining_time``), only those
    five columns are kept, and every row with a missing value in any of them
    is dropped.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned log. ``case_id`` has ``category`` dtype and the index
        labels of the surviving rows are left untouched.

    Raises
    ------
    KeyError
        If any of the five expected columns is missing after renaming.
    """
    column_names = {
        "case:concept:name": "case_id",
        "time:timestamp": "timestamp",
        "concept:name": "activity",
        "org:resource": "resource",
        "remain_time": "remaining_time",
    }
    kept_columns = ["case_id", "activity", "timestamp", "resource", "remaining_time"]

    log = pd.read_csv(path).rename(columns=column_names)

    # Drop incomplete rows *before* casting to category: otherwise a case whose
    # events were all removed would linger as an unused category and could show
    # up as an empty group in later categorical group-bys.
    log = log.loc[:, kept_columns].dropna()
    log["case_id"] = log["case_id"].astype("category")
    return log


# Name of the column to predict in the regression task.
target = "remaining_time"

# The event log ships already divided into a train and a test file.
train = read_log("data/train.csv")
test = read_log("data/test.csv")

# Separate the features from the target for each partition.
X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]

## Define the time feature extraction pipeline

In [None]:
# Extract the `execution_time` and `accumulated_time` features from the
# timestamp column, then standardise them.
time_steps = [
    (
        "time",
        TimestampExtractor(
            case_col="case_id",
            time_col="timestamp",
            features=["execution_time", "accumulated_time"],
        ),
    ),
    ("scale", StandardScaler()),
]
time_transformer = Pipeline(steps=time_steps)

## One-hot encode for activities

In [None]:
# Dense one-hot encoding; categories not seen during fit are ignored
# rather than raising an error at transform time.
encoder_steps = [
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
cat_transformer = Pipeline(steps=encoder_steps)

## Putting everything together as a single preprocessing step

In [None]:
# Each entry is (name, transformer, input columns). Columns that are not
# listed in any entry are discarded (`remainder="drop"`).
feature_branches = [
    ("oh", cat_transformer, ["activity"]),
    ("time", time_transformer, ["case_id", "timestamp"]),
    ("resource", ResourcePoolExtractor(), ["activity", "resource"]),
    # Forward the case identifier unchanged next to the engineered features.
    ("case_id", "passthrough", ["case_id"]),
]

preprocessor = ColumnTransformer(
    transformers=feature_branches, remainder="drop"
).set_output(transform="pandas")

## Regression pipeline

NOTE: we have a concept drift issue, since the activity `'Request For Payment FINAL_APPROVED by BUDGET OWNER'` appears in the test set but was never seen in the train set. SKPM throws a warning and replaces it with a special token `UNK`.

In [None]:
# Preprocess the events, aggregate them per case with a mean, then fit a
# small random forest on the remaining-time target.
regression_steps = [
    ("preprocessor", preprocessor),
    ("aggregator", TraceAggregator(case_col="case_id", method="mean")),
    (
        "classifier",
        RandomForestRegressor(n_estimators=10, random_state=13, n_jobs=-1),
    ),
]
clf = Pipeline(steps=regression_steps).set_output(transform="pandas")

clf.fit(X_train, y_train)
# Score of the fitted pipeline on the held-out test set.
print(clf.score(X_test, y_test))

## Classification pipeline

Next activity prediction

In [None]:
# Same preprocessing and aggregation as the regression pipeline, with a
# random forest classifier as the final estimator.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("aggregator", TraceAggregator(case_col="case_id", method="mean")),
        ("classifier", RandomForestClassifier(n_estimators=10, random_state=13, n_jobs=-1))
    ]
).set_output(transform="pandas")


def next_activity(log, eos="EOS"):
    """Return, for every event in ``log``, the activity that follows it in the same case.

    The shift is done per ``case_id`` so that the last event of a case is
    labelled with ``eos`` instead of leaking the first activity of whichever
    case happens to come next in the log.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with ``case_id`` and ``activity`` columns.
    eos : str, default "EOS"
        Label assigned to the final event of each case.

    Returns
    -------
    pandas.Series
        Next-activity labels aligned with the index of ``log``.
    """
    # `observed=True` only concerns a categorical `case_id`: it restricts the
    # grouping to cases that are actually present.
    shifted = log.groupby("case_id", observed=True)["activity"].shift(-1)
    return shifted.fillna(eos)


# NOTE: `y_train` / `y_test` are rebound from the remaining-time targets to
# next-activity labels here; re-run the data-loading cell before going back
# to the regression pipeline.
y_train = next_activity(X_train)
y_test = next_activity(X_test)
clf.fit(X_train, y_train)
# Score of the fitted pipeline on the held-out test set.
print(clf.score(X_test, y_test))