In [None]:
import polars as pl
from sklearn.ensemble import RandomForestClassifier

import dsds.prescreen as ps
import dsds.utils as u
import dsds.transform as t
import dsds.encoders as enc
import dsds.sample as sa
import dsds.fs as fs
import dsds.blueprint as bp

# More Complicated Pipelines that are Beyond Sklearn

This is a more advanced pipeline. It contains filter and upsample steps that are persisted, which means they are remembered by the pipeline and will be
applied whenever the pipeline is applied to another dataframe. You can set `persist=False` so that these steps are not remembered. You can also append a model score to the dataframe. Because the score append can be an intermediate step, this enables pipelines in which the final model depends on previous model scores. Simple passthroughs, such as the logistic passthrough, are built in for you.

In [None]:
# `data` is Lazy because input is Lazy. All pipe functions work with LazyFrames
# Each `.pipe(...)` call below hands the frame to a dsds function; the order of
# the steps is the order in which they are chained here.
data = (
    # Lazily scan the CSV and sort by "id" before any pipeline step runs.
    pl.scan_csv("../data/advertising.csv").sort(by="id")
    # Presumably drops columns whose variance is below the threshold while
    # leaving the target alone -- confirm against dsds.prescreen.var_removal.
    .pipe(ps.var_removal, threshold = 0.5, target = "Clicked on Ad")
    # Row filter (Age > 30). persist=True keeps this step in the pipeline so it
    # is replayed when the pipeline is applied to another dataframe.
    .pipe(ps.filter, condition = pl.col("Age") > 30, persist=True)
    # Name suggests constant columns are removed -- confirm.
    .pipe(ps.constant_removal)
    # Upsample rows where One_Hot_Test != 'A' with count = 200; also persisted,
    # so it is replayed on new data as well.
    .pipe(sa.simple_upsample, subgroup = pl.col("One_Hot_Test") != 'A', count = 200, persist=True)
    # Encode columns, explicitly excluding the target from binary encoding.
    .pipe(enc.binary_encode, exclude = ["Clicked on Ad"])
    .pipe(enc.one_hot_encode, cols= ["One_Hot_Test"])
    # NOTE(review): "Daily Internet Usage" is imputed here with the median and
    # again on the next line with the mean. If the first call fills every null,
    # the second has nothing left to fill -- confirm this is intentional.
    .pipe(t.impute, cols=["Area Income", "Daily Internet Usage", "Daily Internet Usage Band"], strategy="median")
    .pipe(t.impute, cols=["Daily Internet Usage"], strategy = "mean")
    # Built-in logistic passthrough on one column. The rename at the end of the
    # chain suggests its output column is called "logistic_score" -- confirm.
    .pipe(u.logistic_passthrough, col = "Daily Internet Usage", coeff=0.5, const=-100)
    # Two-stage feature selection against the target: top_k = 12 with the
    # "fscore" strategy first, then top_k = 3 by mutual information.
    .pipe(fs.mrmr_selector, target = "Clicked on Ad", top_k = 12, strategy="fscore")
    .pipe(fs.mutual_info_selector, target = "Clicked on Ad", top_k = 3)
    # Remove these text/timestamp columns if they are still present.
    .pipe(ps.remove_if_exists, ["Ad Topic Line", "City", "Country", "Timestamp"])
    # Give the passthrough score a descriptive name; persisted so the rename is
    # replayed on new data too.
    .pipe(ps.rename, rename_dict={"logistic_score":"Daily Internet Usage Logistic Score"}, persist=True)
)


In [None]:
# Preview the first 5 rows; nothing is computed until .collect() is called.
data.limit(5).collect()

In [None]:
# Display the blueprint attached to the lazy frame (presumably the recorded
# list of pipeline steps built above -- confirm against the dsds docs).
data.blueprint

In [None]:
# Materialise the lazy pipeline into an eager DataFrame.
x = data.collect()
# drop_in_place removes the target column from `x` itself and returns it,
# so after this line `x` holds only the features and `y` the target.
y = x.drop_in_place("Clicked on Ad")

In [None]:
# Train and tune a model outside the data pipeline.
# random_state is pinned so that Restart & Run All rebuilds the same forest
# (and therefore the same appended scores) on every run.
rf = RandomForestClassifier(max_depth=4, n_estimators=30, n_jobs=-1, random_state=42)
# Fit on the three feature columns that are also passed to the score-append step.
rf.fit(x.select("Daily Internet Usage", "Daily Internet Usage Logistic Score", "Daily Time Spent on Site"), y)


In [None]:
# Append the already-trained model as the last step of the data pipeline.
# The feature list must match the columns the model was fitted on.
score_features = [
    "Daily Internet Usage",
    "Daily Internet Usage Logistic Score",
    "Daily Time Spent on Site",
]
final_df = data.pipe(u.append_classif_score, model=rf, features=score_features, target="Clicked on Ad")

In [None]:
# Show last 3 steps of the blueprint, which should now end with the
# appended model-score step.
final_df.blueprint.show(-3)

In [None]:
# Preview the first 5 rows of the scored frame; .collect() triggers execution.
final_df.limit(5).collect()

In [None]:
# The modelled pipeline can be pickled as long as the
# model can be pickled.
# Writes the blueprint to "modelled_pipe.pkl" (relative to the notebook's
# working directory) so it can be reloaded below.

final_df.blueprint.preserve("modelled_pipe.pkl")

In [None]:
# Reload the pipeline that was written to disk in the previous step.
# NOTE: this is a pickle file; unpickling can execute arbitrary code, so only
# load .pkl files that you created yourself or otherwise trust.
pipe = bp.from_pkl("modelled_pipe.pkl")
# Show what kind of object came back from the pickle.
type(pipe)

In [None]:
# Replay the reloaded pipeline on a fresh lazy scan of the raw CSV;
# collect=True asks for the result to be collected rather than left lazy.
processed_df = pipe.apply(pl.scan_csv("../data/advertising.csv"), collect=True)

In [None]:
# Display the result of applying the reloaded pipeline.
processed_df