In [1]:
import polars as pl
import polars.selectors as cs
from polars_ds.pipeline import Pipeline, Blueprint

# Builtin Pipeline Functions

To run this demo: use the latest version of polars_ds

In [2]:
# Load the example dataset from a parquet file and preview the first rows
df = pl.read_parquet("../examples/dependency.parquet")
df.head()

ID,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Code,Employer_Category1,Employer_Category2,Monthly_Income,Customer_Existing_Primary_Bank_Code,Primary_Bank_Type,Contacted,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved
str,str,str,str,str,str,str,str,i64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,i64,i64
"""APPC90493171225""","""Female""","""23/07/79""","""15/07/16""","""C10001""","""A""","""COM0044082""","""A""",4,2000.0,"""B001""","""P""","""N""","""S122""","""G""",0.0,,,,,0,0
"""APPD40611263344""","""Male""","""07/12/86""","""04/07/16""","""C10003""","""A""","""COM0000002""","""C""",1,3500.0,"""B002""","""P""","""Y""","""S122""","""G""",0.0,20000.0,2.0,13.25,953.0,10,0
"""APPE70289249423""","""Male""","""10/12/82""","""19/07/16""","""C10125""","""C""","""COM0005267""","""C""",4,2250.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,45000.0,4.0,,,0,0
"""APPF80273865537""","""Male""","""30/01/89""","""09/07/16""","""C10477""","""C""","""COM0004143""","""A""",4,3500.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,92000.0,5.0,,,7,0
"""APPG60994436641""","""Male""","""19/04/85""","""20/07/16""","""C10002""","""A""","""COM0001781""","""A""",4,10000.0,"""B001""","""P""","""Y""","""S134""","""B""",2500.0,50000.0,2.0,,,10,0


In [3]:
# Create a blueprint first.
# A blueprint is a plan for a pipeline. No hard work will be done until the blueprint is materialized, which
# is when the transforms are fitted (e.g. scale learns the mean and std from base data).
# If a target is specified for the blueprint, it will be excluded from all transformations that require a fit,
# and it will be auto-filled whenever a transformation requires a target field and none is explicitly given.

bp = (
    Blueprint(df, name = "example", target = "approved") # You can optionally put target of the ML model here
    .lowercase() # lowercase all columns
    # Select only the columns we need
    .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"]))
    # Explicitly pass target here, because loan_period is not the blueprint's prediction target.
    # Use a linear regression with x1 = var1, x2 = existing_emi to predict missing values in loan_period
    .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") 
    .impute(["existing_emi"], method = "median")
    .append_expr( # generate some features
        pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
        pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
        pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"),
        # Any kind of shift transform works here. NOTE: shift(-1) pulls the value from the NEXT row
        # (a lead), even though the alias says "lag"; a positive shift would give a true lag.
        pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1")
    )
    .scale( # target is numerical, but will be excluded automatically because bp is initialized with a target
        cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard"
    ) # Scale the columns up to this point. The columns below won't be scaled
    .append_expr(
        # Add missing flags
        pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing")
    )
    .one_hot_encode("gender", drop_first=True)
    .woe_encode("city_category") # No need to specify target because we initialized bp with a target
    .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above
)

# Print a text summary of the blueprint
print(bp)

Blueprint name: example
Blueprint current steps: 11
Features Expected: ['ID', 'Gender', 'DOB', 'Lead_Creation_Date', 'City_Code', 'City_Category', 'Employer_Code', 'Employer_Category1', 'Employer_Category2', 'Monthly_Income', 'Customer_Existing_Primary_Bank_Code', 'Primary_Bank_Type', 'Contacted', 'Source', 'Source_Category', 'Existing_EMI', 'Loan_Amount', 'Loan_Period', 'Interest_Rate', 'EMI', 'Var1', 'Approved']



In [4]:
# Materialize the blueprint into a Pipeline; this is the step where the transforms are fitted
pipe:Pipeline = bp.materialize()
# Text representation of the pipeline
pipe

Naive Query Steps: 

Step 1:
col("ID").alias("id"),
col("Gender").alias("gender"),
col("DOB").alias("dob"),
col("Lead_Creation_Date").alias("lead_creation_date"),
col("City_Code").alias("city_code"),
col("City_Category").alias("city_category"),
col("Employer_Code").alias("employer_code"),
col("Employer_Category1").alias("employer_category1"),
col("Employer_Category2").alias("employer_category2"),
col("Monthly_Income").alias("monthly_income"),
col("Customer_Existing_Primary_Bank_Code").alias("customer_existing_primary_bank_code"),
col("Primary_Bank_Type").alias("primary_bank_type"),
col("Contacted").alias("contacted"),
col("Source").alias("source"),
col("Source_Category").alias("source_category"),
col("Existing_EMI").alias("existing_emi"),
col("Loan_Amount").alias("loan_amount"),
col("Loan_Period").alias("loan_period"),
col("Interest_Rate").alias("interest_rate"),
col("EMI").alias("emi"),
col("Var1").alias("var1"),
col("Approved").alias("approved")

Step 2:
selector

Step 3:
.when(col("

In [5]:
# Apply the materialized pipeline to df and preview the first rows of the result
df_transformed = pipe.transform(df)
df_transformed.head()

employer_category2,monthly_income,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,employer_category1,city_category,existing_emi_log1p,loan_amount_log1p,loan_amount_sqrt,loan_amount_lag_1,employer_category1_is_missing,gender_Male
f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,u8,u8
0.346575,-0.020726,-0.157655,,-1.250145,,,0,0,0.010829,0.12492,0.0,,,-0.632338,0,0
-3.369204,-0.012144,-0.157655,-0.632338,-0.283462,-1.019936,-0.197259,10,0,0.021114,0.12492,0.0,-0.586105,-0.658025,0.181273,0,1
0.346575,-0.019296,-0.157655,0.181273,0.68322,,,0,0,0.021114,-0.597055,0.0,0.537137,0.375948,1.710861,0,1
0.346575,-0.012144,-0.157655,1.710861,1.166561,,,7,0,0.010829,-0.597055,0.0,1.527696,1.709278,0.343995,0,1
0.346575,0.025049,0.935153,0.343995,-0.283462,,,10,0,0.010829,0.12492,7.824446,0.683076,0.543738,,0,1


# Serialization Methods

Pickle + JSON support.

In [6]:
import pickle
# The pipe object can be pickled; this writes pipe.pickle relative to the current working directory
with open("pipe.pickle", "wb") as f:
    pickle.dump(pipe, f)

In [7]:
# Read the pipeline back from pipe.pickle.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files from a trusted source.
with open("pipe.pickle", "rb") as f:
    pipe2 = pickle.load(f)

# Text representation of the unpickled pipeline
pipe2

Naive Query Steps: 

Step 1:
col("ID").alias("id"),
col("Gender").alias("gender"),
col("DOB").alias("dob"),
col("Lead_Creation_Date").alias("lead_creation_date"),
col("City_Code").alias("city_code"),
col("City_Category").alias("city_category"),
col("Employer_Code").alias("employer_code"),
col("Employer_Category1").alias("employer_category1"),
col("Employer_Category2").alias("employer_category2"),
col("Monthly_Income").alias("monthly_income"),
col("Customer_Existing_Primary_Bank_Code").alias("customer_existing_primary_bank_code"),
col("Primary_Bank_Type").alias("primary_bank_type"),
col("Contacted").alias("contacted"),
col("Source").alias("source"),
col("Source_Category").alias("source_category"),
col("Existing_EMI").alias("existing_emi"),
col("Loan_Amount").alias("loan_amount"),
col("Loan_Period").alias("loan_period"),
col("Interest_Rate").alias("interest_rate"),
col("EMI").alias("emi"),
col("Var1").alias("var1"),
col("Approved").alias("approved")

Step 2:
selector

Step 3:
.when(col("

In [8]:
# Transform the same df with the unpickled pipeline
df_transformed_2 = pipe2.transform(df)
df_transformed_2

employer_category2,monthly_income,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,employer_category1,city_category,existing_emi_log1p,loan_amount_log1p,loan_amount_sqrt,loan_amount_lag_1,employer_category1_is_missing,gender_Male
f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,u8,u8
0.346575,-0.020726,-0.157655,,-1.250145,,,0,0,0.010829,0.12492,0.0,,,-0.632338,0,0
-3.369204,-0.012144,-0.157655,-0.632338,-0.283462,-1.019936,-0.197259,10,0,0.021114,0.12492,0.0,-0.586105,-0.658025,0.181273,0,1
0.346575,-0.019296,-0.157655,0.181273,0.68322,,,0,0,0.021114,-0.597055,0.0,0.537137,0.375948,1.710861,0,1
0.346575,-0.012144,-0.157655,1.710861,1.166561,,,7,0,0.010829,-0.597055,0.0,1.527696,1.709278,0.343995,0,1
0.346575,0.025049,0.935153,0.343995,-0.283462,,,10,0,0.010829,0.12492,7.824446,0.683076,0.543738,,0,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
-3.369204,-0.004133,-0.157655,,1.213977,,,10,0,0.010829,0.12492,0.0,,,,0,0
0.346575,0.008971,0.476174,,0.469411,,,7,0,0.010829,-0.597055,7.280008,,,-0.50216,0,0
0.346575,-0.023015,-0.157655,-0.50216,0.68322,2.785431,-0.210546,2,0,0.021114,-0.081991,0.0,-0.333569,-0.46065,1.320328,0,0
-0.892018,0.024437,0.439455,1.320328,1.166561,,,10,0,0.021114,0.12492,7.220374,1.334103,1.40992,0.962339,0,1


In [9]:
from polars.testing import assert_frame_equal
# assert_frame_equal returns nothing on success and raises an AssertionError if the frames differ,
# so this cell producing no output means the pickled pipeline reproduces the original result.
assert_frame_equal(df_transformed, df_transformed_2)

In [10]:
# Save the pipeline as JSON, then load it back

pipe.to_json("test.json")
pipe3 = Pipeline.from_json("test.json")
# Passes silently (no output) when the JSON round-tripped pipeline yields the same frame as the original
assert_frame_equal(df_transformed, pipe3.transform(df))

# Custom Transformations in Pipeline

Requires polars_ds version >= v0.4.6 (not released yet).

In [11]:
# Reload the example dataset from the parquet file and preview the first rows
df = pl.read_parquet("../examples/dependency.parquet")
df.head()

ID,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Code,Employer_Category1,Employer_Category2,Monthly_Income,Customer_Existing_Primary_Bank_Code,Primary_Bank_Type,Contacted,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved
str,str,str,str,str,str,str,str,i64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,i64,i64
"""APPC90493171225""","""Female""","""23/07/79""","""15/07/16""","""C10001""","""A""","""COM0044082""","""A""",4,2000.0,"""B001""","""P""","""N""","""S122""","""G""",0.0,,,,,0,0
"""APPD40611263344""","""Male""","""07/12/86""","""04/07/16""","""C10003""","""A""","""COM0000002""","""C""",1,3500.0,"""B002""","""P""","""Y""","""S122""","""G""",0.0,20000.0,2.0,13.25,953.0,10,0
"""APPE70289249423""","""Male""","""10/12/82""","""19/07/16""","""C10125""","""C""","""COM0005267""","""C""",4,2250.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,45000.0,4.0,,,0,0
"""APPF80273865537""","""Male""","""30/01/89""","""09/07/16""","""C10477""","""C""","""COM0004143""","""A""",4,3500.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,92000.0,5.0,,,7,0
"""APPG60994436641""","""Male""","""19/04/85""","""20/07/16""","""C10002""","""A""","""COM0001781""","""A""",4,10000.0,"""B001""","""P""","""Y""","""S134""","""B""",2500.0,50000.0,2.0,,,10,0


In [12]:
from typing import Union, List

# Any custom function must satisfy the following function signature:
# func(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str], ...) -> List[pl.Expr]
# where ... means kwargs
# Here is a custom imputer

def smallest_abs_impute(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str], epsilon:float = 0.01) -> List[pl.Expr]:
    """
    Build one fill-null expression per column in `cols`.

    The fill value for each column is the minimum of that column's absolute
    values, computed from `df`, plus `epsilon`.

    Parameters
    ----------
    df
        Frame (eager or lazy) used to compute the fill values.
    cols
        Names of the columns to impute.
    epsilon
        Constant added to each column's minimum absolute value.

    Returns
    -------
    List[pl.Expr]
        Expressions of the form `pl.col(c).fill_null(value)`, in the order of `cols`.
    """
    # Single-row result: entry i is min(|cols[i]|) + epsilon.
    fill_values = (
        df.lazy()
        .select(pl.col(cols).abs().min() + epsilon)
        .collect()
        .row(0)
    )
    exprs = []
    for name, value in zip(cols, fill_values):
        exprs.append(pl.col(name).fill_null(value))
    return exprs


In [13]:
# NOTE: this rebinds bp and pipe, replacing the blueprint/pipeline built in the first section.
bp = (
    Blueprint(df, name = "example", target = "approved")
    .lowercase() # lowercase all columns
    # Use append_fit_func for custom transforms; extra keyword arguments (epsilon) are passed to the function
    .append_fit_func(smallest_abs_impute, ["var1", "existing_emi", "loan_amount"], epsilon = 0.5)
)
# Notice that the imputed value is 0.5: the minimum absolute value of the columns is 0, plus epsilon = 0.5.
pipe:Pipeline = bp.materialize()
pipe

Naive Query Steps: 

Step 1:
col("ID").alias("id"),
col("Gender").alias("gender"),
col("DOB").alias("dob"),
col("Lead_Creation_Date").alias("lead_creation_date"),
col("City_Code").alias("city_code"),
col("City_Category").alias("city_category"),
col("Employer_Code").alias("employer_code"),
col("Employer_Category1").alias("employer_category1"),
col("Employer_Category2").alias("employer_category2"),
col("Monthly_Income").alias("monthly_income"),
col("Customer_Existing_Primary_Bank_Code").alias("customer_existing_primary_bank_code"),
col("Primary_Bank_Type").alias("primary_bank_type"),
col("Contacted").alias("contacted"),
col("Source").alias("source"),
col("Source_Category").alias("source_category"),
col("Existing_EMI").alias("existing_emi"),
col("Loan_Amount").alias("loan_amount"),
col("Loan_Period").alias("loan_period"),
col("Interest_Rate").alias("interest_rate"),
col("EMI").alias("emi"),
col("Var1").alias("var1"),
col("Approved").alias("approved")

Step 2:
col("var1").fill_null([dyn fl

In [14]:
# Count the nulls remaining in the three imputed columns after the transform
pipe.transform(df).null_count().select(["var1", "existing_emi", "loan_amount"])

var1,existing_emi,loan_amount
u32,u32,u32
0,0,0
