# Imports

In [1]:
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from autowoe import AutoWoE

In [2]:
import autowoe

autowoe.__version__

'1.3.0'

# Prepare Dataset

In [3]:
df = pd.read_csv("./data/regression_dataset.csv")

In [4]:
TARGET_NAME = "Target"

In [5]:
train_df, test_df = train_test_split(df, test_size=0.4, random_state=42, shuffle=True)

In [6]:
train_df[TARGET_NAME].mean(), test_df[TARGET_NAME].mean()

(7.414631748745448, 7.417039712211102)

# Train model

In [7]:
autowoe = AutoWoE(
    task="REG", monotonic=True, interpreted_model=True, regularized_refit=True, metric_th=0.0, n_jobs=1, verbose=0
)

autowoe.fit(train=train_df, target_name=TARGET_NAME)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1542
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 56
[LightGBM] [Info] Start training from score -0.039910


In [8]:
train_pred = autowoe.predict(train_df)
test_pred = autowoe.predict(test_df)

In [9]:
train_pred = autowoe.predict(train_df)

In [10]:
train_df

Unnamed: 0,Cat_0,Cat_1,Cat_10,Cat_11,Cat_12,Cat_13,Cat_14,Cat_15,Cat_16,Cat_17,...,Real_28,Real_3,Real_4,Real_5,Real_6,Real_7,Real_8,Real_9,Target,Target.1
24,1,-1,11,13,2,4,2,2,3,0,...,0.75,0.000000,0.666667,0.428571,0.816667,0.099220,0.000000,0.372860,7.560080,7.560080
467,1,-1,11,13,1,4,2,2,3,3,...,0.75,0.588235,0.666667,0.428571,0.933333,0.002835,0.000000,0.569349,7.501082,7.501082
539,1,-1,12,14,1,4,1,3,3,1,...,0.75,0.352941,0.555556,0.428571,0.200000,0.119064,0.468114,0.000000,7.473069,7.473069
531,1,-1,11,13,3,4,2,0,3,2,...,0.75,0.000000,0.777778,0.428571,0.966667,0.000000,0.000000,0.875856,8.222822,8.222822
618,1,-1,8,10,2,4,1,3,3,3,...,1.00,0.000000,0.555556,0.714286,0.933333,0.173990,0.000000,0.076627,7.588324,7.588324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1,-1,8,10,2,4,1,3,3,3,...,0.75,0.000000,0.555556,0.857143,0.916667,0.077782,0.320217,0.049229,7.352441,7.352441
270,1,-1,11,13,-1,4,2,0,1,1,...,0.00,0.000000,1.000000,0.428571,0.933333,0.245571,0.000000,0.295377,8.444622,8.444622
860,1,-1,8,10,2,4,2,3,1,1,...,0.75,0.235294,0.444444,0.428571,0.283333,0.044118,0.518318,0.000000,7.522941,7.522941
435,1,-1,4,5,-1,4,2,2,3,3,...,0.50,0.235294,0.666667,0.571429,0.950000,0.000000,0.000000,0.348031,7.630219,7.630219


In [11]:
print(f"Train R2 = {r2_score(train_df[TARGET_NAME], train_pred):.3f}")
print(f"Test R2  = {r2_score(test_df[TARGET_NAME], test_pred):.3f}")

Train R2 = 0.827
Test R2  = 0.793


# Example of SQL-query

In [12]:
query = autowoe.get_sql_inference_query("FEATURE_TABLE")
print(query)

SELECT
 ( 7.415 * ( 
    0.0
    +0.275*WOE_TAB.Target.1
    +0.175*WOE_TAB.Real_4
    +0.176*WOE_TAB.Real_13
    +0.099*WOE_TAB.Real_2
    +0.027*WOE_TAB.Real_19
    +0.257*WOE_TAB.Real_10
    +0.098*WOE_TAB.Real_6
    +0.125*WOE_TAB.Real_18
    +0.063*WOE_TAB.Real_21
    +0.095*WOE_TAB.Real_1
    +0.305*WOE_TAB.Real_7
    +0.281*WOE_TAB.Real_11
    +0.193*WOE_TAB.Real_0
    +0.33*WOE_TAB.Cat_21
    +0.183*WOE_TAB.Cat_29
    +0.163*WOE_TAB.Real_20
    +0.248*WOE_TAB.Cat_33
    +0.0*WOE_TAB.Cat_22
    +0.015*WOE_TAB.Real_22
    +0.129*WOE_TAB.Cat_13
    +0.338*WOE_TAB.Cat_5
    +0.37*WOE_TAB.Real_24
    +0.462*WOE_TAB.Cat_24
) + 0.411
 ) as PROB,
  WOE_TAB.*
FROM 
    (SELECT
    CASE
      WHEN (Target.1 IS NULL OR Target.1 = 'NaN') THEN 0
      WHEN Target.1 <= 7.42207 THEN -0.765
      ELSE 0.802
    END AS Target.1,
    CASE
      WHEN (Real_4 IS NULL OR Real_4 = 'NaN') THEN 0
      WHEN Real_4 <= 0.61111 THEN -0.55
      ELSE 0.866
    END AS Real_4,
    CASE
      WHEN (Real_13 I

In [13]:
list(range(0, 0, -1))

[]

In [14]:
import sklearn

sklearn.__version__

'1.0.2'