# Description

Marked values are processed separately from a feature's ordinary values: they are treated like `NaN` values.

**Attention**

There are some restrictions on how marked values can be defined:

1) For a real (numeric) feature, the mark value must be a real number.
2) For a categorical feature, the mark value can be any value.

# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from autowoe import AutoWoE


# Prepare Dataset

In [2]:
# Load the demo training table; the path is relative to the notebook's working directory.
df = pd.read_csv("./data/train_demo.csv")

In [3]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0.1,Unnamed: 0,number_0,number_1,datetime_0,number_2,number_3,number_4,datetime_1,number_5,number_6,...,number_756,number_757,number_758,number_759,number_760,number_761,number_762,number_763,number_764,line_id
0,0,1.0,42.0,2016-05-20,0.0,0.0,38.0,2016-05-18,0.0,0.0,...,0.8,0.6,0.92,0.0,0.0,3.259135,1.0,,,2
1,1,1.0,62.0,2016-06-02,0.0,0.0,29.0,2016-06-01,0.0,0.0,...,-1.0,-1.0,1.0,0.0,0.0,6.906755,1.0,,,3
2,2,1.0,62.0,2016-09-07,0.0,0.0,48.0,2016-09-07,0.0,0.0,...,1.0,1.0,0.33,0.0,0.0,5.109978,1.0,,,6
3,3,1.0,57.0,2016-06-02,0.0,0.0,30.0,2016-06-01,0.0,0.0,...,0.75,0.5,0.82,1.942212,1.0,5.806138,1.0,,,8
4,4,1.0,37.0,2016-02-27,0.0,0.0,48.0,2016-02-27,1.0,0.0,...,-1.0,-1.0,1.0,1.539603,1.0,1.543687,1.0,,,9


In [4]:
# Name of the label column; reused for column selection, stratified splitting and fitting.
TARGET_NAME = "target"

In [5]:
# Keep a small demo subset: at most the first 10 numeric and first 5 string columns.
num_features = [col for col in df.columns if col.startswith('number')][:10]
cat_features = [col for col in df.columns if col.startswith('string')][:5]

# Explicit .copy(): the following cell assigns into this frame in place, and pandas
# can emit SettingWithCopyWarning for writes into a frame derived by column selection.
df = df[num_features + cat_features + [TARGET_NAME]].copy()

# Create some synthetic "marked" values


In [6]:
# Resolve column positions from their names instead of hard-coding 0 / 1 / 11:
# the positional indices only line up while exactly ten numeric columns precede
# the string columns, whereas the names always match the keys passed to
# `features_mark_values` when the model is fitted.
number_0_pos = df.columns.get_loc("number_0")
number_1_pos = df.columns.get_loc("number_1")
string_1_pos = df.columns.get_loc("string_1")

df.iloc[:10, number_0_pos] = -1          # first mark value for number_0
df.iloc[10:20, number_0_pos] = -2        # second mark value for number_0
df.iloc[:20, number_1_pos] = 1234567890  # single mark value for number_1
df.iloc[:20, string_1_pos] = 'Special'   # mark value for the categorical string_1


In [7]:
# Preview the reduced table to confirm the marked values landed in the first rows.
df.head()

Unnamed: 0,number_0,number_1,number_2,number_3,number_4,number_5,number_6,number_7,number_8,number_9,string_0,string_1,string_2,string_3,target
0,-1.0,1234568000.0,0.0,0.0,38.0,0.0,0.0,0.0,38.0,25733.0,,Special,N,,0.0
1,-1.0,1234568000.0,0.0,0.0,29.0,0.0,0.0,0.0,29.0,16997.0,,Special,N,,1.0
2,-1.0,1234568000.0,0.0,0.0,48.0,0.0,0.0,0.0,48.0,-2.0,,Special,N,,0.0
3,-1.0,1234568000.0,0.0,0.0,30.0,0.0,0.0,0.0,30.0,118331.0,,Special,N,,0.0
4,-1.0,1234568000.0,0.0,0.0,48.0,1.0,0.0,0.0,48.0,68767.0,,Special,N,,0.0


In [8]:
# Hold out 40% of the rows for testing. Stratifying on the target keeps the
# positive rate the same in both parts; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.4,
    stratify=df[TARGET_NAME],
    shuffle=True,
    random_state=42,
)

In [9]:
# Positive-class rate in the train and test parts, in that order.
tuple(part[TARGET_NAME].mean() for part in (train_df, test_df))

(0.010714285714285714, 0.010714285714285714)

# Train model

In [10]:


# Build the model for the "BIN" task with a single job and verbosity level 0.
autowoe = AutoWoE(task="BIN", n_jobs=1, verbose=0)

# Feature name -> tuple of mark values for that feature. Real features get
# numeric marks, the categorical feature gets a string mark.
mark_values = {
    "number_0": (-1, -2),
    "number_1": (1234567890,),
    "string_1": ("Special",),
}

autowoe.fit(
    train=train_df,
    target_name=TARGET_NAME,
    features_mark_values=mark_values,
)

[LightGBM] [Info] Number of positive: 34, number of negative: 3318
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 482
[LightGBM] [Info] Number of data points in the train set: 3352, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010143 -> initscore=-4.580757
[LightGBM] [Info] Start training from score -4.580757


In [11]:
# Score the held-out rows with the fitted model.
test_pred = autowoe.predict_proba(test_df)

In [12]:
# Compare the held-out labels with the predicted scores and report ROC AUC.
test_auc = roc_auc_score(test_df[TARGET_NAME], test_pred)
print("Test ROC_AUC  = {:.3f}".format(test_auc))

Test ROC_AUC  = 0.602


# Example of SQL query

In [13]:
# Ask the fitted model for its SQL inference query and print it.
# NOTE(review): "FEATURE_TABLE" is presumably the name of the table holding the raw features — confirm.
query = autowoe.get_sql_inference_query("FEATURE_TABLE")
print(query)

SELECT
  1 / (1 + EXP(-(
    -4.548
    -0.991*WOE_TAB.number_9
    -0.89*WOE_TAB.number_8
    -0.365*WOE_TAB.number_4
    -1.152*WOE_TAB.string_1
    -0.549*WOE_TAB.number_1
  ))) as PROB,
  WOE_TAB.*
FROM 
    (SELECT
    CASE
      WHEN (number_9 IS NULL OR number_9 = 'NaN') THEN -0.876
      WHEN number_9 <= 7072.0 THEN -0.574
      WHEN number_9 <= 11699.5 THEN 1.035
      WHEN number_9 <= 13292.5 THEN -1.732
      ELSE 0.558
    END AS number_9,
    CASE
      WHEN (number_8 IS NULL OR number_8 = 'NaN') THEN 0
      WHEN number_8 <= 31.5 THEN 0.331
      WHEN number_8 <= 33.5 THEN -1.872
      WHEN number_8 <= 38.5 THEN 1.335
      WHEN number_8 <= 74.5 THEN -0.322
      ELSE 1.144
    END AS number_8,
    CASE
      WHEN (number_4 IS NULL OR number_4 = 'NaN') THEN 0
      WHEN number_4 <= 53.5 THEN 0.145
      WHEN number_4 <= 75.0 THEN -0.463
      ELSE 1.156
    END AS number_4,
    CASE
      WHEN string_1 == 'other' THEN 0.122
      WHEN string_1 == 'living in city in apart'

# Example of model representation

In [14]:
# NOTE(review): the method is spelled `get_model_represenation` (sic) and the rendered
# output of this notebook shows the call succeeding under that name; check the
# installed autowoe before "correcting" the spelling.
representation = autowoe.get_model_represenation()

# Split the returned mapping into the intercept and a per-feature table.
intercept = representation['intercept']
features_representation = pd.DataFrame(representation['features'])

In [15]:
# Display the per-feature table (one column per selected feature).
features_representation

Unnamed: 0,number_9,number_8,number_4,string_1,number_1
f_type,real,real,real,cat,real
splits,"[7072.0, 11699.5, 13292.5]","[31.5, 33.5, 38.5, 74.5]","[53.5, 75.0]",,[21.5]
cod_dict,"{0: -0.574161, 1: 1.035276, 2: -1.732197, 3: 0...","{0: 0.330877, 1: -1.872163, 2: 1.335381, 3: -0...","{0: 0.144872, 1: -0.463322, 2: 1.156472}","{0: 0.121653, 1: -0.094148}","{0: -1.692192, 1: 0.054447}"
weight,-0.991122,-0.889965,-0.364501,-1.15229,-0.549001
nan_value,__NaN__,__NaN_0__,__NaN_0__,__NaN_0__,__NaN_0__
spec_cod,{'__NaN__': -0.875746},{'__NaN_0__': 0.0},{'__NaN_0__': 0.0},"{'__Mark_0__': 0.0, '__NaN_0__': 0.0, '__Small...","{'__Mark_0__': 0.0, '__NaN_0__': 0.0}"
cat_map,,,,"{'__Mark_0__': 0, 'living in city in apart': 1...",
spec_cat,,,,"({living in city in apart, other}, __Small_0__)",
mark_values,,,,"(Special,)","(1234567890,)"
mark_encoding,,,,{'Special': '__Mark_0__'},{1234567890: '__Mark_0__'}
