# Feature binning and evaluation

In [2]:
from typing import List

import re

import numpy as np

import pandas as pd

from utils.features import find_bin_breaks, bin_numeric_feature

In [3]:
# Load the training set; the "block_time" column is parsed as datetimes on read.
train_data = pd.read_csv(
    "data/train_data.csv",
    parse_dates=["block_time"],
)

In [4]:
# Keep only the columns whose name carries the "_log_to_median_ratio" suffix.
features = [
    column
    for column in train_data.columns
    if column.endswith("_log_to_median_ratio")
]

In [5]:
# Map each selected feature name to the bin breaks found for its column.
feature_breaks = {
    feature_name: find_bin_breaks(train_data[feature_name])
    for feature_name in features
}

In [7]:
binned_data = [bin_numeric_feature(train_data[f], feature_breaks[f]) for f in features]

In [9]:
binned_data = pd.concat(binned_data, axis=1)

In [22]:
def get_breaks_for_bin(bin_name, feature_breaks):
    """Return the ``(lower, upper)`` break pair for a binned column.

    Args:
        bin_name: Name of a binned column, either ``<feature>_bin<i>`` or a
            name ending in ``_nan``.
        feature_breaks: Mapping from feature name to its sequence of break
            points. Any iterable works (list, tuple, array, ...).

    Returns:
        A tuple ``(lower, upper)`` taken from the break sequence padded with
        ``None`` on both sides, so bin 0 has ``None`` as its lower bound and
        the last bin has ``None`` as its upper bound. Columns ending in
        ``_nan`` yield ``(None, None)``.

    Raises:
        IndexError: If ``bin_name`` contains no ``_bin<i>`` part, or ``i``
            points past the last bin.
        KeyError: If the parsed feature name is not in ``feature_breaks``.
    """
    if bin_name.endswith("_nan"):
        return (None, None)
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    # The greedy "(.+)" keeps any earlier "_bin<k>" as part of the feature name.
    feature, i = re.findall(r"(.+)_bin(\d+)", bin_name)[0]
    i = int(i)
    # list(...) lets tuples or arrays of breaks be padded as well; a bare
    # "[None] + breaks" only works when the breaks are already a list.
    breaks = [None] + list(feature_breaks[feature]) + [None]
    return (breaks[i], breaks[i + 1])

In [37]:
# Score every bin: compare the share of target rows (label == 1) falling in the
# bin with the share of reference rows (label == 0) falling in it.
bin_names = binned_data.columns
in_tgt = train_data["label"] == 1
in_ref = train_data["label"] == 0
n_tgt = in_tgt.sum()
# Count reference rows directly so the denominator matches in_ref even when
# some labels are missing or are neither 0 nor 1.
n_ref = in_ref.sum()

# Rows are collected in their own list and converted to a DataFrame once at
# the end, so the final name is never bound to two different kinds of object.
score_rows = []
for x in bin_names:
    # Strip only a trailing "_bin<i>" / "_nan" so that the same text occurring
    # inside a feature name is left alone (consistent with get_breaks_for_bin).
    f = re.sub(r"(_bin\d+|_nan)$", "", x)
    # Escape the feature name: it is data, not a regular expression.
    b = re.sub("^" + re.escape(f) + "_", "", x)
    breaks = get_breaks_for_bin(x, feature_breaks)
    in_bin = binned_data[x] == 1
    tgt_cnt = (in_bin & in_tgt).sum()
    ref_cnt = (in_bin & in_ref).sum()
    tgt_frac = tgt_cnt / n_tgt
    ref_frac = ref_cnt / n_ref
    # The ratio stays at 1 when no reference row falls in the bin, which
    # avoids dividing by zero.
    ratio = 1
    if ref_frac > 0:
        ratio = tgt_frac / ref_frac
    score_rows.append(
        {
            "feature": f,
            "bin_name": b,
            "from": breaks[0],
            "to": breaks[1],
            "ref_cnt": ref_cnt,
            "ref_frac": ref_frac,
            "tgt_cnt": tgt_cnt,
            "tgt_frac": tgt_frac,
            "ratio": ratio,
        }
    )
binned_feature_scores = pd.DataFrame(score_rows)

In [39]:
# Show the ten bins with the highest target-to-reference ratio.
(
    binned_feature_scores
    .sort_values(by="ratio", ascending=False)
    .head(n=10)
)

Unnamed: 0,feature,bin_name,from,to,ref_cnt,ref_frac,tgt_cnt,tgt_frac,ratio
234,snd_rcv_mean_time_diff_sec_tx_min_log_to_media...,bin0,,-3.047716,20055,0.0388,1183,0.707959,18.246323
240,snd_rcv_mean_time_diff_sec_tx_max_log_to_media...,bin0,,-3.159457,19454,0.037637,1139,0.681628,18.110402
246,snd_rcv_mean_time_diff_sec_tx_median_log_to_me...,bin0,,-3.126267,19527,0.037779,1136,0.679832,17.995175
252,snd_rcv_mean_time_diff_sec_tx_mean_log_to_medi...,bin0,,-3.070753,20095,0.038877,1138,0.681029,17.517315
136,snd_rcv_mean_amt_usd_tx_max_log_to_median_ratio,bin4,4.756586,,21022,0.040671,1034,0.618791,15.214574
160,snd_rcv_mean_amt_usd_tx_sum_log_to_median_ratio,bin4,4.661835,,30617,0.059234,1295,0.774985,13.083394
130,snd_rcv_mean_amt_usd_tx_min_log_to_median_ratio,bin4,4.082629,,27835,0.053852,1134,0.678636,12.601873
118,snd_rcv_amt_usd_sum_tx_std_log_to_median_ratio,bin4,1.888712,,993,0.001921,40,0.023938,12.460172
52,snd_rcv_tkn_type_cnt_tx_sum_log_to_median_ratio,bin4,2.220895,,20994,0.040617,835,0.499701,12.302817
40,snd_rcv_tkn_type_cnt_tx_mean_log_to_median_ratio,bin4,2.218967,,20980,0.04059,821,0.491323,12.104614


In [40]:
# Show the ten bins with the lowest target-to-reference ratio.
(
    binned_feature_scores
    .sort_values(by="ratio")
    .head(n=10)
)

Unnamed: 0,feature,bin_name,from,to,ref_cnt,ref_frac,tgt_cnt,tgt_frac,ratio
81,snd_rcv_tx_cnt_tx_std_log_to_median_ratio,bin3,0.652876,1.155059,17450,0.03376,0,0.0,0.0
43,snd_rcv_tkn_type_cnt_tx_std_log_to_median_ratio,bin1,-0.271993,-0.09775,333,0.000644,0,0.0,0.0
222,snd_rcv_life_time_sec_tx_std_log_to_median_ratio,bin0,,-3.85127,3168,0.006129,0,0.0,0.0
311,internal_tx_cnt_log_to_median_ratio,bin3,0.191496,0.316261,23,4.4e-05,0,0.0,0.0
145,snd_rcv_mean_amt_usd_tx_mean_log_to_median_ratio,bin1,-3.163067,-0.859914,85680,0.165764,5,0.002992,0.018051
127,snd_rcv_mean_amt_usd_tx_min_log_to_median_ratio,bin1,-3.177732,-0.80978,95082,0.183954,6,0.003591,0.019519
139,snd_rcv_mean_amt_usd_tx_median_log_to_median_r...,bin1,-3.261873,-0.932411,83528,0.1616,6,0.003591,0.022219
144,snd_rcv_mean_amt_usd_tx_mean_log_to_median_ratio,bin0,,-3.163067,38932,0.075321,4,0.002394,0.031781
138,snd_rcv_mean_amt_usd_tx_median_log_to_median_r...,bin0,,-3.261873,36778,0.071154,4,0.002394,0.033642
132,snd_rcv_mean_amt_usd_tx_max_log_to_median_ratio,bin0,,-2.045945,70829,0.137032,8,0.004788,0.034938


In [41]:
# Write the per-bin scores to disk, leaving out the row index.
binned_feature_scores.to_csv(
    "binned_features.csv",
    index=False,
)