In [71]:
import sys, json, os, ast
import numpy as np
import pandas as pd
# NOTE: this import shadows the builtin open() for every cell below.
from smart_open import open
from tqdm import tqdm

# Add the directory two levels up to the import path so the `src.*` imports
# below can be resolved (presumably the repository root -- TODO confirm).
sys.path.insert(1, "../..")
from src.logger import make_logger
from src.dataloader import TabularDataloader
from src.Trainer import LGBMTrainer, TFTrainer

# new modules
from _utils.sample_weights import get_sample_weight

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [72]:
# Read the experiment configuration from disk and fix the random seed that is
# passed to the data loader further down.
with open("config.json", "r") as config_file:
    config = json.loads(config_file.read())

seed = 42
display(config.keys())

dict_keys(['data', 'meta', 'data_columns', 'model_params', 'model_features', 'impute_vals', 'monotone'])

### Load Data
---
* The data loaded here has already been sampled.

In [73]:
# Show which cleaned datasets are registered in the config.
clean_datasets = config["data"]["clean"]
display(clean_datasets.keys())

dict_keys(['all_features_dev1', 'all_features_dev2', 'all_features_oot1', 'all_features_oot2', 'subset_dev1', 'subset_dev2'])

In [74]:
# Data dictionary table; it is handed to Preprocess in the preprocessing section.
exp_dict_path = config["meta"]["exp_dict_path"]
exp_dict = pd.read_csv(exp_dict_path)

In [75]:
# Point the loader at the `subset_dev1` dataset and load it together with a
# 10,000-row debug sample drawn with the notebook-wide seed.
train_path = config["data"]["clean"]["subset_dev1"]
dl = TabularDataloader(train_path=train_path)
dl.load_data(debug_size=10000, random_state=seed)

In [7]:
# Fetch the debug sample and the full training frame from the loader.
# get_data() returns three values; only the first one is used here.
debug_df, _, _ = dl.get_data(debug=True)
train_df, _, _ = dl.get_data(debug=False)
# Quick shape check: both frames are expected to have the same columns.
train_df.shape, debug_df.shape

((228188, 5131), (10000, 5131))

In [8]:
# Feature lists: bureau features come from the config, the categorical ones are
# hard-coded here instead of using config["data_columns"]["cat_cols"].
bureau_fts = config["data_columns"]["bureau_features_cols"]
cat_fts = ["t11_t3d_segid", "t11_t3d_segid_supp"]
prescreen_fts = [*bureau_fts, *cat_fts]

### Preprocessing
---
* [ ] encoding
* [ ] imputation (or decide to skip it)
* [ ] clipping
* [x] create sample weight

In [161]:
# Work on an independent copy so debug_df itself stays untouched.
df = debug_df.copy(deep=True)

In [182]:
from src.preprocess import Preprocess

# Sample weight per `ri_source` category, passed to the preprocessor.
weights = {"booked": 1,
           "proxy": 1,
           "others": 0.25}
preprocessor = Preprocess(exp_dict)

# Transform a fresh copy of debug_df instead of `df` itself. The previous
# `df = preprocessor.transform(df, ...)` fed its own output back in, so every
# re-run of this cell re-processed already-processed data. Starting from
# debug_df makes the cell idempotent and gives the same result on a clean
# top-to-bottom run.
df = preprocessor.transform(debug_df.copy(), bureau_fts, weights)

100%|██████████| 4203/4203 [00:04<00:00, 1031.98it/s]


In [180]:
# Peek at the first rows of the preprocessed frame.
df.head(n=5)

Unnamed: 0,experian_consumer_key,p13_alj0300,p13_alj0313,p13_alj0316,p13_alj0416,p13_alj5030,p13_alj5320,p13_alj5530,p13_alj5730,p13_alj5820,...,ri_target_v2,weight_cob,ri_source,weight_ri_v1,target_v1,weight_ri_v2,target_v2,indeterminate_v1,indeterminate_v2,sample_weight
147704,48884418887,0.0,,,,,,,,,...,0.347028,1.0,others,0.652972,0.0,0.652972,0.0,False,False,0.25
943427,24158613955,2.0,1.0,1.0,1.0,22847.0,49151.0,22847.0,0.0,658.0,...,0.0,1.0,booked,1.0,0.0,1.0,0.0,False,False,1.0
938963,40825054934,0.0,,,,,,,,,...,0.683209,1.0,others,0.316791,0.0,0.316791,0.0,False,False,0.25
1563096,37374844660,13.0,8.0,8.0,4.0,29698.0,34600.0,7425.0,0.0,306.0,...,0.0,1.0,booked,1.0,0.0,1.0,0.0,False,False,1.0
1667750,27555988458,0.0,,,,,,,,,...,0.023215,1.0,others,0.023215,1.0,0.023215,1.0,False,False,0.25


#### sample weight

In [37]:
# Use the full training frame for the sample-weight experiments
# (swap in debug_df.copy() for a quick run on the debug sample).
df = train_df.copy(deep=True)

In [38]:
# Column that identifies the source of each record; every distinct value in
# it needs an entry in both mappings below.
col = "ri_source"

# Explicit per-source sample weights.
weights = {"booked": 1,
           "proxy": 1,
           "others": 0.25}
# Per-source ratios used by the ratio-based weighting.
ratios = {"booked": 1,
          "proxy": 1,
          "others": 1}

# Check both mappings against the categories actually present in the data.
# Sets are compared instead of sorted lists so that a missing value in the
# column produces a readable AssertionError rather than a TypeError from
# sorting mixed types, and the message shows exactly what differs.
observed = set(df[col].unique())
assert observed == set(weights), (
    f"{col!r} categories {observed} do not match weights keys {set(weights)}"
)
assert observed == set(ratios), (
    f"{col!r} categories {observed} do not match ratios keys {set(ratios)}"
)

In [150]:
# Scratch check: a Series of one hundred ones.
pd.Series([1] * 100)

0     1
1     1
2     1
3     1
4     1
     ..
95    1
96    1
97    1
98    1
99    1
Length: 100, dtype: int64

In [39]:
# Base row weight: product of the two existing weight columns. It has to be
# set before the last call below, which reads it through weight_col="weight".
df["weight"] = df["weight_cob"] * df["weight_ri_v1"]

# Variant 1: explicit per-source weights.
df["weight_sample"], weights = get_sample_weight(
    df, "ri_source", weights=weights, return_weights=True
)

# Variants 2 and 3: ratio-based weights normalised by the "booked" group,
# without and with the base weight column taken into account.
ratio_kwargs = dict(ratio=ratios, normalize_by="booked", return_weights=True)
df["weight_ratio_v1"], weights_v1 = get_sample_weight(
    df, "ri_source", weight_col=None, **ratio_kwargs
)
df["weight_ratio_v2"], weights_v2 = get_sample_weight(
    df, "ri_source", weight_col="weight", **ratio_kwargs
)

In [40]:
# Row counts per (weight_sample, ri_source) combination.
df.value_counts(subset=["weight_sample", "ri_source"])

weight_sample  ri_source
0.25           others       193744
1.00           proxy         18692
               booked        15752
dtype: int64

In [41]:
# Total weight_sample per source group.
df.groupby("ri_source")["weight_sample"].sum()

ri_source
booked    15752.0
others    48436.0
proxy     18692.0
Name: weight_sample, dtype: float64

In [42]:
# Row counts per (weight_ratio_v1, ri_source) combination.
df.value_counts(subset=["weight_ratio_v1", "ri_source"])

weight_ratio_v1  ri_source
0.081303         others       193744
0.842713         proxy         18692
1.000000         booked        15752
dtype: int64

In [43]:
# Total weight_ratio_v1 per source group.
df.groupby("ri_source")["weight_ratio_v1"].sum()

ri_source
booked    15752.0
others    15752.0
proxy     15752.0
Name: weight_ratio_v1, dtype: float64

In [44]:
# Final row weight: ratio-based weight times the two base weight columns
# (multiplied in the same left-to-right order as before).
ratio_weight = df["weight_ratio_v2"]
df["weight"] = ratio_weight.mul(df["weight_cob"]).mul(df["weight_ri_v1"])

# Total final weight per source group.
df.groupby("ri_source")["weight"].sum()

ri_source
booked    15752.0
others    15752.0
proxy     15752.0
Name: weight, dtype: float64

In [45]:
# Per-source weights returned by the three get_sample_weight calls.
(weights, weights_v1, weights_v2)

({'booked': 1, 'proxy': 1, 'others': 0.25},
 {'others': 0.0813031629366587, 'proxy': 0.8427134603038734, 'booked': 1.0},
 {'booked': 1.0, 'others': 0.1626063258733174, 'proxy': 0.8427134603038734})

#### imputation

In [50]:
# Scratch check, left commented out (the value shown below is from an earlier run):
# 1/df["target_v1"].value_counts(normalize=True).loc[1.0]

2.2745105857023247

In [56]:
# Last five rows of the target and the final weight.
df.loc[:, ["target_v1", "weight"]].tail(n=5)

Unnamed: 0,target_v1,weight
2325159,1.0,0.000595
2325167,0.0,0.162437
2325168,1.0,0.000169
2325174,0.0,0.162364
2325175,1.0,0.000242


In [63]:
# Keep the last five rows of weight and target in a small frame for inspection.
df_ = df.tail(5)[["weight", "target_v1"]]
df_

Unnamed: 0,weight,target_v1
2325159,0.000595,1.0
2325167,0.162437,0.0
2325168,0.000169,1.0
2325174,0.162364,0.0
2325175,0.000242,1.0


In [70]:
# Inverse of the share of rows whose target_v1 equals 1.0.
positive_share = df["target_v1"].value_counts(normalize=True).loc[1.0]
scale_pos_weight = 1 / positive_share

# Scale up the weight of positive rows only; all other rows keep their weight.
is_positive = df["target_v1"] == 1.0
df["weight_w_target"] = np.where(
    is_positive,
    df["weight"] * scale_pos_weight,
    df["weight"],
)
df[["target_v1", "weight", "weight_w_target"]].tail()

Unnamed: 0,target_v1,weight,weight_w_target
2325159,1.0,0.000595,0.001353
2325167,0.0,0.162437,0.162437
2325168,1.0,0.000169,0.000385
2325174,0.0,0.162364,0.162364
2325175,1.0,0.000242,0.000551


#### clipping

In [135]:
# Peek at the first rows before working on clipping.
df.head(n=5)

Unnamed: 0,experian_consumer_key,p13_alj0300,p13_alj0313,p13_alj0316,p13_alj0416,p13_alj5030,p13_alj5320,p13_alj5530,p13_alj5730,p13_alj5820,...,weight_cob,ri_source,weight_ri_v1,target_v1,weight_ri_v2,target_v2,indeterminate_v1,indeterminate_v2,weight,weight_ratio
147704,48884418887,0.0,,,,,,,,,...,1.0,others,0.652972,0.0,0.652972,0.0,False,False,0.25,0.088357
943427,24158613955,2.0,1.0,1.0,1.0,22847.0,49151.0,22847.0,0.0,658.0,...,1.0,booked,1.0,0.0,1.0,0.0,False,False,1.0,1.0
938963,40825054934,0.0,,,,,,,,,...,1.0,others,0.316791,0.0,0.316791,0.0,False,False,0.25,0.088357
1563096,37374844660,13.0,8.0,8.0,4.0,29698.0,34600.0,7425.0,0.0,306.0,...,1.0,booked,1.0,0.0,1.0,0.0,False,False,1.0,1.0
1667750,27555988458,0.0,,,,,,,,,...,1.0,others,0.023215,1.0,0.023215,1.0,False,False,0.25,0.088357


In [140]:
# Fraction of missing values per column of train_df, smallest first.
missing_pct = train_df.isnull().mean().sort_values(ascending=True)