In [2]:
import sys, os, json
sys.path.insert(1, "../../")
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle as pkl
import seaborn as sns
import lightgbm as lgb
import src.monitoring.utils as mu
import src.monitoring.monitoring as mntr
import rdsutils.score_alignment as sa
import src.monitoring.refit as refit

%matplotlib inline
plt.style.use('seaborn')

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

In [3]:
# %%time
# df = pd.read_feather("../../data/combined_all_features/combined_1620872375.feather")

In [4]:
# with open("../../config.json", "r") as f:
#     config = json.load(f)

# sample_start = config["date_sample_start"]
# sample_end = config["date_sample_end"]
# static_sample_dates = config["static_sample_dates"]

In [5]:
# df[["target", "indeterminate"]].value_counts()

In [6]:
# df[df.sample_date.between(pd.to_datetime(sample_start),
#                           pd.to_datetime(sample_end))].sample_date.hist(bins=100, alpha=0.4)

# df[df.sample_date > pd.to_datetime(sample_end)].sample_date.hist(bins=10, alpha=0.4)

In [7]:
# train_start = pd.to_datetime("2019-02-01")
# train_end = pd.to_datetime("2020-12-31")
# valid_dates = [pd.to_datetime(d) for d in ["2021-01-01", "2021-02-01"]]

# train_df = df[df.sample_date.between(train_start, train_end)]
# test_df = df[df.sample_date.isin(valid_dates)]
# train_df.shape, test_df.shape

In [8]:
# Monitoring snapshot identifier used in the S3 prefix (presumably YYYYMM).
mmonth = "202104"
s3_base_path = f"s3://sofi-data-science/jxu/money-risk-models/customer-risk-model/monitor/{mmonth}"

# S3 URIs always use "/" as the separator. os.path.join would insert the
# platform separator (a backslash on Windows) and yield an invalid object key,
# so the paths are built with explicit "/" joins instead.
train_df = pd.read_parquet(f"{s3_base_path}/dev_train_scored.parquet")
test_df = pd.read_parquet(f"{s3_base_path}/dev_test_scored.parquet")

In [9]:
# Run both splits through the shared preprocessing helper from
# src.monitoring.utils (its transformations are not visible in this notebook).
# NOTE(review): the names are rebound in place, so re-running this cell feeds
# already-preprocessed frames back in -- confirm mu.preprocess is idempotent.
train_df = mu.preprocess(train_df)
test_df = mu.preprocess(test_df)

In [None]:
# Incumbent model: loaded from the "sofi-data-science" bucket.
# NOTE(review): the helper name suggests this unpickles the object; unpickling
# can execute arbitrary code, so only point this at a trusted key.
prev_model_s3_key = "jxu/money-risk-models/customer-risk-model/models/customer_risk_target_no_giact_time_since_last_link.pkl"
prev_model = mu.read_pickle_from_s3("sofi-data-science", prev_model_s3_key)

# Refit WITH indeterminates: no indeterminate column is handed to refit.train.
clf_w_ind = refit.train(train_df,
                  date_col="sample_date",
                  indeterminate_col=None)

# Refit WITHOUT indeterminates: the "indeterminate" flag column is handed to
# refit.train (presumably flagged rows are dropped there -- confirm in refit).
clf_wo_ind = refit.train(train_df,
                  date_col="sample_date",
                  indeterminate_col="indeterminate")

modeling_df last date: 2020-12-31 00:00:00
15556
target counts
False    4483337
True      173421
Name: target, dtype: int64
[LightGBM] [Info] Number of positive: 173421, number of negative: 4483337
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7814
[LightGBM] [Info] Number of data points in the train set: 4656758, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037241 -> initscore=-3.252401
[LightGBM] [Info] Start training from score -3.252401


In [None]:
# let's scale by bads anyways
import rdsutils.score_alignment as sa

# ---------------------------------------------------------------------------
# Models whose raw probabilities are scaled directly:
#   * the incumbent model
#   * the refit trained without indeterminates
# Each entry is (probability column, score column, fitted classifier).
# ---------------------------------------------------------------------------
directly_scaled = (
    ("pred_incumbent", "score_incumbent", prev_model),
    ("pred_wo_ind", "score_wo_ind", clf_wo_ind),
)
for pred_name, score_name, fitted in directly_scaled:
    # Probability of the positive class, computed on the model's own features.
    test_df[pred_name] = fitted.predict_proba(test_df[fitted.feature_name_])[:, 1]
    test_df[score_name] = mu.scale_scores(test_df[pred_name])

# ---------------------------------------------------------------------------
# Refit trained with indeterminates: its probabilities are first aligned to
# the incumbent's via a score-alignment table, then scaled.
# ---------------------------------------------------------------------------
src_pred = "pred_w_ind"
tgt_pred = "pred_incumbent"
target_col = "target"

test_df[src_pred] = clf_w_ind.predict_proba(test_df[clf_w_ind.feature_name_])[:, 1]

br_tbl = sa.get_score_alignment_table(
    test_df, src_pred, tgt_pred, target_col, br_precision=3, pred_precision=3
)

rescaled_col = src_pred + "_rescaled"
test_df[rescaled_col] = sa.get_aligned_score(
    test_df, br_tbl, src_pred, tgt_pred, pred_precision=3
)

test_df["score_w_ind_rescaled"] = mu.scale_scores(test_df[rescaled_col])


In [None]:
def get_risk_groups(serie):
    """Bucket scores into the five risk groups RG1..RG5.

    Scores are rounded to the nearest integer and binned on right-closed
    edges: 300-474 -> RG1, 475-510 -> RG2, 511-560 -> RG3, 561-600 -> RG4,
    601-850 -> RG5.

    Parameters
    ----------
    serie : pandas.Series
        Numeric scores.

    Returns
    -------
    pandas.Series
        Categorical labels "RG1".."RG5". Values that round outside
        [300, 850], and missing values, come back as NaN.
    """
    edges = [300, 474, 510, 560, 600, 850]
    labels = [f'RG{i}' for i in range(1, 6)]
    # include_lowest=True closes the first bin on the left; without it a score
    # of exactly 300 falls outside (300, 474] and silently becomes NaN.
    return pd.cut(serie.round(), edges, right=True, labels=labels,
                  include_lowest=True)

# Derive a risk-group column from each score column, in this order:
# incumbent, refit without indeterminates, rescaled refit with indeterminates.
rg_from_score = (
    ('rg_incumbent', 'score_incumbent'),
    ('rg_wo_ind', 'score_wo_ind'),
    ('rg_w_ind_rescaled', 'score_w_ind_rescaled'),
)
for rg_name, score_name in rg_from_score:
    test_df[rg_name] = get_risk_groups(test_df[score_name])

In [None]:
# Distinct sample dates of the static rows, in ascending order, restricted to
# dates on or after 2020-01-01.
static_cutoff = pd.to_datetime("2020-01-01")
static_dates_all = sorted(train_df[train_df.is_static].sample_date.unique())
static_sample_dates = [d for d in static_dates_all if d >= static_cutoff]
static_sample_dates

In [None]:
%%time

# Fitted models keyed by "model_<cutoff date>_<indeterminate tag>".
clfs = {}

# Maps the `indeterminate_col` value passed to refit.train onto the tag used
# in model and column names.
ind_tags = {
    None: "w_ind",
    "indeterminate": "wo_ind",
    "indeterminate_prev": "wo_prev_ind",
}

for d in tqdm(static_sample_dates):
    # Go through pd.Timestamp so the tag is always YYYY-MM-DD, whether `d` is
    # a numpy datetime64 ("2020-12-31T00:00:00...") or a pandas Timestamp
    # ("2020-12-31 00:00:00"); splitting str(d) on "T" only handled the former.
    dt_str = pd.Timestamp(d).strftime("%Y-%m-%d")

    # Training window: every row sampled on or before the cutoff date.
    train_df_ = train_df[train_df.sample_date <= d]

    for ind in [None, "indeterminate"]:
        # Resolve the tag before fitting so an unsupported option fails fast
        # instead of after an expensive training run. NotImplementedError is
        # the exception class; `raise NotImplemented` raises a TypeError.
        if ind not in ind_tags:
            raise NotImplementedError(f"unsupported indeterminate_col: {ind!r}")
        ind_str = ind_tags[ind]

        clf = refit.train(train_df_,
                          date_col="sample_date",
                          indeterminate_col=ind)

        clf_name = f"model_{dt_str}_{ind_str}"
        pred_col = f"pred_{dt_str}_{ind_str}"
        # Name reserved for the scaled score; the column is not populated here.
        score_col = f"score_{dt_str}_{ind_str}"

        # Positive-class probability on the test set for this refit.
        test_df[pred_col] = clf.predict_proba(test_df[clf.feature_name_])[:, 1]

        # save model
        clfs[clf_name] = clf

        

In [None]:
# Keep prediction / score columns, skipping anything rescaled and anything
# belonging to a "with indeterminates" model.
excluded_tags = ("_rescaled", "_w_ind")

pred_cols = [
    c for c in test_df.columns
    if "pred_" in c and not any(tag in c for tag in excluded_tags)
]
score_cols = [
    c for c in test_df.columns
    if "score_" in c and not any(tag in c for tag in excluded_tags)
]

In [None]:
%%time 

# Performance report on the 2021-01-01 snapshot, best average precision first.
on_report_date = test_df.sample_date.eq(pd.to_datetime("2021-01-01"))
df_ = test_df.loc[on_report_date]
report = mntr.get_pred_reports(df_, "target", pred_cols, dropna=True)
display(report.sort_values("ap", ascending=False))

In [None]:
%%time 

# Performance report on the 2021-02-01 snapshot, best average precision first.
# `df_` keeps this subset after the cell finishes.
on_report_date = test_df.sample_date.eq(pd.to_datetime("2021-02-01"))
df_ = test_df.loc[on_report_date]
report = mntr.get_pred_reports(df_, "target", pred_cols, dropna=True)
display(report.sort_values("ap", ascending=False))

### Summary
---

In [None]:
# Prediction columns compared in the summary: the refit with the 2020-12-31
# cutoff trained without indeterminates, and the incumbent model.
candidate_cols = [
    "pred_2020-12-31_wo_ind",
    "pred_incumbent",
]

In [None]:
# Output location handed to the plotting helper.
mntr_path = "./artifacts"

# `df_` is whatever subset the most recently executed cell assigned to it.
plot_frames = {"reduced": df_}
mntr.save_valid_performance_plots(
    plot_frames, "target", candidate_cols, mntr_path, dropna=True
)

In [None]:
def plot_score_distr(df, score1, score2, ax, title):
    """Overlay the histograms of two score columns on a single axes.

    Parameters
    ----------
    df : DataFrame-like
        Object whose columns are looked up with ``df[name]``.
    score1, score2 : str
        Names of the two columns to plot; each is also used as legend label.
    ax : matplotlib axes
        Axes that receives both histograms, the title and the legend.
    title : str
        Title set on ``ax``.
    """
    # Same binning and transparency for both series so they stay comparable.
    for score_name in (score1, score2):
        df[score_name].hist(bins=50, alpha=0.4, label=score_name, ax=ax)

    ax.set_title(title)
    ax.legend()

In [None]:
test_df["score_2020-12-31_wo_ind"] = mu.scale_scores(test_df["pred_2020-12-31_wo_ind"])
# An account counts as active when it has past transactions and also
# transactions within the last 30 days.
test_df["is_active"] = (test_df.nr_past_transactions > 0) & (test_df.nr_transactions_30d > 0)

# 2x2 grid: rows = inactive / active, columns = good / bad
score1 = "score_incumbent"
score2 = "score_2020-12-31_wo_ind"

fig, axs = plt.subplots(2,2, figsize=(16, 16))

active_mask = test_df.is_active
bad_mask = test_df.target

# (axes, row filter, title) for each panel, in drawing order.
panels = [
    (axs[0, 0], ~active_mask & ~bad_mask, "score distr - reduced - in-active - good"),
    (axs[0, 1], ~active_mask & bad_mask, "score distr - reduced - in-active - bad"),
    (axs[1, 0], active_mask & ~bad_mask, "score distr - reduced - active - good"),
    (axs[1, 1], active_mask & bad_mask, "score distr - reduced - active - bad"),
]
for panel_ax, row_filter, panel_title in panels:
    df_ = test_df[row_filter]
    plot_score_distr(df_, score1, score2, panel_ax, panel_title)

plt.show()

In [None]:
def get_risk_groups(serie):
    """Bucket scores into the five risk groups RG1..RG5.

    Note: a helper of the same name is defined in an earlier cell of this
    notebook; this later definition is the one in effect for the cells below.

    Scores are rounded to the nearest integer and binned on right-closed
    edges: 300-474 -> RG1, 475-510 -> RG2, 511-560 -> RG3, 561-600 -> RG4,
    601-850 -> RG5.

    Parameters
    ----------
    serie : pandas.Series
        Numeric scores.

    Returns
    -------
    pandas.Series
        Categorical labels "RG1".."RG5". Values that round outside
        [300, 850], and missing values, come back as NaN.
    """
    edges = [300, 474, 510, 560, 600, 850]
    labels = [f'RG{i}' for i in range(1, 6)]
    # include_lowest=True closes the first bin on the left; without it a score
    # of exactly 300 falls outside (300, 474] and silently becomes NaN.
    return pd.cut(serie.round(), edges, right=True, labels=labels,
                  include_lowest=True)

# Risk groups for the two models compared in the summary:
# the incumbent and the 2020-12-31 refit without indeterminates.
summary_rg_from_score = (
    ("rg_incumbent", "score_incumbent"),
    ("rg_2020-12-31_wo_ind", "score_2020-12-31_wo_ind"),
)
for rg_name, score_name in summary_rg_from_score:
    test_df[rg_name] = get_risk_groups(test_df[score_name])

In [None]:
rg_incumbent = "rg_incumbent"
rg_refit = "rg_2020-12-31_wo_ind"

# Share of rows for each (incumbent group, refit group) pair, in long form.
rg_pair_share = test_df[[rg_incumbent, rg_refit]].value_counts(normalize=True)
table = rg_pair_share.sort_index().reset_index()
table.columns = [rg_incumbent, rg_refit, 'counts']

# Wide form: incumbent groups as rows, refit groups as columns.
table = pd.pivot_table(
    table,
    values='counts',
    index=rg_incumbent,
    columns=rg_refit,
    fill_value=0,
)

fig = plt.figure()
sns.heatmap(table, cmap='coolwarm', annot=True, fmt='.6g')
plt.title('Risk Group Shift')

# The diagonal holds the pairs where both models assign the same group.
print("ratio of users kept their RG: ", np.trace(table))

In [None]:
import rdsutils.swap_set_analysis as ssa

# Swap-set bad-rate plot: incumbent risk groups against the refit's risk
# groups, with margins enabled.
fig, ax = ssa.plot_swap_set_bad_rate(
    test_df,
    rg_incumbent,
    rg_refit,
    "target",
    margins=True,
)