## Clean and convert customer indeterminate definition
---

In [1]:
import sys, os, json
sys.path.insert(1, "../../")
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle as pkl
import seaborn as sns
import lightgbm as lgb
import src.monitoring.utils as mu
import src.monitoring.monitoring as mntr
import rdsutils.score_alignment as sa
import src.monitoring.refit as refit

%matplotlib inline
plt.style.use('seaborn')

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

### Goal

```python
combined_df = combine_data(d_df, s_df, 
                           config["date_sample_start"],
                           config["date_sample_end"],
                           all_features=False, filtering=False)
filtered_df = combine_data(d_df, s_df, 
                           config["date_sample_start"],
                           config["date_sample_end"],
                           all_features=False, filtering=True) 
combined_df[~combined_df.indeterminate] == filtered_df
```

In [18]:
with open("../../config.json", "r") as f:
    config = json.load(f)
    
static_path = config["data"]["labeled"]["labeled"]
static_path

'labeled/labeled_1620447810.feather'

In [20]:
with open("../../config-transactional.json", "r") as f:
    config_t = json.load(f)

dynamic_path = config_t["data"]["labeled"]["labeled"]
dynamic_path

'labeled/labeled_1620447284.feather'

In [50]:
static_df = pd.read_feather(os.path.join("../../data", static_path))
dynamic_df = pd.read_feather(os.path.join("../../data-transactional", dynamic_path))

In [51]:
debug = True
if debug:
    s_df = static_df.sample(n=100000, random_state=42)
    d_df = dynamic_df.sample(n=100000, random_state=42)

In [52]:
filtered_df.shape

(3111844, 44)

In [53]:
combined_df = pd.concat([s_df,d_df])
combined_df.shape

(200000, 149)

In [154]:
import pandas as pd
import numpy as np
import datetime
import gc


def combine_data(dynamic_full, static_full, 
                 date_sample_start, date_sample_end,
                 all_features=False, filtering=True):
    # need to calculate stats for all of the features
    # dynamic!!!
    cols_raw = [
        "first_deposit_amount",
        "vantage_score",
        "bcc7120",
        "email_risk_score",
        "fraud_score_2",
        "name_email_correlation",
        "transaction_as_pct_of_balance",
        "mean_account_balance_30d",
        "giact_time_since_last_link",
        "phone_risk_score",
        "name_address_correlation",
        "all8220",
        "lag_acc_open_first_transaction",
        "dollar_val_dd",
        "all7120",
        "sum_deposits_10d",
        "nr_past_transactions",
        "total_tradelines_open",
        "education_loan_amount",
        "address_risk_score",
        "iqt9415",
        "max_withdrawals_30d",
        "iln5520",
        "max_deposits_30d",
        "pct_returned_deposits",
        "giact_nr_decline",
        "nr_direct_deposits",
        "time_since_last_transaction",
        "bal_ratio",
        "name_phone_correlation",
        "giact_nr_other",
        "dollar_val_returns",
        "nr_trans_ratio",
        "iqt9413",
        "dollar_val_returns_3d",
        "nr_returns_30d",
        "credit_card_loan_amount",
        "fraud_score_1",
        "age_money_account",
    ]

    # comment out later
    print(f"dynamic sampling on dates from {date_sample_start} to {date_sample_end}")
    dynamic_full = dynamic_full[
        dynamic_full["transaction_datetime"].between(
            pd.to_datetime(date_sample_start), pd.to_datetime(date_sample_end)
        )
    ] # keep transactions between sample date range

    dynamic_full = dynamic_full[
        ~(pd.to_datetime(dynamic_full["dtc"]) < dynamic_full["transaction_datetime"])
    ] # remove transactions before account closed
    dynamic_full = dynamic_full[
        ~(
            pd.to_datetime(dynamic_full["chg_wrt_off_date"])
            < dynamic_full["transaction_datetime"]
        )
    ] # remove transactions after the account is charged off

    transaction_codes_to_sample_on = [
        "POSDW",
        "ACHDWIN",
        "ACHDDIN",
        "ACHDD",
        "ACHINDD",
        "DDATMREFUND",
        "DWATM",
        "ACHDW",
        "DWCRDBILLPAY",
        "DDCK",
        "DDCRDBILLREF",
        "DWTRF",
        "DWCK",
        "DWBILLPAY",
        "DDA2ATXFR",
        "DDRAFNEW",
        "DDRAFCUS",
        "DDTRF",
        "POSDD",
        "DDMBR",
        "DWMBR",
        "DWATMI",
        "DDLYFTBONUS",
        "DDMKT",
        "DPNC",
        "DPND",
        "OTID",
        "ACHDWP2P",
        "DW",
        "DDINT",
        "DD",
        "DWACHRET",
        "DDPC",
        "DWSLROTP",
        "DWCKCB",
        "DDBILLPAY",
        "ACHRD",
        "DDACHRET",
    ]
    dynamic_full["sample_date"] = dynamic_full["transaction_datetime"]

    if filtering:
        dynamic_full = dynamic_full[
            ~(dynamic_full["target"] & (dynamic_full["latest_acc_bal"] > 0))
        ]
        dynamic_full = dynamic_full[
            ~(~dynamic_full["target"] & (~dynamic_full["chg_wrt_off_date"].isna()))
        ]
        dynamic_full = dynamic_full[
            ~(~dynamic_full["target"] & (dynamic_full["latest_acc_bal"] < 0))
        ]
        dynamic_full = dynamic_full[
            ~(
                ~dynamic_full["target"]
                & (
                    dynamic_full["closed_reason"].isin(
                        [
                            "Closed by SoFi - Risk Request",
                            "Closed by SoFi - Charge-Off / Write-Off",
                        ]
                    )
                )
            )
        ]
        dynamic_full = dynamic_full[
            ~(
                ~dynamic_full["target"]
                & (dynamic_full["restricted_reason"].str.startswith("No"))
            )
        ]
        dynamic_full = dynamic_full[
            ~((dynamic_full["bal_after_90d"] == 0) & ~dynamic_full["target"])
        ]
        dynamic_full = dynamic_full[
            dynamic_full["transaction_code"].isin(transaction_codes_to_sample_on)
        ]

    # shuffle the dataframe
    dynamic_full = dynamic_full.sample(frac=1).reset_index(drop=True)
    gc.collect()
    # select first 15 transactions for each customer (transactions in random order)
    dynamic_full = dynamic_full.groupby("borrower_id").head(15)
    gc.collect()

    # static!!!
    static_full["age_money_account"] = (
        static_full["sample_date"] - static_full["date_account_opened"]
    ).dt.days
    # shouldn't we change to 90d? Nope...it was set to 60 during definition

    if filtering:
        static_full = static_full[~(static_full["nr_transactions_next_60d"] == 0)]
        static_full = static_full[
            ~(pd.to_datetime(static_full["dtc"]) <= static_full["sample_date"])
        ]
        static_full = static_full[
            ~(static_full["chg_wrt_off_date"] <= static_full["sample_date"])
        ]
        static_full = static_full[
            ~(static_full["target"] & (static_full["latest_acc_bal"] > 0))
        ]
        static_full = static_full[
            ~(~static_full["target"] & (~static_full["chg_wrt_off_date"].isna()))
        ]
        static_full = static_full[
            ~(~static_full["target"] & (static_full["latest_acc_bal"] < 0))
        ]
        static_full = static_full[
            ~(
                ~static_full["target"]
                & (
                    static_full["closed_reason"].isin(
                        [
                            "Closed by SoFi - Risk Request",
                            "Closed by SoFi - Charge-Off / Write-Off",
                        ]
                    )
                )
            )
        ]
        static_full = static_full[
            ~(
                ~static_full["target"]
                & (static_full["restricted_reason"].str.startswith("No"))
            )
        ]

    static_full["indeterminate"] = (
        (static_full["nr_transactions_next_60d"] == 0)
        | (pd.to_datetime(static_full["dtc"]) <= static_full["sample_date"])
        | (static_full["last_unrestricted_date"] <= static_full["sample_date"])
        | (static_full["chg_wrt_off_date"] <= static_full["sample_date"])
        | (static_full["target"] & (static_full["latest_acc_bal"] > 0))
        | (
            ~static_full["target"] & (~static_full["chg_wrt_off_date"].isna())
            | (static_full["latest_acc_bal"] < 0)
            | (
                static_full["closed_reason"].isin(
                    [
                        "Closed by SoFi - Risk Request",
                        "Closed by SoFi - Charge-Off / Write-Off",
                    ]
                )
            )
            | (static_full["restricted_reason"].str.startswith("No"))
        )
    )

    #     indeterminates
    #     static_full['indeterminate'] = (static_full['nr_transactions_next_60d'] == 0) | \
    #                                    (pd.to_datetime(static_full['dtc']) <= static_full['sample_date']) | \
    #                                    (static_full['last_unrestricted_date'] <= static_full['sample_date']) | \
    #                                    (static_full['chg_wrt_off_date'] <= static_full['sample_date'])

    #     static_full = static_full[~static_full['indeterminate']]
    #     static_full = static_full[~(static_full['target'] & (static_full['latest_acc_bal'] > 0))]
    #     static_full = static_full[~(~static_full['target'] & (~static_full['chg_wrt_off_date'].isna()))]
    #     static_full = static_full[~(~static_full['target'] & (static_full['latest_acc_bal'] < 0))]
    #     static_full = static_full[~(~static_full['target'] & (static_full['closed_reason'].isin(['Closed by SoFi - Risk Request',
    #                                                                                              'Closed by SoFi - Charge-Off / Write-Off'])))]
    #     static_full = static_full[~(~static_full['target'] & (static_full['restricted_reason'].str.startswith('No')))]

    # combine and create modeling dataset
    dynamic_full["sample_date"] = dynamic_full["transaction_datetime"]
    cols_metadata = [
        "business_account_number",
        "borrower_id",
        "sample_date",
        "target",
        "transaction_code",
    ]

    static_full["is_static"] = True
    dynamic_full["is_static"] = False
    
    if "nr_transactions_next_60d" not in dynamic_full.columns:
        # to keep static sample's col
        dynamic_full["nr_transactions_next_60d"] = np.nan
        

    if all_features:
        # we take all features that is available to us!
        cols = dynamic_full.columns.intersection(static_full.columns)
        modeling_df = pd.concat(
            [dynamic_full[cols], static_full[cols]], axis=0, ignore_index=True
        )
    else:
        modeling_df = pd.concat(
            [
                dynamic_full[cols_metadata + cols_raw],
                static_full[cols_metadata + cols_raw],
            ],
            axis=0,
            ignore_index=True,
        )
    return modeling_df


### Clean up combine
--- 
move filtering logic to indeterminate definition

In [155]:
cols_raw = [
    "first_deposit_amount",
    "vantage_score",
    "bcc7120",
    "email_risk_score",
    "fraud_score_2",
    "name_email_correlation",
    "transaction_as_pct_of_balance",
    "mean_account_balance_30d",
    "giact_time_since_last_link",
    "phone_risk_score",
    "name_address_correlation",
    "all8220",
    "lag_acc_open_first_transaction",
    "dollar_val_dd",
    "all7120",
    "sum_deposits_10d",
    "nr_past_transactions",
    "total_tradelines_open",
    "education_loan_amount",
    "address_risk_score",
    "iqt9415",
    "max_withdrawals_30d",
    "iln5520",
    "max_deposits_30d",
    "pct_returned_deposits",
    "giact_nr_decline",
    "nr_direct_deposits",
    "time_since_last_transaction",
    "bal_ratio",
    "name_phone_correlation",
    "giact_nr_other",
    "dollar_val_returns",
    "nr_trans_ratio",
    "iqt9413",
    "dollar_val_returns_3d",
    "nr_returns_30d",
    "credit_card_loan_amount",
    "fraud_score_1",
    "age_money_account",
]

cols_metadata = [
    "business_account_number",
    "borrower_id",
    "sample_date",
    "target",
    "transaction_code",
    "indeterminate"
]
# columns required


In [185]:
def combine_data_(
    dynamic_full,
    static_full,
    date_sample_start,
    date_sample_end,
    all_features=False,
    filtering=True,
):

    ####################################
    #             dynamic
    ####################################

    # get dynamic sample range
    print(f"dynamic sampling on dates from {date_sample_start} to {date_sample_end}")
    dynamic_full = dynamic_full[
        dynamic_full["transaction_datetime"].between(
            pd.to_datetime(date_sample_start), pd.to_datetime(date_sample_end)
        )
    ]  # keep transactions between sample date range

    dynamic_full = dynamic_full[
        ~(pd.to_datetime(dynamic_full["dtc"]) < dynamic_full["transaction_datetime"])
    ]  # remove transactions before account closed
    dynamic_full = dynamic_full[
        ~(
            pd.to_datetime(dynamic_full["chg_wrt_off_date"])
            < dynamic_full["transaction_datetime"]
        )
    ]  # remove transactions after the account is charged off

    dynamic_full["sample_date"] = dynamic_full["transaction_datetime"]

    # build indeterminate here? - move this to label next
    dynamic_full["indeterminate"] = get_indeterminate_dynamic(dynamic_full)

    # shuffle the dataframe
    dynamic_full = dynamic_full.sample(frac=1, random_state=42).reset_index(drop=True)
    gc.collect()

    # select first 15 transactions for each customer (transactions in random order)
    dynamic_full = dynamic_full.groupby("borrower_id").head(15)
    gc.collect()

    ####################################
    #             Static
    ####################################

    static_full = static_full[
        ~(pd.to_datetime(static_full["dtc"]) <= static_full["sample_date"])
    ]
    static_full = static_full[
        ~(static_full["chg_wrt_off_date"] <= static_full["sample_date"])
    ]
    static_full["age_money_account"] = (
        static_full["sample_date"] - static_full["date_account_opened"]
    ).dt.days
    
    # build indeterminate here? - move this to label next
    static_full["indeterminate"] = get_indeterminate_static(static_full)

    
    ####################################
    #          Put together
    ####################################

    static_full["is_static"] = True
    dynamic_full["is_static"] = False

    if ("nr_transactions_next_60d" in static_full.columns) and (
        "nr_transactions_next_60d" not in dynamic_full.columns
    ):
        # to keep static sample's col
        dynamic_full["nr_transactions_next_60d"] = np.nan

    if all_features:
        cols = dynamic_full.columns.intersection(static_full.columns)
    else:
        cols = cols_metadata + cols_raw

    modeling_df = pd.concat(
        [dynamic_full[cols], static_full[cols]], axis=0, ignore_index=True
    )

    if filtering:
        return modeling_df[~modeling_df.indeterminate]
    return modeling_df


def get_indeterminate_dynamic(df):
    """
    indeterminate definitions

    1. bad but recovered account balance
    2. good but charged off
    3. good but recently down
    4. good but closed by risk
    5. good but restricted
    6. in-active
    """

    ind = ((df["target"] & (df["latest_acc_bal"] > 0)) 
           | (~df["target"] & (  # 1
        (~df["chg_wrt_off_date"].isna())  # 2
        | (df["latest_acc_bal"] < 0)  # 3
        | (
            df["closed_reason"].isin(  # 4
                [
                    "Closed by SoFi - Risk Request",
                    "Closed by SoFi - Charge-Off / Write-Off",
                ]
            )
        )
        | (df["restricted_reason"].str.startswith("No")))  # 5
    ))
    return ind


def get_indeterminate_static(df):
    """
    indeterminate definitions

    1. bad but recovered account balance
    2. good but charged off
    3. good but recently down
    4. good but closed by risk
    5. good but restricted
    6. in-active
    """

    ind = (
        (df["nr_transactions_next_60d"] == 0)  # 6
        | (df["target"] & (df["latest_acc_bal"] > 0))  # 1
        | (
            ~df["target"]
            & (
                ~df["chg_wrt_off_date"].isna()  # 2
                | (df["latest_acc_bal"] < 0)  # 3
                | df["restricted_reason"].str.startswith("No")  # 5
                | df["closed_reason"].isin(  # 4
                    [
                        "Closed by SoFi - Risk Request",
                        "Closed by SoFi - Charge-Off / Write-Off",
                    ]
                )
            )
        )
    )

    return ind

In [193]:
if debug:
    s_df = static_df.sample(n=100000, random_state=42)
    d_df = dynamic_df.sample(n=100000, random_state=42)
filtered_df = combine_data(d_df, s_df, 
                           config["date_sample_start"],
                           config["date_sample_end"],
                           all_features=True, filtering=True)

filtered_df[["is_static", "indeterminate"]].value_counts().sort_index()

dynamic sampling on dates from 2019-02-01 to 2020-12-31


is_static  indeterminate
False      False            60388
           True               977
True       False            20474
           True              3173
dtype: int64

In [194]:
def cond6(df):
    return df["nr_transactions_next_60d"] == 0
def cond1(df):
    return df["target"] & (df["latest_acc_bal"] > 0)
def cond2(df):
    return ~df["target"] & ~df["chg_wrt_off_date"].isna()
def cond3(df):
    return  ~df["target"] & (df["latest_acc_bal"] < 0) 
def cond4(df):
    return  ~df["target"] & df["closed_reason"].isin(  # 4
                    [
                        "Closed by SoFi - Risk Request",
                        "Closed by SoFi - Charge-Off / Write-Off",
                    ]
                )
def cond5(df):
    return  ~df["target"] & df["restricted_reason"].str.startswith("No")


def cond7(df):
    return df["chg_wrt_off_date"] <= df["sample_date"]
def cond8(df):
    return pd.to_datetime(df["dtc"]) <= df["sample_date"]

In [196]:
combined_df = combine_data_(d_df, s_df, 
                           config["date_sample_start"],
                           config["date_sample_end"],
                           all_features=True, filtering=False)
combined_df_ = combined_df[~combined_df.indeterminate]

dynamic sampling on dates from 2019-02-01 to 2020-12-31


In [198]:
combined_df_.shape

(87372, 92)

In [175]:
filtered_df.shape, combined_df.shape, combined_df_.shape

((85012, 92), (165910, 92), (87372, 92))

In [131]:
filtered_only = set(filtered_df.index) - (set(combined_df_.index))
combined_only = set(combined_df_.index) - (set(filtered_df.index))
both = set(filtered_df.index).intersection(set(combined_df_.index))

In [132]:
len(filtered_only), len(combined_only), len(both)

(16708, 19068, 68304)

In [2]:
with open("artifacts/customer_risk_refit_20201231_wo_ind.pkl", "rb") as f:
    model = pkl.load(f)

In [5]:
list(zip(model.feature_importances_, model.feature_name_))

[(1087, 'first_deposit_amount'),
 (1020, 'vantage_score'),
 (632, 'bcc7120'),
 (533, 'email_risk_score'),
 (680, 'fraud_score_2'),
 (421, 'name_email_correlation'),
 (708, 'transaction_as_pct_of_balance'),
 (844, 'mean_account_balance_30d'),
 (646, 'phone_risk_score'),
 (446, 'name_address_correlation'),
 (694, 'all8220'),
 (575, 'lag_acc_open_first_transaction'),
 (392, 'dollar_val_dd'),
 (563, 'all7120'),
 (599, 'sum_deposits_10d'),
 (533, 'nr_past_transactions'),
 (428, 'total_tradelines_open'),
 (380, 'education_loan_amount'),
 (510, 'address_risk_score'),
 (254, 'iqt9415'),
 (588, 'max_withdrawals_30d'),
 (507, 'iln5520'),
 (492, 'max_deposits_30d'),
 (154, 'pct_returned_deposits'),
 (282, 'giact_nr_decline'),
 (179, 'nr_direct_deposits'),
 (186, 'time_since_last_transaction'),
 (462, 'bal_ratio'),
 (325, 'name_phone_correlation'),
 (193, 'giact_nr_other'),
 (235, 'dollar_val_returns'),
 (351, 'nr_trans_ratio'),
 (80, 'iqt9413'),
 (101, 'dollar_val_returns_3d'),
 (152, 'nr_returns

In [7]:
for f in model.feature_name_:
    print(f)

first_deposit_amount
vantage_score
bcc7120
email_risk_score
fraud_score_2
name_email_correlation
transaction_as_pct_of_balance
mean_account_balance_30d
phone_risk_score
name_address_correlation
all8220
lag_acc_open_first_transaction
dollar_val_dd
all7120
sum_deposits_10d
nr_past_transactions
total_tradelines_open
education_loan_amount
address_risk_score
iqt9415
max_withdrawals_30d
iln5520
max_deposits_30d
pct_returned_deposits
giact_nr_decline
nr_direct_deposits
time_since_last_transaction
bal_ratio
name_phone_correlation
giact_nr_other
dollar_val_returns
nr_trans_ratio
iqt9413
dollar_val_returns_3d
nr_returns_30d
credit_card_loan_amount
fraud_score_1
age_money_account
transaction_code_encoded
all7120_default_encoded
bcc7120_default_encoded
