In [1]:
import sys, os, json, copy, sklearn, shap
import pandas as pd
import numpy as np
import lightgbm as lgb
import pickle as pkl
from tqdm import tqdm
from smart_open import open
import matplotlib.pyplot as plt
from rdsutils.metrics import get_pred_reports
from rdsutils.woe import WOE_Transform

from rdsutils.feature_selection import FeatureSelector as general_purpose_fsel
from rdsutils.feature_selection import mrmr

%load_ext autoreload
%autoreload 2

In [2]:
# Make the local fair-lending checkout importable, then pull in the base class
# that LightGBMDIAA extends.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configuration cell so the notebook runs on other hosts.
fl_path = "/home/ec2-user/SageMaker/fair-lending-master/"
if fl_path not in sys.path:
    sys.path.insert(1, fl_path)
import fair_lending
from fair_lending import DIAABase

## read data

In [3]:
# Load the table of model scores from S3 (parquet).
df_score = pd.read_parquet('s3://sofi-data-science/hpeng/pl-gen4/data_dump/cleaned_tables/uw/df_fl_v3.parquet')

In [4]:
# Load the applicant PII table (names, age, zip, ...) from S3 (parquet).
df_pii = pd.read_parquet("s3://sofi-data-science/jxu/pl-gen4/data_dump/pl-gen4-final-data/pii.parquet")

In [5]:
# Compare the (rows, columns) of the two inputs before merging them.
df_pii.shape, df_score.shape

((3219387, 15), (2552835, 10))

In [6]:
# Preview the score table.
df_score.head()

Unnamed: 0,id,applicant_type,date_start,seg,ri_source,target_v2,gen4_prescreen_score,gen4_underwriting_score,gen4_underwriting_score_29,gen4_underwriting_score_30
0,4937215,PRIMARY,2017-10-20,member,others,0.0,678.0,679.0,679.0,679.0
2,4820420,PRIMARY,2017-10-08,member,proxy,0.0,683.0,720.0,720.0,720.0
3,4996279,PRIMARY,2017-10-27,member,proxy,0.0,683.0,721.0,721.0,721.0
4,4820856,PRIMARY,2017-10-08,member,proxy,0.0,683.0,720.0,720.0,720.0
5,4792136,PRIMARY,2017-10-05,member,booked,0.0,720.0,717.0,717.0,717.0


In [7]:
# Preview the PII table.
# NOTE(review): the rendered output shows applicant names and zip codes;
# clear or redact it before sharing or committing the notebook.
df_pii.head()

Unnamed: 0,applicant_fico_score,applicant_type,applicant_vantage_score,date_start,gen3_score,id,period,ri_source,first,last,age,applied_zip,applied_city,applied_cbsa_name,applied_cbsa_code
0,697.0,PRIMARY,675.0,2017-10-27,714.0,4997100,dev17,others,Joseph,Quintero,26.0,65757,STRAFFORD,"Springfield, MO",44180
1,697.0,PRIMARY,675.0,2017-10-24,714.0,4970887,dev17,others,Joseph,Quintero,26.0,65757,STRAFFORD,"Springfield, MO",44180
2,696.0,PRIMARY,660.0,2017-10-11,641.0,4848540,dev17,others,Dennis,Davidson,50.0,44188,Bellevue,Cleveland-Elyria OH Metro,17460
3,679.0,PRIMARY,633.0,2017-10-28,706.0,5003002,dev17,others,Amelia,Ramirez,46.0,77377,Tomball,"Houston-The Woodlands-Sugar Land, TX",26420
4,597.0,PRIMARY,518.0,2017-10-09,594.0,4830862,dev17,others,Marcus,Thomas,30.0,38115,Memphis,"Memphis, TN-MS-AR",32820


In [8]:
# Attach PII to every scored row; the left join keeps all rows of df_score.
df = pd.merge(
    df_score,
    df_pii,
    how="left",
    on=["id", "applicant_type", "date_start", "ri_source"],
)
# Guard against fan-out: duplicate join keys in df_pii would silently multiply
# scored rows and bias every statistic computed downstream.
assert len(df) == len(df_score), "merge changed the row count: df_pii has duplicate join keys"

### Preprocess

In [9]:
def preprocess(df, first_name_col, last_name_col, zip_col):
    """
    Add cleaned first/last name columns to ``df`` (in place) and return it.

    Columns written:
      * ``{first_name_col}_clean`` - first whitespace-separated token of the
        first name, capitalized.
      * ``{last_name_col}_1`` / ``{last_name_col}_2`` - first and second tokens
        of the last name after suffixes/particles (Jr, Sr, II, III, De, ...)
        and hyphens are replaced by a space and apostrophes are removed.
      * ``{last_name_col}_clean`` - the second token when there is one,
        otherwise the first token, capitalized.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the name columns; it is modified in place.
    first_name_col, last_name_col : str
        Names of the raw first / last name columns.
    zip_col : str
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns above added.
    """
    first_clean = f"{first_name_col}_clean"
    last_clean = f"{last_name_col}_clean"
    last_1 = f"{last_name_col}_1"
    last_2 = f"{last_name_col}_2"

    df[first_clean] = df[first_name_col].str.split(expand=True)[0].str.capitalize()

    # Order matters inside the alternation: ' III' must precede ' II'.
    pattern = '|'.join(['-', ' Jr', ' III',
                        ' II',' JR',' Sr',' De ',
                        ' de ',' ll',' iii',
                        ' ii'," Iii", " B "])
    # regex=True must be explicit: pandas >= 2.0 defaults to a literal match,
    # which would leave every suffix and hyphen in place.
    stripped = df[last_name_col].str.replace(pattern, " ", regex=True)
    stripped = stripped.str.replace("'", "", regex=False)

    # Split once; reindex guarantees columns 0 and 1 exist even when no last
    # name has a second token (the bare [1] lookup would raise KeyError).
    parts = stripped.str.split(expand=True).reindex(columns=[0, 1])
    df[last_1] = parts[0]
    df[last_2] = parts[1]

    # Prefer the second token when present, otherwise fall back to the first.
    df[last_clean] = parts[0].where(parts[1].isna(), parts[1]).str.capitalize()

    print("columns appened: ", [first_clean, last_clean, last_1, last_2])
    return df

In [10]:
# key_cols is defined here but not referenced in the cells shown below.
key_cols = ["id", "ri_source", "applicant_type"]
# PII columns fed into name cleaning and the dedup key.
pii_cols = ["first", "last", "applied_zip", "age"]
# Fraction of missing values per PII column.
display(df[pii_cols].isna().mean())

first          0.000000e+00
last           0.000000e+00
applied_zip    3.917214e-07
age            3.917214e-07
dtype: float64

In [11]:
# Check the dtypes of the PII columns before string processing.
df[pii_cols].dtypes

first           object
last            object
applied_zip     object
age            float64
dtype: object

In [12]:
# Add the cleaned name columns (first_clean, last_clean, last_1, last_2) to df.
df = preprocess(df, "first", "last", "applied_zip")
# Row count should be unchanged; only the column count grows.
display(df.shape)

columns appened:  ['first_clean', 'last_clean', 'last_1', 'last_2']


(2552835, 25)

##### Dedup Logic

* Drop duplicates by the following logic:
    * based on `pii_cols` plus the model score columns
        * `['first', 'last', 'applied_zip', 'age']` + `['gen4_prescreen_score', 'gen4_underwriting_score_29']`
    * sort by `date_start` and keep the most recent

* Properties
    * A user may appear multiple times if their attributes (PII or scores) have changed.
    * Otherwise, each user should appear only once.

In [14]:
# Dedup key: the two score columns plus the PII columns.
dup_cols = list(set(['gen4_prescreen_score','gen4_underwriting_score_29'] + pii_cols))
display(len(dup_cols))

# Order chronologically so that keep="last" retains the most recent row per key.
df = (
    df
    .sort_values("date_start", ascending=True)
    .drop_duplicates(dup_cols, keep="last")
)
print(df.shape)

6

(2441398, 25)


In [15]:
class LightGBMDIAA(DIAABase):
    """
    Thin DIAABase subclass used in this notebook.

    Construction is delegated unchanged to DIAABase. The notebook only calls
    the inherited ``get_combined_df`` and ``get_stats``; ``train`` is a
    placeholder that is not implemented.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def train(self, df, features, model_params, context):
        """
        Placeholder for model training (see DIAABase.train_lgb_example).

        An earlier draft sketched the intended flow: read ``train_df`` and
        ``test_df`` from ``context``, fit an ``lgb.LGBMClassifier`` with
        monotone constraints, write predictions to a ``pred_lgbm`` column of
        both ``test_df`` and ``df``, save the model via ``self.save_model``,
        and return ``(df["pred_lgbm"], auc)``. That draft referenced names
        that are not defined in this notebook, so it is not kept as code.

        Raises
        ------
        NotImplementedError
            Always; training is not implemented here.
        """
        # NotImplementedError is the exception class; raising the
        # NotImplemented constant would fail with a TypeError instead.
        raise NotImplementedError("LightGBMDIAA.train is not implemented")

In [16]:
# Build the fair-lending helper on the deduplicated frame, pointing it at the
# cleaned first/last name columns and the zip column.
FL = LightGBMDIAA(df, "first_clean", "last_clean", "applied_zip")

Demographic data processed, containing dict_keys(['us_census', 'first_names', 'last_names_races', 'dem_pcts'])
missing rate for zip5 match: 0.00705
missing rate for first name match: 0.0473
missing rate for last name match: 0.10011


In [17]:
# Presumably df joined with the demographic proxy columns (e.g. _PCT_BLACK,
# used in the next cell); TODO confirm against DIAABase.get_combined_df.
fdf = FL.get_combined_df()

In [18]:
# Most frequent (first, last, ri_source) combinations among rows where
# _PCT_BLACK is missing.
# NOTE(review): the rendered output lists applicant names; clear or redact it
# before sharing or committing the notebook.
fdf[fdf._PCT_BLACK.isna()][["first", "last", "ri_source"]].value_counts().head()

first     last        ri_source
Stacy     Mcfadden    others       16
Stephen   Robson      proxy        11
Samantha  Munsterman  others       10
Ivanna    Strook      others        8
Vikram    Gandikota   others        8
dtype: int64

In [19]:
def _color_red_or_green(val):
    if isinstance(val, str):
        color = 'red' if val != "Passed" else "green"
    elif isinstance(val, float):
        color = 'red' if val < -0.3 else 'green'
    return 'background-color: %s' % color

# df.style.applymap(_color_red_or_green)

In [20]:
def scale_prescreen_scores(pred):
    """
    Map probabilities onto a score bounded to [300, 850].

    score = log(pred / (1 - pred)) * factor + offset, with
    factor = 40 / ln(2) and offset = 650 - factor * ln(5); i.e. odds of 5:1
    map to 650 and each doubling of the odds adds 40 points. The result is
    then clamped to the 300-850 range.

    ``pred`` must support elementwise comparison and ``.all()`` (e.g. a numpy
    array or pandas Series). Raises AssertionError when any value lies
    outside [0, 1].
    """
    within_unit_interval = (pred >= 0) & (pred <= 1)
    assert within_unit_interval.all(), "probability must be in range [0,1]"

    # Score bounds
    MINIMUM, MAXIMUM = 300, 850

    # Scaling parameters
    factor = 40/np.log(2)
    offset = 650 - factor * np.log(5)

    log_odds = np.log(pred / (1 - pred))
    unbounded = log_odds * factor + offset
    floored = np.maximum(unbounded, MINIMUM)
    return np.minimum(floored, MAXIMUM)

In [21]:
# Restrict to applicants with FICO >= 640 and rebuild the fair-lending helper
# on that subset. Note that fdf is overwritten here.
df_640 = df[df.applicant_fico_score >= 640]
print(df_640.shape)
FL640 = LightGBMDIAA(df_640, "first_clean", "last_clean", "applied_zip")
fdf = FL640.get_combined_df()

# Per-group statistics for the prescreen score; the SMD and test-result cells
# are coloured by _color_red_or_green.
# NOTE(review): Styler.applymap is deprecated in favour of Styler.map on
# recent pandas; confirm the pinned pandas version before changing it.
stats = FL640.get_stats("no_cutoff", "gen4_prescreen_score", age_col="age")
stats.style.applymap(_color_red_or_green, subset=["SMD", "test result"])

(1895407, 25)
Demographic data processed, containing dict_keys(['us_census', 'first_names', 'last_names_races', 'dem_pcts'])
missing rate for zip5 match: 0.00676
missing rate for first name match: 0.04586
missing rate for last name match: 0.10572

            NOTE: this implementation assumes higher score indicates lower risk
            i.e. same direction as fico
            if your model prediction/score predict 'badness' of data
            - higher score indicates higher risk, 
            please invert/flip the score to achieve intended result.
            e.g. scale_score(1-pred)
            


Unnamed: 0,t-test,p-value,SMD,test result
senior,51.146778,0.0,0.142628,Passed
female,-57.602691,0.0,-0.089106,Passed
black,-103.930072,0.0,-0.234538,Passed
hispanic,-130.207255,0.0,-0.284356,Passed
asian_pi,-1.033258,0.301483,-0.065849,Passed
ai_an,-16.664186,0.0,-0.17725,Passed
o_2race,-5.7809,0.0,-0.078279,Passed


In [22]:
# Same FICO >= 640 subset and helper as the prescreen cell.
# NOTE(review): these four setup lines repeat the previous cell verbatim; only
# the score column passed to get_stats differs. Consider a helper function
# parameterised by score column.
df_640 = df[df.applicant_fico_score >= 640]
print(df_640.shape)
FL640 = LightGBMDIAA(df_640, "first_clean", "last_clean", "applied_zip")
fdf = FL640.get_combined_df()

# Per-group statistics for the underwriting score (gen4_underwriting_score_29).
stats = FL640.get_stats("no_cutoff", "gen4_underwriting_score_29", age_col="age")
stats.style.applymap(_color_red_or_green, subset=["SMD", "test result"])

(1895407, 25)
Demographic data processed, containing dict_keys(['us_census', 'first_names', 'last_names_races', 'dem_pcts'])
missing rate for zip5 match: 0.00676
missing rate for first name match: 0.04586
missing rate for last name match: 0.10572

            NOTE: this implementation assumes higher score indicates lower risk
            i.e. same direction as fico
            if your model prediction/score predict 'badness' of data
            - higher score indicates higher risk, 
            please invert/flip the score to achieve intended result.
            e.g. scale_score(1-pred)
            


Unnamed: 0,t-test,p-value,SMD,test result
senior,36.344427,0.0,0.101385,Passed
female,-83.653757,0.0,-0.129289,Passed
black,-100.299677,0.0,-0.224622,Passed
hispanic,-128.79186,0.0,-0.277374,Passed
asian_pi,0.235186,0.814064,-0.054296,Passed
ai_an,-16.706967,0.0,-0.186182,Passed
o_2race,-7.10623,0.0,-0.069888,Passed
