In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score
import datetime
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from joblib import dump, load
#rom sklearn.ensemble import RandomForestClassifier
from cuml.ensemble import RandomForestClassifier

In [2]:
#constants
rand_state = 1337


In [3]:
def set_col_types(df, target_col=True):
    if target_col:
        categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68','target']
    else:
        categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
        df['customer_ID'] = df['customer_ID'].astype('object')
    
    for col in categorical_cols:
        df[col] = df[col].astype('category')
    df["S_2"] = pd.to_datetime(df['S_2'], format=r'%Y-%m-%d').astype('datetime64[ns]')
    return df









def preprocess_data(df, label_cols=[], drop_cols=[]):

  df = df.fillna(np.nan) #because SimpleImputer requires specification of the type of nan value, we use this generic to change all nan types to np.nan types

  df = df.drop(columns=drop_cols)
  print(df.isna().sum().sum(), "nulls exist after drop")

  df_labels = df[label_cols] #splits any specified columns off to a label df
  df = df.drop(columns=label_cols)

  cat_cols = df.select_dtypes(include="category")
  num_cols = df.select_dtypes(include="number")
  date_cols = df.select_dtypes(include="datetime")
  other_cols = df.select_dtypes(exclude={"category","number","datetime"})

  #impute cat cols
  for col in cat_cols:
      if cat_cols[col].isna().any():
        cat_cols[col] = cat_cols[col].cat.add_categories('⍼')
  cat_cols = cat_cols.fillna('⍼')

  #impute num cols
  # for col in num_cols:
  #   if num_cols[col].isna().any():
  #     num_cols[col] = num_cols[col].fillna(num_cols[col].mean())
  num_cols_imputed = SimpleImputer(strategy="mean").fit_transform(num_cols)
  num_cols = pd.DataFrame(num_cols_imputed, columns=num_cols.columns)

  #scale num
  num_cols_scaled = StandardScaler().fit_transform(num_cols)
  num_cols_scaled = pd.DataFrame(num_cols_scaled, columns=num_cols.columns)
  num_cols = num_cols_scaled

  #get dummies for cat cols
  cat_cols = pd.get_dummies(cat_cols)

  #change datetime into components
  date_cols_expanded = pd.DataFrame()
  for col in date_cols:
    date_cols_expanded[col + "Month"] = date_cols[col].dt.month
    date_cols_expanded[col + "Day"] = date_cols[col].dt.day
    date_cols_expanded[col + "Year"] = date_cols[col].dt.year

  date_cols = date_cols_expanded

  #recombine columns
  df = pd.concat([other_cols, date_cols, num_cols, cat_cols], axis=1)

  if df.isna().sum().sum() > 0:
    print(f"WARNING: {df.isna().sum().sum()} nulls still exist after imputing.")
  else:
    print("No nulls exist after imputing.")
  
  if len(label_cols)>0:
    return df, df_labels
  else:
    return df







    


def sync_cols(train_df, pred_df):
    for col in train_df.columns:
      if col not in pred_df.columns:
        print(col, "not in pred_df so adding - should always be categorical!")
        pred_df[col] = 0
    for col in pred_df.columns:
      if col not in train_df.columns:
        print(col, "not in train_df so dropping")
        pred_df = pred_df.drop(col, axis=1)
    return pred_df









def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)


In [4]:
df = pd.read_parquet(r'../../amex-default-prediction/train_data.parquet')
df = set_col_types(df)

#reduce df for development !!!!! comment out line below for final model
#df = df[:200000]

df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8)

In [5]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Data columns (total 192 columns):
 #    Column         Dtype         
---   ------         -----         
 0    customer_ID    object        
 1    S_2            datetime64[ns]
 2    P_2            float32       
 3    D_39           float32       
 4    B_1            float32       
 5    B_2            float32       
 6    R_1            float32       
 7    S_3            float32       
 8    D_41           float32       
 9    B_3            float32       
 10   D_42           float32       
 11   D_43           float32       
 12   D_44           float32       
 13   B_4            float32       
 14   D_45           float32       
 15   B_5            float32       
 16   R_2            float32       
 17   D_46           float32       
 18   D_47           float32       
 19   D_48           float32       
 20   D_49           float32       
 21   B_6            float32       
 22   B_7            f

In [6]:
print(df.isna().sum().sum())
print(df.isna().sum()) #count nulls in each col

160858968
customer_ID           0
S_2                   0
P_2               45985
D_39                  0
B_1                   0
                  ...  
D_143            101548
D_144             40727
D_145            101548
target                0
statement_num         0
Length: 192, dtype: int64


In [7]:
#build list of columns with 50 percent missing values
percent_null = df.isnull().sum() / len(df) 
half_missing_cols = percent_null[percent_null > 0.5].index.to_list()

In [8]:
#prep data
y = df['target'].astype(np.int8)
x = df.drop(columns=["target"])
del df

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=rand_state)
del y
del x
# IMPORTANT! - MUST reset the index because numcols gets it's index reset by either simpleimputer or standardscaler so concat later on will produce mismatched rows
x_train.reset_index(inplace=True, drop=True)
x_test.reset_index(inplace=True, drop=True) 
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [9]:
x_train, x_train_labels = preprocess_data(x_train, label_cols=["customer_ID"], drop_cols=half_missing_cols)

12196010 nulls exist after drop
No nulls exist after imputing.


In [10]:
#build model
rf_all = RandomForestClassifier(random_state=rand_state, n_bins=512)
rf_all.fit(x_train, y_train)

  return func(**kwargs)
  ret_val = func(*args, **kwargs)


RandomForestClassifier()

In [11]:
del x_train
del y_train
del x_train_labels

x_test, x_test_labels = preprocess_data(x_test, label_cols=["customer_ID"], drop_cols=half_missing_cols)

rf_all.score(x_test, y_test)

5220592 nulls exist after drop
No nulls exist after imputing.


0.8762170076370239

In [None]:
#pickle model

dump(rf_all, r'../../amex-default-prediction/rf_all_by_parquet.joblib')


['../../amex-default-prediction/rf_all_by_parquet.joblib']

In [None]:
x_test, x_test_labels = preprocess_data(x_test, label_cols=["customer_ID"], drop_cols=half_missing_cols)

rf_all.score(x_test, y_test)

NameError: name 'x_test' is not defined

In [None]:
test_expected_df = pd.DataFrame(y_test, columns=['target'])
test_expected_df['target'] = test_expected_df['target'].astype(int)

test_predict_df = rf_all.predict_proba(x_test)
test_predict_df = pd.DataFrame(test_predict_df,columns=["proba-inv","prediction"]).drop(columns="proba-inv")

print(amex_metric(test_expected_df, test_predict_df))

0.7350931444401154


In [None]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Data columns (total 192 columns):
 #    Column         Dtype         
---   ------         -----         
 0    customer_ID    object        
 1    S_2            datetime64[ns]
 2    P_2            float32       
 3    D_39           float32       
 4    B_1            float32       
 5    B_2            float32       
 6    R_1            float32       
 7    S_3            float32       
 8    D_41           float32       
 9    B_3            float32       
 10   D_42           float32       
 11   D_43           float32       
 12   D_44           float32       
 13   B_4            float32       
 14   D_45           float32       
 15   B_5            float32       
 16   R_2            float32       
 17   D_46           float32       
 18   D_47           float32       
 19   D_48           float32       
 20   D_49           float32       
 21   B_6            float32       
 22   B_7            f

In [None]:
df = pd.read_parquet(r'../../amex-default-prediction/test_data.parquet')

In [None]:

df = set_col_types(df, target_col=False)


#reduce df for development !!!!! comment out line below for final model
#df = df[:100000]

df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8)

#prep test data
full_test, full_test_labels = preprocess_data(df, label_cols=["customer_ID"], drop_cols=half_missing_cols)
del df

full_test = sync_cols(x_train, full_test)

rf_all_prediction_preds = rf_all.predict(full_test)
rf_all_prediction_proba = rf_all.predict_proba(full_test)
rf_all_prediction_output = pd.concat([full_test_labels,pd.DataFrame(rf_all_prediction_preds,columns=["pred"]),pd.DataFrame(rf_all_prediction_proba,columns=["proba-inv","proba"])], axis=1)


rf_all_prediction_output.to_csv(r"../../amex-default-prediction/rf_all_output_parquet.csv",index=False)

32578874 nulls exist after drop
No nulls exist after imputing.
D_64_-1 not in pred_df so adding - should always be categorical!
D_68_0.0 not in pred_df so adding - should always be categorical!


Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.

