In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import datetime
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np


In [2]:
#constants
rand_state = 1337

In [3]:
def set_col_types(df, target_col=True):
    if target_col:
        categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68','target']
    else:
        categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
        df['customer_ID'] = df['customer_ID'].astype('object')
    
    for col in categorical_cols:
        df[col] = df[col].astype('category')
    df["S_2"] = pd.to_datetime(df['S_2'], format=r'%Y-%m-%d').astype('datetime64[ns]')
    return df









def preprocess_data(df, label_cols=[], drop_cols=[]):

  df = df.fillna(np.nan) #because SimpleImputer requires specification of the type of nan value, we use this generic to change all nan types to np.nan types

  df = df.drop(columns=drop_cols)
  print(df.isna().sum().sum(), "nulls exist after drop")

  df_labels = df[label_cols] #splits any specified columns off to a label df
  df = df.drop(columns=label_cols)

  cat_cols = df.select_dtypes(include="category")
  num_cols = df.select_dtypes(include="number")
  date_cols = df.select_dtypes(include="datetime")
  other_cols = df.select_dtypes(exclude={"category","number","datetime"})

  #impute cat cols
  for col in cat_cols:
      if cat_cols[col].isna().any():
        cat_cols[col] = cat_cols[col].cat.add_categories('⍼')
  cat_cols = cat_cols.fillna('⍼')

  #impute num cols
  # for col in num_cols:
  #   if num_cols[col].isna().any():
  #     num_cols[col] = num_cols[col].fillna(num_cols[col].mean())
  num_cols_imputed = SimpleImputer(strategy="mean").fit_transform(num_cols)
  num_cols = pd.DataFrame(num_cols_imputed, columns=num_cols.columns)

  #scale num
  num_cols_scaled = StandardScaler().fit_transform(num_cols)
  num_cols_scaled = pd.DataFrame(num_cols_scaled, columns=num_cols.columns)
  num_cols = num_cols_scaled

  #get dummies for cat cols
  cat_cols = pd.get_dummies(cat_cols)

  #change datetime into components
  date_cols_expanded = pd.DataFrame()
  for col in date_cols:
    date_cols_expanded[col + "Month"] = date_cols[col].dt.month
    date_cols_expanded[col + "Day"] = date_cols[col].dt.day
    date_cols_expanded[col + "Year"] = date_cols[col].dt.year

  date_cols = date_cols_expanded.fillna(0)  ### ONLY USE THIS ON 3 WIDE - FILLS IN MISSING DATE COMPONENTS FOR PEOPLE WITH < 3 STATEMENTS

  #recombine columns
  df = pd.concat([other_cols, date_cols, num_cols, cat_cols], axis=1)

  if df.isna().sum().sum() > 0:
    print(f"WARNING: {df.isna().sum().sum()} nulls still exist after imputing.")
  else:
    print("No nulls exist after imputing.")
  
  if len(label_cols)>0:
    return df, df_labels
  else:
    return df







    


def sync_cols(train_df, pred_df):
    for col in train_df.columns:
      if col not in pred_df.columns:
        print(col, "not in pred_df so adding - should always be categorical!")
        pred_df[col] = 0
    for col in pred_df.columns:
      if col not in train_df.columns:
        print(col, "not in train_df so dropping")
        pred_df = pred_df.drop(col, axis=1)
    return pred_df









def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)


In [4]:
df = pd.read_parquet(r'../../amex-default-prediction/train_data.parquet')
df = set_col_types(df)
#reduce df for development !!!!! comment out line below for final model
#df = df[:100000]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Columns: 191 entries, customer_ID to target
dtypes: category(12), datetime64[ns](1), float32(176), int64(1), object(1)
memory usage: 3.9+ GB


In [6]:
#df['S_2'] = np.int8(df['S_2'].dt.month) #reduce S_2 date column to month only for seasonality

In [7]:
df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(int)
df_1 = df[df['statement_num'] == 1] #last
df_2 = df[df['statement_num'] == 2] #2nd last
df_3 = df[df['statement_num'] == 3] #3rd last


df_comb = df_1.merge(df_2, how='left', on='customer_ID', suffixes=('', '_2'))
df_comb = df_comb.merge(df_3, how='left', on='customer_ID', suffixes=('_1', '_3'))
df_comb = df_comb.drop(['statement_num_1','target_2', 'statement_num_2','target_3', 'statement_num_3'], axis=1)
df_comb.rename(columns={'target_1':'target'}, inplace=True)
df_comb

Unnamed: 0,customer_ID,S_2_1,P_2_1,D_39_1,B_1_1,B_2_1,R_1_1,S_3_1,D_41_1,B_3_1,...,D_136_3,D_137_3,D_138_3,D_139_3,D_140_3,D_141_3,D_142_3,D_143_3,D_144_3,D_145_3
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2018-03-13,0.934745,0.009119,0.009382,1.007647,0.006104,0.135021,0.001604,0.007174,...,,,,0.000427,0.004594,0.003613,,0.007568,0.003004,0.006362
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2018-03-25,0.880519,0.178126,0.034684,1.004028,0.006911,0.165509,0.005552,0.005068,...,,,,0.007274,0.008972,0.002497,,0.003979,0.006627,0.002178
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2018-03-12,0.880875,0.009704,0.004284,0.812649,0.006450,,0.003796,0.007196,...,,,,0.009116,0.003886,0.000715,,0.007705,0.009415,0.000563
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,2018-03-29,0.621776,0.001083,0.012564,1.006183,0.007829,0.287766,0.004532,0.009937,...,,,,0.002382,0.003795,0.000637,,0.009453,0.004793,0.000834
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,2018-03-30,0.871900,0.005573,0.007679,0.815746,0.001247,,0.000231,0.005528,...,,,,0.002578,0.001665,0.000579,,0.004662,0.002551,0.009652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458908,ffff41c8a52833b56430603969b9ca48d208e7c192c6a4...,2018-03-31,0.844229,0.447585,0.028515,1.009866,0.001928,0.128707,0.003482,0.005893,...,,,,0.008494,0.001622,0.009657,,0.005007,0.008420,0.002532
458909,ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fd...,2018-03-22,0.831279,0.033670,0.292360,0.055656,0.006953,,0.005791,0.233078,...,,,,0.002530,0.007390,0.003150,,0.007441,0.005510,0.002929
458910,ffff9984b999fccb2b6127635ed0736dda94e544e67e02...,2018-03-07,0.800522,0.267018,0.020563,1.007023,0.000957,0.066648,0.007424,0.006314,...,,,,,0.000498,,,,0.004904,
458911,ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf38814...,2018-03-23,0.754129,0.008619,0.015838,0.714486,0.000993,0.408849,0.003392,0.050048,...,,,,1.009795,0.008908,0.941023,0.433807,1.003074,0.007104,0.182622


In [8]:
df = df_comb #overwriting df so I can reuse code from rf_pipeline
del df_1
del df_2
del df_3
del df_comb

In [9]:
#build list of columns with 50 percent missing values
percent_null = df.isnull().sum() / len(df) 
half_missing_cols = percent_null[percent_null > 0.5].index

In [10]:
#build model
y = df['target']
x = df.drop(columns=["target"])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=rand_state)
# IMPORTANT! - MUST reset the index because numcols gets it's index reset by either simpleimputer or standardscaler so concat later on will produce mismatched rows
x_train.reset_index(inplace=True, drop=True)
x_test.reset_index(inplace=True, drop=True) 
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

x_train, x_train_labels = preprocess_data(x_train, label_cols=["customer_ID"], drop_cols=half_missing_cols)


4870428 nulls exist after drop
No nulls exist after imputing.


In [11]:
rf_all = RandomForestClassifier(random_state=rand_state, n_jobs=-1)
print("Fitting Model")
rf_all.fit(x_train, y_train)

Fitting Model


In [12]:
x_test, x_test_labels = preprocess_data(x_test, label_cols=["customer_ID"], drop_cols=half_missing_cols)

x_test = sync_cols(x_train, x_test)

2085378 nulls exist after drop
No nulls exist after imputing.


In [24]:
test_expected_df = pd.DataFrame(y_test, columns=['target'])
test_expected_df['target'] = test_expected_df['target'].astype(int)

test_predict_df = rf_all.predict_proba(x_test)
test_predict_df = pd.DataFrame(test_predict_df,columns=["proba-inv","prediction"]).drop(columns="proba-inv")

print(amex_metric(test_expected_df, test_predict_df))

0.7222298604754077


In [14]:
del df #free up memory

df = pd.read_parquet(r'../../amex-default-prediction/test_data.parquet')
#reduce df for development !!!!! comment out line below for final model
#df = df[:100000]

df = set_col_types(df, target_col=False)

df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8)

In [15]:
df_1 = df[df['statement_num'] == 1] #last
df_2 = df[df['statement_num'] == 2] #2nd last
df_3 = df[df['statement_num'] == 3] #3rd last


df_comb = df_1.merge(df_2, how='left', on='customer_ID', suffixes=('', '_2'))
df_comb = df_comb.merge(df_3, how='left', on='customer_ID', suffixes=('_1', '_3'))
df_comb = df_comb.drop(['statement_num_1', 'statement_num_2', 'statement_num_3'], axis=1)
df_comb.rename(columns={'target_1':'target'}, inplace=True)
df_comb

df = df_comb #overwriting df so I can reuse code from rf_pipeline

del df_1
del df_2
del df_3
del df_comb

In [16]:
df, df_labels = preprocess_data(df, label_cols=["customer_ID"], drop_cols=half_missing_cols)

df = sync_cols(x_train, df)

11012246 nulls exist after drop
No nulls exist after imputing.
D_64_1_-1 not in pred_df so adding - should always be categorical!
D_68_1_0.0 not in pred_df so adding - should always be categorical!
D_64_2_-1 not in pred_df so adding - should always be categorical!
D_68_2_0.0 not in pred_df so adding - should always be categorical!
D_64_3_-1 not in pred_df so adding - should always be categorical!
D_68_3_0.0 not in pred_df so adding - should always be categorical!


In [17]:
rf_wide_prediction_preds = rf_all.predict(df)
rf_wide_prediction_proba = rf_all.predict_proba(df)
rf_wide_prediction_output = pd.concat([df_labels,pd.DataFrame(rf_wide_prediction_preds,columns=["pred"]),pd.DataFrame(rf_wide_prediction_proba,columns=["proba-inv","proba"])], axis=1)

rf_wide_prediction_output = rf_wide_prediction_output.drop(['pred', 'proba-inv'], axis=1)
rf_wide_prediction_output = rf_wide_prediction_output.rename(columns={"proba":"prediction"})



Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



In [18]:
rf_wide_prediction_output.to_csv(r"..\..\amex-default-prediction\rf_wide_neg_one_output.csv",index=False)