In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import datetime
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from joblib import dump, load

In [1]:
import cudf

In [6]:
# Notebook-wide constants.
# rand_state is a fixed seed value; it is not referenced in the cells shown here —
# presumably it is passed to estimators / splits further down (TODO confirm).
rand_state = 1337


In [7]:
def set_col_types(df, target_col=True):
    """Cast the known categorical columns and the statement date of ``df`` in place.

    Args:
        df: frame holding the categorical feature columns listed below plus 'S_2'
            ('target' as well when ``target_col`` is true, 'customer_ID' when it is false).
        target_col: when true, 'target' is cast to category too; when false,
            'customer_ID' is cast to object instead.

    Returns:
        The same ``df`` object, with the columns re-typed.
    """
    to_category = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    if target_col:
        to_category.append('target')
    else:
        df['customer_ID'] = df['customer_ID'].astype('object')

    for name in to_category:
        df[name] = df[name].astype('category')

    # Parse the ISO-formatted statement date, then pin the resolution to nanoseconds.
    statement_dates = pd.to_datetime(df['S_2'], format=r'%Y-%m-%d')
    df["S_2"] = statement_dates.astype('datetime64[ns]')
    return df









def preprocess_data(df, label_cols=None, drop_cols=None):
    """Impute, scale and encode a feature frame, optionally splitting off label columns.

    Processing order:
      1. drop ``drop_cols`` and set ``label_cols`` aside;
      2. categorical columns: missing values become the placeholder category '⍼',
         then the columns are one-hot encoded with ``pd.get_dummies``;
      3. numeric columns: mean-imputed (``SimpleImputer``) and standardised
         (``StandardScaler``); both are fit on the frame passed in;
      4. datetime columns: replaced by ``<col>Month``, ``<col>Day`` and ``<col>Year``;
      5. columns of any other dtype are passed through unchanged.

    The row index of ``df`` is kept on every piece, so the recombined frame and the
    returned labels stay row-aligned even when ``df`` does not carry a default
    0..n-1 index (for example after row filtering).

    Args:
        df: input frame; it is not modified.
        label_cols: column name(s) to split off and return separately.
            ``None`` or an empty list means no labels.
        drop_cols: column name(s) to discard before processing. ``None`` means none.

    Returns:
        ``(features, labels)`` when ``label_cols`` is non-empty, otherwise ``features``.

    NOTE(review): a numeric column with no observed values may be discarded by
    SimpleImputer, which would make the column relabelling below fail — confirm
    against the installed scikit-learn version.
    """
    # None stands in for "no columns" so the defaults are not shared mutable lists.
    if label_cols is None:
        label_cols = []
    if drop_cols is None:
        drop_cols = []

    missing_token = '⍼'  # placeholder category used for missing categorical values

    # SimpleImputer needs one kind of missing marker, so normalise them all to np.nan.
    df = df.fillna(np.nan)

    df = df.drop(columns=drop_cols)
    print(df.isna().sum().sum(), "nulls exist after drop")

    df_labels = df[label_cols]  # splits any specified columns off to a label frame
    df = df.drop(columns=label_cols)

    cat_cols = df.select_dtypes(include="category")
    num_cols = df.select_dtypes(include="number")
    date_cols = df.select_dtypes(include="datetime")
    other_cols = df.select_dtypes(exclude={"category", "number", "datetime"})

    # Impute categorical columns one at a time: only columns that actually contain
    # missing values get the placeholder category, and only those are filled, so a
    # fully populated categorical column is never asked to accept an unknown category.
    for col in cat_cols:
        column = cat_cols[col]
        if column.isna().any():
            if missing_token not in column.cat.categories:
                column = column.cat.add_categories(missing_token)
            cat_cols[col] = column.fillna(missing_token)

    # Impute then scale numeric columns. The transformers return bare arrays, so the
    # original index must be re-attached; otherwise the concat below aligns on a fresh
    # RangeIndex and mismatches rows (introducing NaNs) for any non-default index.
    if num_cols.shape[1] > 0:
        num_imputed = SimpleImputer(strategy="mean").fit_transform(num_cols)
        num_scaled = StandardScaler().fit_transform(num_imputed)
        num_cols = pd.DataFrame(num_scaled, columns=num_cols.columns, index=num_cols.index)

    # One-hot encode the categorical columns.
    cat_cols = pd.get_dummies(cat_cols)

    # Break each datetime column into month / day / year components.
    date_parts = {}
    for col in date_cols:
        date_parts[col + "Month"] = date_cols[col].dt.month
        date_parts[col + "Day"] = date_cols[col].dt.day
        date_parts[col + "Year"] = date_cols[col].dt.year
    date_cols = pd.DataFrame(date_parts, index=date_cols.index)

    # Recombine the pieces column-wise (aligned on the shared row index).
    df = pd.concat([other_cols, date_cols, num_cols, cat_cols], axis=1)

    remaining_nulls = df.isna().sum().sum()
    if remaining_nulls > 0:
        print(f"WARNING: {remaining_nulls} nulls still exist after imputing.")
    else:
        print("No nulls exist after imputing.")

    if len(label_cols) > 0:
        return df, df_labels
    return df







    


def sync_cols(train_df, pred_df):
    """Return ``pred_df`` with exactly the columns of ``train_df``, in the same order.

    Columns that exist in ``train_df`` but not in ``pred_df`` are added and filled
    with 0; columns that exist only in ``pred_df`` are dropped. Each addition and
    removal is reported on stdout. The result follows the column order of
    ``train_df``, and the ``pred_df`` passed in is left unmodified.

    Args:
        train_df: frame whose columns define the target layout.
        pred_df: frame to conform to that layout.

    Returns:
        A new frame holding the rows of ``pred_df`` under the columns of ``train_df``.
    """
    for col in train_df.columns:
        if col not in pred_df.columns:
            print(col, "not in pred_df so adding - should always be categorical!")
    for col in pred_df.columns:
        if col not in train_df.columns:
            print(col, "not in train_df so dropping")

    # A single reindex adds the missing columns (as 0), drops the extras and puts
    # everything in the training order, without touching the caller's frame.
    return pred_df.reindex(columns=train_df.columns, fill_value=0)









def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows with ``target == 0`` carry a weight of 20, every other row a weight of 1.

    Args:
        y_true: frame with a 'target' column.
        y_pred: frame with a 'prediction' column; it is joined to ``y_true``
            column-wise, so both must share the same index.

    Returns:
        ``0.5 * (G + D)``, where G is the weighted Gini of the predictions divided by
        the weighted Gini of a perfect ranking, and D is the share of all positives
        found among the highest-scored rows making up 4% of the total weight.
    """

    def _ranked_with_weights(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order best score first, attach the row weights.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(labels, scores)
        weight_budget = int(0.04 * ranked['weight'].sum())
        inside_budget = ranked['weight'].cumsum() <= weight_budget
        is_positive = ranked['target'] == 1
        return (inside_budget & is_positive).sum() / is_positive.sum()

    def weighted_gini(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(labels, scores)
        weights = ranked['weight']
        random_curve = (weights / weights.sum()).cumsum()
        weighted_positives = ranked['target'] * weights
        lorentz_curve = weighted_positives.cumsum() / weighted_positives.sum()
        return ((lorentz_curve - random_curve) * weights).sum()

    def normalized_weighted_gini(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        # A perfect ranking is obtained by using the labels themselves as scores.
        perfect_scores = labels.rename(columns={'target': 'prediction'})
        return weighted_gini(labels, scores) / weighted_gini(labels, perfect_scores)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)


In [8]:
# Load the training statements and apply the categorical / datetime dtypes
# (with its default arguments set_col_types also casts 'target' to category).
df = pd.read_parquet(r'../../amex-default-prediction/train_data.parquet')
df = set_col_types(df)

# Development shortcut: keep only the first 100000 rows. Comment out the line below for the final model.
# NOTE(review): a plain row slice may cut one customer's statement history short — confirm that is acceptable for development runs.
df = df[:100000]

# Position of each statement within its customer, ordered by statement date S_2
# (method='first' breaks ties by row order). Stored as int8, which assumes fewer
# than 128 statements per customer.
# statement_num: 1 marks the customer's most recent statement.
df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8)
# statement_num_reverse: 1 marks the customer's earliest statement.
df['statement_num_reverse'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=True).astype(np.int8)

In [71]:
# List every column with its dtype to check the casts applied by set_col_types.
# NOTE(review): the saved output below reports 200000 rows although the load cell keeps
# only the first 100000 — the output looks stale; re-run the notebook top to bottom to refresh it.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Data columns (total 193 columns):
 #    Column                 Dtype         
---   ------                 -----         
 0    customer_ID            object        
 1    S_2                    datetime64[ns]
 2    P_2                    float32       
 3    D_39                   float32       
 4    B_1                    float32       
 5    B_2                    float32       
 6    R_1                    float32       
 7    S_3                    float32       
 8    D_41                   float32       
 9    B_3                    float32       
 10   D_42                   float32       
 11   D_43                   float32       
 12   D_44                   float32       
 13   B_4                    float32       
 14   D_45                   float32       
 15   B_5                    float32       
 16   R_2                    float32       
 17   D_46                   float32       
 18   D_

In [72]:
# Distribution of history length: the inner value_counts gives the number of statements
# per customer_ID, the outer one counts how many customers share each statement count.
(df['customer_ID'].value_counts()).value_counts()

13    13925
12      399
10      265
8       241
9       232
11      210
3       208
2       199
7       195
6       185
4       175
1       174
5       168
Name: customer_ID, dtype: int64

In [9]:
# Some customers have only a single statement (see the counts above), so they have no
# first/last pair to compare. Split them into their own frame.
cust_only_one = df['customer_ID'].value_counts()  # number of statements per customer_ID
cust_only_one = cust_only_one[cust_only_one == 1].index  # customer_IDs that appear exactly once

df_cust_one_statement = df.loc[df['customer_ID'].isin(cust_only_one)]  # customers with exactly one statement
df = df.loc[~df['customer_ID'].isin(cust_only_one)]  # customers with more than one statement

del cust_only_one

In [10]:
# Export the single-statement customers so they no longer need to be kept in memory here;
# the plan is to run them through the last-statement RF model.
df_cust_one_statement.to_parquet(r'../../amex-default-prediction/model_output/rf_delta_first_last/cust_one_statement.parquet')
del df_cust_one_statement

# From here on only "df" remains in memory, holding the customers with multiple statements.
# Next step: reduce each of them to their first and last statement.

In [11]:
# Keep one row per customer for each end of the history:
# statement_num == 1 is the most recent statement, statement_num_reverse == 1 the earliest.
df_last = df[df['statement_num'] == 1]
df_first = df[df['statement_num_reverse'] == 1]
# The full frame is no longer needed once both ends have been extracted.
del df

In [12]:
# Set the identifying columns (customer and statement date) aside so that only
# feature columns remain in df_last for encoding and imputation.
df_last_labels = df_last[['customer_ID', 'S_2']]
df_last = df_last.drop(['customer_ID', 'S_2'], axis='columns')

In [13]:
# One-hot encode the categorical columns so the imputers below receive a numeric matrix.
# NOTE(review): 'target' is still part of df_last and was cast to category by set_col_types,
# so it is one-hot encoded here as well — split it off first if this frame feeds a model.
df_last = pd.get_dummies(df_last)

In [None]:
# TODO: unfinished experiment — the attribute access after `cudf.` was never completed, so the
# statement is a syntax error as written. Presumably a GPU-backed imputer was intended (confirm).
# Kept as a comment so the notebook can run top to bottom.
# knn_imputer = cudf.

In [81]:
# Fill each missing value from the single nearest neighbour (n_neighbors=1).
# The imputed array is only displayed, not assigned to a variable.
# NOTE(review): the traceback saved below complains about an 'n_jobs' keyword that this cell
# does not pass, so it appears to be left over from an earlier version of the cell — re-run to refresh it.
knn_imputer = KNNImputer(n_neighbors=1)
knn_imputer.fit_transform(df_last)

TypeError: __init__() got an unexpected keyword argument 'n_jobs'

In [79]:
# Baseline imputation: replace missing values in each column with that column's mean.
# The resulting array is only displayed, not assigned to a variable.
mean_imputer = SimpleImputer(strategy='mean')
mean_imputer.fit_transform(df_last)

array([[0.93474483, 0.00911864, 0.00938244, ..., 1.        , 1.        ,
        0.        ],
       [0.88051909, 0.17812583, 0.0346842 , ..., 1.        , 1.        ,
        0.        ],
       [0.88087451, 0.00970358, 0.00428367, ..., 1.        , 1.        ,
        0.        ],
       ...,
       [0.68783605, 0.00609202, 0.00782171, ..., 1.        , 1.        ,
        0.        ],
       [0.87805784, 0.21224305, 0.03065685, ..., 1.        , 1.        ,
        0.        ],
       [0.65540254, 0.0020595 , 0.02236446, ..., 1.        , 1.        ,
        0.        ]])