In [None]:
# Mount Google Drive (Colab only) at /content/drive; the data folder used below lives under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Set the working directory to the data folder; the relative file names
# defined in the configuration cell (train/test parquet, labels CSV) resolve against it
import os
os.chdir('/content/drive/MyDrive/Amex/parquet')

# Load Libraries

In [None]:
# LOAD LIBRARIES
import pandas as pd, numpy as np # CPU libraries
import matplotlib.pyplot as plt, gc, os

from sklearn.model_selection import KFold
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
import csv, itertools

from sklearn.model_selection import train_test_split
from xgboost import plot_importance
from sklearn.inspection import permutation_importance
from tqdm.auto import tqdm


In [None]:
# VERSION NAME FOR SAVED MODEL FILES
# (also part of the file names of the cached features, differences and submission)
VER = 12

# TRAIN RANDOM SEED
SEED = 42

# FILL NAN VALUE
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 5

# NUMBER OF PARTS THE TEST DATA IS SPLIT INTO FOR INFERENCE
NUM_PARTS = 10


# INPUT FILES (relative names, resolved against the current working directory)
TRAIN_PATH = 'train.parquet'
TEST_PATH = 'test.parquet'
TARGET_PATH = 'train_labels.csv'
# OUTPUT FOLDER FOR CACHED FEATURES, SAVED MODELS AND THE SUBMISSION FILE
SAVE_PATH = '/content/drive/MyDrive/Amex/parquet/XGB final/'
# SAMPLE SUBMISSION, READ FOR THE customer_ID COLUMN OF THE FINAL SUBMISSION
SUBMISSION_FILE_PATH = '/content/drive/MyDrive/Amex/parquet/sample_submission.csv'

# Process and Feature Engineer Train Data

In [None]:
def read_file(path = '', usecols = None):
    """Load a parquet file and normalise its two key columns.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list of str, optional
        Columns to load; every column is loaded when omitted. The selection
        must contain ``customer_ID`` and ``S_2`` because both are converted.

    Returns
    -------
    pandas.DataFrame
        The loaded frame, with ``customer_ID`` replaced by the integer value
        of its last 16 hexadecimal characters and ``S_2`` parsed as datetimes.
    """
    read_kwargs = {} if usecols is None else {'columns': usecols}
    frame = pd.read_parquet(path, **read_kwargs)

    # Only the trailing 16 hex digits of the ID are kept, stored as an integer
    hex_tail = frame['customer_ID'].str[-16:]
    frame['customer_ID'] = hex_tail.apply(int, base=16)

    frame['S_2'] = pd.to_datetime(frame.S_2)
    print('shape of data:', frame.shape)
    return frame

def revertnan(df):
  """Turn every -1 entry of ``df`` into NaN.

  The frame is modified in place and also returned.
  """
  minus_one_cells = (df == -1)
  df[minus_one_cells] = np.nan
  return df

def fill_na(df, NAN_VALUE):
  """Return ``df`` with every missing value replaced by ``NAN_VALUE``."""
  return df.fillna(NAN_VALUE)

def numberobs_feature(df):
  """Add ``number_of_observations``: the number of rows each customer has.

  Customers with a single row whose ``B_33`` is missing get the marker value
  0.5 instead of 1. The column is added to ``df`` in place and ``df`` is
  returned.

  The count is stored as float64 from the start. Writing 0.5 into the integer
  result of ``transform('count')`` relies on implicit upcasting, which recent
  pandas versions deprecate, and it made the column dtype depend on whether
  any row happened to receive the marker.
  """
  per_customer_count = df.groupby('customer_ID')['customer_ID'].transform('count')
  df['number_of_observations'] = per_customer_count.astype('float64')

  single_row_without_b33 = df['B_33'].isnull() & (df['number_of_observations'] == 1)
  df.loc[single_row_without_b33, 'number_of_observations'] = 0.5
  return df

def afterpay(df):
  """Add the "after pay" difference features to ``df`` and return it.

  For each of the listed ``*_last`` columns that exists in ``df``, two new
  columns named ``"<column>-P_2_last"`` and ``"<column>-P_3_last"`` are
  added, holding the column minus the respective ``P_*_last`` column.
  Listed columns that are absent are skipped.
  """
  minuend_cols = (
      [f'B_{num}_last' for num in [11,14,17]]
      + ['D_39_last','D_131_last']
      + [f'S_{num}_last' for num in [16,23]]
  )
  subtrahend_cols = ['P_2_last','P_3_last']

  # The new column names never collide with a minuend name, so the
  # availability check can be done once up front
  available = [name for name in minuend_cols if name in df.columns]
  for minuend in available:
    for subtrahend in subtrahend_cols:
      df[f'{minuend}-{subtrahend}'] = df[minuend] - df[subtrahend]
  return df

def get_features(df):
  """Split the columns of ``df`` into feature groups.

  Returns
  -------
  tuple of (list, list, list)
      ``all_cols``: every column except ``customer_ID`` and ``S_2``;
      ``cat_features``: the fixed list of categorical column names;
      ``num_features``: the members of ``all_cols`` not in ``cat_features``.
  """
  cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
  key_cols = ['customer_ID','S_2']

  all_cols = [name for name in df.columns if name not in key_cols]
  num_features = [name for name in all_cols if name not in cat_features]
  return all_cols, cat_features, num_features

def agg_functions(df, num_features, cat_features, numberobs = False):
  """Aggregate the per-row data to one row per ``customer_ID``.

  Parameters
  ----------
  df : pandas.DataFrame
      Row-level data containing ``customer_ID`` and the listed feature columns.
  num_features : list of str
      Columns aggregated with mean, std, max, min, last and first.
  cat_features : list of str
      Columns aggregated with count, last and nunique.
  numberobs : bool
      When True, the aggregates of ``number_of_observations`` are reduced to
      the single column ``number_of_observations`` (taken from its ``last``
      aggregate); the other five aggregates of it are dropped.

  Returns
  -------
  pandas.DataFrame
      Indexed by ``customer_ID``, with flat column names ``<feature>_<stat>``.
  """
  by_customer = df.groupby("customer_ID")

  numeric_stats = by_customer[num_features].agg(['mean', 'std', 'max', 'min', 'last', 'first'])
  print('num agg complete')

  categorical_stats = by_customer[cat_features].agg(['count', 'last', 'nunique'])
  print('cat agg complete')

  combined = pd.concat([numeric_stats, categorical_stats], axis=1)
  print('concat complete')

  # Flatten the (feature, stat) column pairs into "feature_stat" names
  combined.columns = ['_'.join(pair) for pair in combined.columns]

  print('drop numberobs')
  if numberobs == True:
    redundant = [f'number_of_observations_{stat}' for stat in ('mean', 'std', 'max', 'min', 'first')]
    combined = combined.drop(columns=redundant)
    combined = combined.rename(columns={'number_of_observations_last': 'number_of_observations'})
  print('drop numberobs complete')

  del numeric_stats, categorical_stats
  _ = gc.collect()
  print('shape after engineering', combined.shape )
  return combined

def add_meandev(df, num_features):
  """Add ``<feature>_meandev`` = ``<feature>_last`` - ``<feature>_mean``.

  A column is added for every entry of ``num_features`` except
  ``number_of_observations``. ``df`` is modified in place and returned.
  Where ``<feature>_last`` (or ``<feature>_mean``) is missing, the result is
  NaN because NaN propagates through the subtraction.

  The previous implementation pre-filled the column with NaN and assigned
  through the mask ``df[last] != np.nan``. That comparison is True for every
  row (NaN never compares equal to anything, including itself), so the mask
  selected all rows and had no effect. The plain subtraction below yields the
  same values without the misleading condition.
  """
  for feature in num_features:
    if feature == 'number_of_observations':
      continue
    df[f'{feature}_meandev'] = df[f'{feature}_last'] - df[f'{feature}_mean']

  return df


def add_targets(df, TARGET_PATH):
  """Attach the ``target`` label to the customer-level feature frame.

  Parameters
  ----------
  df : pandas.DataFrame
      Features indexed by the integer ``customer_ID``.
  TARGET_PATH : str
      CSV file with the columns ``customer_ID`` (string) and ``target``.

  Returns
  -------
  pandas.DataFrame
      ``df`` left-joined with the labels on the index, ``target`` cast to
      int8, and the index moved back into a regular column.
  """
  labels = pd.read_csv(TARGET_PATH)
  # Same ID conversion as for the feature data: last 16 hex digits -> integer
  labels['customer_ID'] = labels['customer_ID'].str[-16:].apply(int, base=16)
  labels = labels.set_index('customer_ID')

  labelled = df.merge(labels, how='left', left_index=True, right_index=True, sort=True)
  labelled['target'] = labelled.target.astype('int8')

  # A default integer index is needed later, when folds are selected by row number
  return labelled.reset_index()


def get_difference(df, num_features, train_set = None, Part =None):
    """Compute, per customer, the change between the last two rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data with a ``customer_ID`` column and the ``num_features``
        columns; rows of a customer are taken in the order they appear.
    num_features : list of str
        Columns to difference.
    train_set : bool, optional
        True: the result is also saved as ``{SAVE_PATH}diff_{VER}.parquet``.
        False: it is saved as ``{SAVE_PATH}diff_test_{VER}_num{Part}.parquet``.
        Any other value (default None): nothing is written.
    Part : int, optional
        Part number used in the test file name.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_ID`` with float32 columns ``<feature>_diff1``.
        Customers with a single row get NaN.
    """
    last_diffs = []
    customer_ids = []
    # Group by the bare column name, not a one-element list: with a list,
    # pandas >= 2.0 yields 1-tuples as group keys, and an index made of
    # tuples would never match the integer customer_ID index of the
    # aggregated features when the two frames are merged.
    for customer_id, cus in tqdm(df.groupby('customer_ID')):
        # Row-to-row difference; only the last row of the customer is kept
        diff_last = cus[num_features].diff(1).iloc[[-1]].values.astype(np.float32)
        last_diffs.append(diff_last)
        customer_ids.append(customer_id)

    if last_diffs:
        diff_values = np.concatenate(last_diffs, axis = 0)
    else:
        # No customers at all: np.concatenate would raise on an empty list
        diff_values = np.empty((0, len(num_features)), dtype=np.float32)

    # Names are taken from num_features directly; selecting df[num_features]
    # just to read its column labels would copy the whole numeric frame
    df1 = pd.DataFrame(diff_values, columns = [col + '_diff1' for col in num_features])
    df1['customer_ID'] = customer_ids
    df1.set_index('customer_ID', inplace = True)

    if train_set == True: df1.to_parquet(f'{SAVE_PATH}diff_{VER}.parquet')
    elif train_set == False: df1.to_parquet(f'{SAVE_PATH}diff_test_{VER}_num{Part}.parquet')
    return df1
  
def onehot_encoding(df, cat_cols):
  """One-hot encode the ``<name>_last`` column of every entry in ``cat_cols``.

  The encoded source columns are removed and their dummy columns
  (``<name>_last_<value>``) are appended after the remaining columns, in the
  order of ``cat_cols``. The first level of each column is dropped.

  All columns are encoded in one ``get_dummies`` call. Calling it once per
  column rebuilt the complete frame on every iteration while producing the
  same columns in the same order.

  NOTE(review): ``drop_first`` drops the first level *present in this frame*,
  so frames that contain different category values (e.g. separate test parts)
  can end up with different dummy columns than the training frame. Confirm
  this is handled wherever train and test encodings must line up.
  """
  last_cols = [f'{name}_last' for name in cat_cols]
  if not last_cols:
    # Nothing to encode: hand back the frame itself, as before
    return df
  return pd.get_dummies(df, columns=last_cols, drop_first=True)


In [None]:
#Preprocessing Pipeline
def preprocess(PATH = TRAIN_PATH, TARGET_PATH = TARGET_PATH, train_set = True, test = None, Part = None):
  """Run the feature-engineering pipeline on the train file or on one test part.

  Parameters
  ----------
  PATH : str
      Parquet file that is read when ``train_set`` is True.
  TARGET_PATH : str
      CSV with the labels; only used when ``train_set`` is True.
  train_set : bool
      True: read ``PATH`` and append the targets at the end.
      Otherwise: process the frame passed as ``test``; no targets are added.
  test : pandas.DataFrame, optional
      Already loaded rows, required when ``train_set`` is not True. The frame
      is modified in place by the first pipeline steps.
  Part : int, optional
      Forwarded to ``get_difference``, which uses it in the file name of the
      saved test differences.

  Returns
  -------
  pandas.DataFrame
      One row per customer with all engineered features and missing values
      replaced by ``NAN_VALUE``. For the train set ``customer_ID`` is a
      regular column and ``target`` is appended; otherwise the frame is
      indexed by ``customer_ID``.

  Raises
  ------
  ValueError
      If ``train_set`` is not True and no ``test`` frame was passed.
  """
  if train_set == True:
    # Read the file named by the PATH argument. The global TRAIN_PATH is only
    # its default; it used to be read here regardless of what was passed.
    df = read_file(path = PATH)
  else:
    if test is None:
      raise ValueError('test must be provided when train_set is not True')
    df = test
  print('read file complete')
  df = revertnan(df)
  print('revertnan complete')
  df = numberobs_feature(df)
  print('numberobs complete')
  all_cols, cat_features, num_features = get_features(df)
  print('get features complete')
  # The differences need the row-level data, so they are computed before aggregating
  diff1 = get_difference(df, num_features, train_set = train_set, Part = Part)
  print('get diff complete')
  df = agg_functions(df, num_features, cat_features, numberobs = True)
  df = df.merge(diff1, left_index=True, right_index=True, how='left')
  del diff1
  _ = gc.collect()
  print('agg features complete')
  df = add_meandev(df, num_features)
  print('meandev complete')
  df = afterpay(df)
  print('afterpay complete')
  df = onehot_encoding(df, cat_cols=cat_features)
  print('onehot complete')
  df = fill_na(df, NAN_VALUE)
  print('fillna complete')
  gc.collect()
  if train_set == True:
    df = add_targets(df, TARGET_PATH)
  return df

In [None]:
# Build the training feature table and store it as parquet, so it can be
# reloaded later without repeating the preprocessing
train = preprocess(PATH = TRAIN_PATH)
train.to_parquet(f'{SAVE_PATH}train_{VER}.parquet')

In [None]:
# If the training features were already preprocessed and saved, load them
# from the parquet file written by the preprocessing cell instead of rebuilding them
train = pd.read_parquet(f'{SAVE_PATH}train_{VER}.parquet')

In [None]:
def get_feature_list(df):
  """Return the feature column labels of ``df``.

  Every column except the first and the last one counts as a feature; the
  number of features is printed.
  """
  feature_labels = df.columns[1:-1]
  n_features = len(feature_labels)
  print(f'There are {n_features} features!')
  return feature_labels

In [None]:
# Feature columns: every column of train except the first and the last one
features = get_feature_list(train)

There are 1487 features!


In [None]:
def save_features(features):
  """Write the feature names as a single CSV row to
  ``{SAVE_PATH}features_V{VER}.csv``.

  Parameters
  ----------
  features : iterable of str
      Feature names, written in the given order.
  """
  # The csv module requires files to be opened with newline='': it emits its
  # own line terminator, and without this option platforms that translate
  # newlines end up with an extra blank line after each row.
  with open(f'{SAVE_PATH}features_V{VER}.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(features)
save_features(features)

In [None]:
# Reload the feature list from the saved file. The file holds a single row of
# names, which read_csv treats as the header, so the names come back as the
# column labels of an otherwise empty frame.
del features
features = pd.read_csv(f'{SAVE_PATH}features_V{VER}.csv')
features = pd.Index(features.columns)

Train XGB

In [None]:
def amex_metric_mod(y_true, y_pred):
    """Evaluation metric: mean of a normalised weighted Gini and a top-4% capture rate.

    Rows with label 0 carry weight 20, all other rows weight 1.

    Parameters
    ----------
    y_true : array-like
        Labels (0 / 1).
    y_pred : array-like
        Predicted scores; higher means more likely positive.

    Returns
    -------
    float
        ``0.5 * (gini_of_predictions / gini_of_perfect_ranking + top_four)``.
    """
    def _ranked_by(column):
        # (label, prediction) pairs sorted descending by the chosen column
        pairs = np.transpose(np.array([y_true, y_pred]))
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weighted_gini(column):
        ranked = _ranked_by(column)
        weight = np.where(ranked[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ranked[:, 0] * weight)
        cum_pos_found = np.cumsum(ranked[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        return np.sum((lorentz - weight_random) * weight)

    # Share of all positives found within the highest-scored 4% of total weight
    by_score = _ranked_by(1)
    score_weights = np.where(by_score[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(score_weights))
    captured = by_score[np.cumsum(score_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    # Gini of the prediction ranking, normalised by the Gini of the label ranking
    gini_predicted = _weighted_gini(1)
    gini_perfect = _weighted_gini(0)

    return 0.5 * (gini_predicted / gini_perfect + top_four)

In [None]:
def get_xgb_parameters():
  """Return the XGBoost parameter dictionary used for training.

  ``random_state`` is taken from the file-level ``SEED``.

  NOTE(review): XGBoost 2.0+ documents ``tree_method='hist'`` together with
  ``device='cuda'`` as the replacement for ``'gpu_hist'`` / ``'gpu_predictor'``;
  confirm against the installed XGBoost version before changing these.
  """
  return {
      # regularisation
      'lambda': 0.19846538518330817,
      'alpha': 0.11499421368543077,
      # column / row sampling
      'colsample_bytree': 1.0,
      'subsample': 0.6,
      # tree growth
      'learning_rate': 0.01,
      'max_depth': 8,
      'min_child_weight': 56,
      # objective and evaluation
      'eval_metric': 'logloss',
      'objective': 'binary:logistic',
      # GPU training / prediction
      'tree_method': 'gpu_hist',
      'predictor': 'gpu_predictor',
      'random_state': SEED,
  }

In [None]:
# Parameter dictionary read by train_model as a global
xgb_parms = get_xgb_parameters()

In [None]:
def train_model(df, SEED=SEED, SAVE_PATH =SAVE_PATH, VER=VER):
  """Train one XGBoost model per cross-validation fold and report the scores.

  For every fold the model is saved as ``{SAVE_PATH}XGB_v{VER}_fold{fold}.xgb``
  and its validation metric is printed; at the end the metric over all
  out-of-fold predictions and the list of per-fold metrics are printed.
  Nothing is returned.

  Uses the file-level ``features`` (columns fed to the model), ``xgb_parms``
  and ``FOLDS``.

  Parameters
  ----------
  df : pandas.DataFrame
      Training frame containing the feature columns plus ``customer_ID`` and
      ``target``.
  SEED : int
      Seed for the fold shuffling (and the optional row subsampling).
  SAVE_PATH : str
      Folder prefix for the saved models.
  VER : int
      Version tag used in the model file names.

  Raises
  ------
  KeyError
      If a feature column, ``customer_ID`` or ``target`` is missing from ``df``.
  """
  def _column_positions(names):
    # Translate column labels to positions, failing loudly on unknown labels
    # (get_indexer itself reports them silently as -1)
    positions = df.columns.get_indexer(names)
    if (positions < 0).any():
      missing = [name for name, pos in zip(names, positions) if pos < 0]
      raise KeyError(f'columns missing from the training frame: {missing}')
    return positions

  importances = []
  oof = []
  TRAIN_SUBSAMPLE = 1.0
  gc.collect()
  kaggle_metrics_folds =[]

  # KFold yields row *positions*. Selecting them with label-based .loc is only
  # correct while df has a default 0..n-1 index, so rows and columns are
  # addressed by position (iloc) instead.
  feature_pos = _column_positions(features)
  target_pos = _column_positions(['target'])[0]
  id_target_pos = _column_positions(['customer_ID', 'target'])

  # NOTE(review): this is a plain KFold; passing df.target to split() does not
  # make the folds stratified. Switching to StratifiedKFold would change the
  # folds of already saved models, so it is left as is.
  kfold = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
  for fold,(train_idx, valid_idx) in enumerate(kfold.split(
            df, df.target )):
    # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
    if TRAIN_SUBSAMPLE<1.0:
        np.random.seed(SEED)
        train_idx = np.random.choice(train_idx,
                       int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)
        np.random.seed(None)

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    X_train = df.iloc[train_idx, feature_pos]
    y_train = df.iloc[train_idx, target_pos]

    X_valid = df.iloc[valid_idx, feature_pos]
    y_valid = df.iloc[valid_idx, target_pos]

    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

    # TRAIN MODEL FOLD K
    model = xgb.train(xgb_parms,
                dtrain=dtrain,
                evals=[(dtrain,'train'),(dvalid,'valid')],
                num_boost_round=9999,
                early_stopping_rounds=100,
                verbose_eval=100)
    model.save_model(f'{SAVE_PATH}XGB_v{VER}_fold{fold}.xgb')

    # GET FEATURE IMPORTANCE FOR FOLD K
    # (collected per fold; currently neither saved nor returned)
    dd = model.get_score(importance_type='weight')
    df_pred = pd.DataFrame({'feature':dd.keys(),f'importance_{fold}':dd.values()})
    importances.append(df_pred)

    # INFER OOF FOLD K
    oof_preds = model.predict(dvalid)
    acc = amex_metric_mod(y_valid.values, oof_preds)
    print('Kaggle Metric =',acc,'\n')
    kaggle_metrics_folds.append(acc)

    # SAVE OOF
    df_pred = df.iloc[valid_idx, id_target_pos].copy()
    df_pred['oof_pred'] = oof_preds
    oof.append( df_pred )

    del dtrain, X_train, y_train, dd, df_pred
    del X_valid, y_valid, dvalid, model
    _ = gc.collect()

  print('#'*25)
  oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
  acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
  print('OVERALL CV Kaggle Metric =',acc)
  print(kaggle_metrics_folds)

In [None]:
# Train and save one model per fold; prints the per-fold and overall CV metric
train_model(train)

Prepare Test Data

In [None]:
# CALCULATE SIZE OF EACH SEPARATE TEST PART
def get_rows(customers, test, NUM_PARTS = NUM_PARTS, verbose = ''):
    """Count how many rows of ``test`` fall into each customer part.

    ``customers`` is cut into ``NUM_PARTS`` consecutive slices of
    ``len(customers)//NUM_PARTS`` customers; the last slice also takes the
    remainder.

    Parameters
    ----------
    customers : array-like
        Customer IDs in processing order.
    test : pandas.DataFrame
        Frame with a ``customer_ID`` column, one row per observation.
    NUM_PARTS : int
        Number of parts.
    verbose : str
        Name of the data set used in the printed summary; nothing is printed
        when it is the empty string.

    Returns
    -------
    tuple of (list of int, int)
        Row count per part and the number of customers per (non-last) part.
    """
    chunk = len(customers)//NUM_PARTS
    report = verbose != ''
    if report:
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')

    rows = []
    last_part = NUM_PARTS - 1
    for part in range(NUM_PARTS):
        start = part * chunk
        # The final part is open-ended so it picks up the leftover customers
        stop = None if part == last_part else start + chunk
        in_part = test.customer_ID.isin(customers[start:stop])
        rows.append(int(in_part.sum()))

    if report: print( rows )
    return rows,chunk

In [None]:
def get_rowsnumcust(TEST_PATH, NUM_PARTS=NUM_PARTS):
  """Read the test IDs and work out how the test data splits into parts.

  Returns
  -------
  tuple of (list of int, int, numpy.ndarray)
      Row count per part, customers per part, and the unique customer IDs
      in order of first appearance in the file.
  """
  print('Reading test data...')
  # Only the key columns are loaded; read_file converts both of them
  key_frame = read_file(path = TEST_PATH, usecols = ['customer_ID','S_2'])
  id_frame = key_frame[['customer_ID']]
  customers = id_frame.drop_duplicates().sort_index().values.flatten()
  rows, num_cust = get_rows(customers, id_frame, NUM_PARTS = NUM_PARTS, verbose = 'test')
  return rows,num_cust,customers

In [None]:
# rows: row count per test part; num_cust: customers per part; customers: unique test customer IDs
rows,num_cust, customers = get_rowsnumcust(TEST_PATH, NUM_PARTS=NUM_PARTS)

Reading test data...
shape of data: (11363762, 2)
We will process test data as 10 separate parts.
There will be 92462 customers in each part (except the last part).
Below are number of rows in each part:
[1136415, 1137255, 1135580, 1135734, 1136082, 1137166, 1136612, 1137228, 1136301, 1135389]


In [None]:
# INFER TEST DATA IN PARTS
# Each part is read, feature-engineered and scored with all fold models; the
# fold predictions are averaged and the parts are concatenated at the end.
skip_rows = 0
skip_cust = 0
test_preds = []

for k in range(NUM_PARTS):
  # READ PART OF TEST DATA
  # rows[k] is the number of rows of part k, so the slice below assumes the
  # rows of each part's customers are contiguous in the file
  print(f'\nReading test data...')
  test = read_file(path = TEST_PATH)
  test = test.sort_index()
  test = test.iloc[skip_rows:skip_rows+rows[k]]
  skip_rows += rows[k]
  print(f'=> Test part {k+1} has shape', test.shape )

  # PROCESS AND FEATURE ENGINEER PART OF TEST DATA
  test = preprocess(train_set = False, test = test, Part =k)
  # Put the customers of this part in the same order as `customers`
  if k==NUM_PARTS-1: test = test.loc[customers[skip_cust:]]
  else: test = test.loc[customers[skip_cust:skip_cust+num_cust]]
  skip_cust += num_cust

  # Amend for one-hot encoding: a part that lacks a category value does not
  # get the matching dummy column, so add it as all-zero. Only add it when it
  # is really missing: assigning unconditionally would wipe the real values
  # of a part that does contain the category.
  for dummy_col in ('D_64_last_1.0', 'D_66_last_1.0', 'D_68_last_1.0'):
    if dummy_col not in test.columns:
      test[dummy_col] = 0

  # TEST DATA FOR XGB
  X_test = test[features]
  print('X_test complete')
  dtest = xgb.DMatrix(data=X_test)
  print('dtest complete')
  del test, X_test
  gc.collect()
  gc.collect()

  # INFER XGB MODELS ON TEST DATA
  model = xgb.Booster()
  model.load_model(f'{SAVE_PATH}XGB_v{VER}_fold0.xgb')
  print('load model complete')
  preds = model.predict(dtest)
  print('preds complete')
  for f in range(1,FOLDS):
    del model
    gc.collect()
    model = xgb.Booster()
    model.load_model(f'{SAVE_PATH}XGB_v{VER}_fold{f}.xgb')
    print(f'load {f} complete')
    preds += model.predict(dtest)
    print(f'preds {f} complete')
  # Average over the fold models
  preds /= FOLDS
  test_preds.append(preds)

  # CLEAN MEMORY
  del dtest, model
  _ = gc.collect()

# BUILD THE SUBMISSION
# Predictions are keyed by the integer customer ID; the sample submission
# supplies the original string IDs, matched through the same hex conversion.
test_preds = np.concatenate(test_preds)
test = pd.DataFrame(index=customers,data={'prediction':test_preds})
sub = pd.read_csv(SUBMISSION_FILE_PATH)[['customer_ID']]
sub['customer_ID_hash'] = sub['customer_ID'].str[-16:].apply(int, base =16)
sub = sub.set_index('customer_ID_hash')
sub = sub.merge(test[['prediction']], left_index=True, right_index=True, how='left')
sub = sub.reset_index(drop=True)

sub.to_csv(f'{SAVE_PATH}submission_xgb_v{VER}.csv',index=False)
print('Submission file shape is', sub.shape )
sub.head()