In [1]:
import pandas as pd
import numpy as np
import time

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, accuracy_score

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.compose import ColumnTransformer

from keras.models import Sequential
from keras.layers import Dense
import xgboost as xgb

rand_state = 1337

### Define pipelines and functions 
1. Preprocessing 2. Sampling 3. Scoring

In [2]:
# Pipelines: Defining the categorical imputation and one-hot encoder for categorical variables.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent"))
        # ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)), #Commented out because the categorical variables won't play nice with dummies between test/train. Retry when we do a full train model. Can impute values on test_data.csv if necessary.
    ]
)

# defining the numerical imputation and standard scaler for numerical variables.
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), 
           ("scale", StandardScaler())]
           #("scale", MinMaxScaler())]
)

# def_prep_df: Preparing the TRAINING data for creating and testing the model.
def prep_df(df, target, target_to_drop):

    # save indices
    df_index = df.index
    # save statement_age column
    statement_age_s = df['statement_age']

    # Drop columns that shouldn't be scaled or imputed
    df = df.drop(columns=["s_2", 'statement_age', target_to_drop])

    # Missing values handling
    missing_props = df.isna().mean(axis=0)
    

    over_threshold = missing_props[missing_props >= 0.4]
    over_threshold
    

    df.drop(over_threshold.index, 
            axis=1, 
            inplace=True)

    
    
    # Split into predictors and target
    X = df.drop(columns=[target])
    y = df[target]

    # Split categorical and numerical columns
    cat_cols_all = ['b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126', 'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87']
    cat_cols = [col for col in X.columns.str.lower() if col in cat_cols_all]
    num_cols = [col for col in X.columns.str.lower() if col not in cat_cols]
    
    # get dummies for categorical variables
    Xcat = pd.get_dummies(X[cat_cols], columns=cat_cols, drop_first=True)
    
    X = pd.concat([X[num_cols],Xcat], axis=1)
    X.columns = X.columns.str.lower()
    cols_list = X.columns.tolist()

    cat_cols = [col for col in cols_list if col not in num_cols]
   


    full_processor = ColumnTransformer(
        transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
        ]
    )


    
    # Apply preprocessing
    X_processed = full_processor.fit_transform(X)
    X_processed = pd.concat([pd.DataFrame(X_processed, index=df_index), statement_age_s], axis=1)
    print(X_processed.shape)

    y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
            y.values.reshape(-1, 1)
            )
    y_processed = pd.DataFrame(y_processed, index=df_index)

    
    return X_processed, y_processed, cols_list




def get_train_test(df_train, df_train_y, X_processed, y_processed, usefraction):
    n = 100
    ids = np.array(df_train_y.index)
    target = np.array(df_train_y['target'])
    
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=rand_state)
    skf.get_n_splits(ids, target)

    i = 0
    id_subsets = [None]*n
    for _, subset in skf.split(ids, target):
        id_subsets[i] =list(ids[subset])
        i += 1

    
    list1 = list(np.arange(0, int(usefraction[0]*100), 1))
    list2 = list(np.arange(int(usefraction[0]*100), int(usefraction[0]*100)+int(usefraction[1]*100), 1))


    train_ids = []
    for i in list1:
        train_ids.extend(id_subsets[i])
    test_ids = []
    for i in list2:
        test_ids.extend(id_subsets[i])


    X_train = X_processed[df_train.index.isin(train_ids)]
    y_train = y_processed[df_train.index.isin(train_ids)]
    X_test = X_processed[df_train.index.isin(test_ids)]
    y_test = y_processed[df_train.index.isin(test_ids)]

    indices_train = df_train.index.isin(train_ids)
    indices_test = df_train.index.isin(test_ids)


    print(f'Train data obs.: {len(X_train)}')
    print(f'Test data obs: {len(X_test)}')

    # also extract the statement dates for combining the predictions later on
    # train_statement_age = X_train['statement_age']
    # test_statement_age = X_test['statement_age']
    # X_train = X_train.drop(columns='statement_age')
    # X_test = X_test.drop(columns='statement_age')

    return X_train, X_test, y_train, y_test




def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

### Create initial df to be further processed

In [4]:
df_train_x = pd.read_parquet('./../ignore/train.parquet')
df_train_x.columns = df_train_x.columns.str.lower()
df_train_x = df_train_x.sort_values(['customer_id', 's_2'])
df_train_x = df_train_x.set_index('customer_id')

df_train_y = pd.read_csv('./../ignore/train_labels.csv')
df_train_y.columns = df_train_y.columns.str.lower()
df_train_y = df_train_y.set_index('customer_id')



df_train = pd.merge(df_train_x, df_train_y, left_index=True, right_on='customer_id', how='left')

df_train['last_statement_flag'] = (df_train.groupby(df_train.index)['s_2']
                      .rank(method='dense', ascending=False)
                      .astype(int)
                   )

df_train['last_statement_target'] = df_train['target']*df_train['last_statement_flag'].apply(lambda x: 1 if x==1 else 0)
df_train = df_train.rename(columns={'last_statement_flag':'statement_age'})

### Select which statements to use

In [5]:
use_statements = [1,2,3]
df_train = df_train[df_train['statement_age'].isin(use_statements)]

### Process all the data after selecting statements

In [6]:
# Prep the dataframe
# Note that the last column 'statement_age' is left in the dataframes for scoring, not for predicting!
X_processed, y_processed, cols_list = prep_df(df_train, target='target', target_to_drop='last_statement_target')

(1360401, 204)


In [15]:
X_processed.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,194,195,196,197,198,199,200,201,202,statement_age
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,1.040795,-0.529039,-0.589915,0.995724,-0.349097,-0.728393,-0.298688,-0.586957,-1.250986e-07,-0.486207,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,1.146568,-0.529039,-0.515046,1.004268,-0.377995,-0.705657,-0.298688,-0.5703,-1.250986e-07,-0.486207,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,1.136566,-0.529039,-0.561068,1.002661,-0.357007,-0.528131,-0.298688,-0.571407,-1.250986e-07,-0.486207,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.92984,0.081478,-0.555786,0.989671,-0.357574,-0.55531,-0.298688,-0.588678,-0.5108954,-0.486207,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.921894,1.040861,-0.450245,0.997235,-0.352637,-0.392563,-0.298688,-0.565251,-0.5278185,-0.486207,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2


### Get samples for training and testing

In [7]:
# First vale of "usefraction" specifies the train size and the second, the test size (fraction of total train data available)
X_train, X_test, y_train, y_test = get_train_test(df_train, df_train_y, X_processed, y_processed, usefraction = [0.5, 0.5])

Train data obs.: 680246
Test data obs: 680155


## XGB
feature selection loop using feature importances

In [8]:
def get_reduced_features(xgbc, feature_names, remove_n_features):

    xgb_feature_imp = pd.DataFrame({'name':feature_names,
                                    'importances':[val[0] for val in xgbc.feature_importances_.reshape(-1,1)]})

    xgb_feature_imp = xgb_feature_imp.sort_values(by='importances', ascending=False)
    
    df_features = xgb_feature_imp.iloc[:-remove_n_features,:]
    feature_ind = df_features.index
    feature_names = df_features['name']


    return feature_ind, feature_names



def fit_predict(feature_ind, feature_names, remove_n_features, X_train, y_train, X_test, y_test):
    xgbc = xgb.XGBClassifier(use_label_encoder=False).fit(X_train[feature_ind], y_train, verbose=0, eval_metric='logloss')
    
    y_pred_a_xgb = pd.DataFrame({'customer_id':X_test.index.values,
                            'statement_age':X_test.iloc[:,-1].values,
                            'prediction':[val[1] for val in xgbc.predict_proba(X_test[feature_ind])]})
    
    last_proba_xgb = y_pred_a_xgb[y_pred_a_xgb['statement_age']==1].set_index('customer_id')
    score = [amex_metric(y_test.groupby(y_test.index).max().rename(columns={0:'target'}), last_proba_xgb)]

    feature_ind, feature_names = get_reduced_features(xgbc, feature_names, remove_n_features)

    return score, feature_ind, feature_names, xgbc




feature_reduction_rate = 0.25 # Attempt to remove 25% of remaining features in each loop iteration
accuracy_loss = 0.003 # accepted accuracy loss c.f. max accuracy

scores = []
feature_ind = [list(range(0,(X_train.shape[1]-1)))]
feature_names = [cols_list]
remove_n_features = [int(len(feature_ind[0])*feature_reduction_rate)]
i = 0

while True:
    start_time = time.time()

    cscore, cfeature_ind, cfeature_names, cxgb = fit_predict(feature_ind[i], feature_names[i], remove_n_features[i], X_train, y_train, X_test, y_test) # "c" denotes current
    scores += cscore # add score to list

    if i > 0:
        if (max(scores) - scores[i]) >= accuracy_loss: # Maximum residual between max and current score
            
            print(f'Iter {i+1}, First fit yielded too low score, trying again (number of features attempted: {len(feature_ind[i])}')
            cscore, cfeature_ind, cfeature_names, cxgb = fit_predict(feature_ind[i], feature_names[i], remove_n_features[i], X_train, y_train, X_test, y_test) # "c" denotes current

            if (max(scores) - cscore) >= accuracy_loss: # use same criterion again
                
                # print(f'Iter {i+1}, Duration: {round((time.time() - start_time),3)} s, Iteration failed, too large accuracy loss ({cscore-max(scores)}), removing fewer features')
                print(f'Iter {i+1}, Duration: {round((time.time() - start_time),3)} s, Score: {round(scores[i],5)} - Iteration failed, too large accuracy loss ({cscore-max(scores)})')
                feature_ind += [feature_ind[i-1]] # add list of feature indices to list
                feature_names += [feature_names[i-1]] # add list of feature names to list

                feature_reduction_rate = feature_reduction_rate*0.5 # decrease feature reduction rate if iteration failed

                if int(len(feature_ind[i])*feature_reduction_rate) >= 1: # Check that at least 1 feature can be removed
                    remove_n_features += [int(len(feature_ind[i])*feature_reduction_rate)]
                    print(f'Current feature reduction rate: {feature_reduction_rate}')
                    i += 1
                    continue
                else:
                    print('Completed')
                    break


            else:
                scores[i] = cscore # Overwrite first fit score with 2nd fit score
                print(f'Iter {i+1},  Retry successful')



    feature_ind += [cfeature_ind] # add list of feature indices to list
    feature_names += [cfeature_names] # add list of feature names to list
    remove_n_features += [int(len(feature_ind[i])*feature_reduction_rate)]
    
    print(f'Iter {i+1}, Duration: {round((time.time() - start_time),3)} s, Score: {round(scores[i],5)}, Number of features: {len(feature_ind[i])}')

    i += 1

Iter 1, Duration: 169.766 s, Score: 0.77687, Number of features: 203
Iter 2, Duration: 145.173 s, Score: 0.77536, Number of features: 153
Iter 3, Duration: 108.277 s, Score: 0.77652, Number of features: 103
Iter 4, First fit yielded too low score, trying again (number of features attempted: 65
Iter 4, Duration: 151.539 s, Score: 0.77235 - Iteration failed, too large accuracy loss ([-0.00451821])
Current feature reduction rate: 0.125
Iter 5, Duration: 108.122 s, Score: 0.77652, Number of features: 103
Iter 6, Duration: 100.883 s, Score: 0.77565, Number of features: 95
Iter 7, Duration: 89.18 s, Score: 0.77533, Number of features: 83
Iter 8, Duration: 75.438 s, Score: 0.77523, Number of features: 72
Iter 9, First fit yielded too low score, trying again (number of features attempted: 62
Iter 9, Duration: 116.069 s, Score: 0.77251 - Iteration failed, too large accuracy loss ([-0.00435888])
Current feature reduction rate: 0.0625
Iter 10, Duration: 66.458 s, Score: 0.77523, Number of feature

In [11]:
print(len(feature_names))
print(len(feature_ind))

18
18


In [10]:
print(scores)

[0.7768696243238989, 0.7753553218939775, 0.7765176143503996, 0.7723514119724455, 0.7765176143503996, 0.7756472515062318, 0.7753290546241556, 0.7752259002665958, 0.772510747849558, 0.7752259002665958, 0.7737585043899986, 0.7752259002665958, 0.7741145383119681, 0.7737193685946455, 0.7741145383119681, 0.7738927334607886, 0.7736627965111049]


In [16]:
list(feature_names[16].values)

['p_2',
 'b_1',
 'r_4',
 'd_44',
 'd_51',
 'b_2',
 'r_1',
 'b_9',
 'd_66_1',
 'd_41',
 'r_2',
 'd_39',
 'd_129',
 's_3',
 'r_3',
 'd_112',
 'b_38_4',
 'd_64_0',
 'd_111',
 'b_4',
 'd_43',
 'r_27',
 'd_49',
 's_23',
 'b_8',
 'd_45',
 'b_3',
 'd_46',
 'd_48',
 'd_54',
 'd_131',
 'd_63_3',
 'b_11',
 'd_140',
 'b_38_2',
 'b_5',
 'r_11',
 'b_7',
 'r_7',
 'd_117_6',
 'd_138',
 'p_3',
 'b_10',
 'b_20',
 'r_26',
 'b_16',
 'r_5',
 'd_120_0',
 'd_62',
 's_7',
 'r_12',
 'd_72',
 'd_117_3',
 's_24',
 'd_75',
 'd_82',
 'b_18',
 's_5',
 'd_52',
 'd_86',
 's_8',
 'd_47',
 's_26',
 's_15',
 'd_65',
 'd_141',
 's_11',
 's_18']