In [13]:
import pandas as pd
import numpy as np
import time
import pickle

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.compose import ColumnTransformer

from keras.models import Sequential
from keras.layers import Dense
import xgboost as xgb

rand_state = 1337

# Use features according to pickled lists

In [3]:
f = open('./../../Feature selection/features.pickle', 'rb')
featurelist = pickle.load(f)
f.close()

f = open("./../../Feature selection/lag_features.pickle", 'rb')
lagfeaturelist = pickle.load(f)
f.close()

print('original features:', len(featurelist), ', lag-features:', len(lagfeaturelist), ', total features:', len(featurelist)+len(lagfeaturelist) )

original features: 61 , lag-features: 37 , total features: 98


# Define Pipelines & Functions: 
1. Pipelines
2. Train preprocessing 
3. Train sampling 
4. Train scoring 
5. Test preprocessing

In [54]:
# Pipelines: Defining the categorical imputation and one-hot encoder for categorical variables.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent"))
        # ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)), #Commented out because the categorical variables won't play nice with dummies between test/train. Retry when we do a full train model. Can impute values on test_data.csv if necessary.
    ]
)

# defining the numerical imputation and standard scaler for numerical variables.
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), 
           ("scale", StandardScaler())]
)


# def_prep_df: Preparing the TRAINING data for creating and testing the model.
def prep_df(df, target, target_to_drop):

    # save indices
    df_index = df.index
    # save statement_age column
    statement_age_s = df['statement_age']

    # Drop columns that shouldn't be scaled or imputed
    df = df.drop(columns=["s_2", 'statement_age', target_to_drop])

    # Missing values handling
    missing_props = df.isna().mean(axis=0)
    

    over_threshold = missing_props[missing_props >= 0.4]
    df.drop(over_threshold.index, axis=1, inplace=True)

    
    
    # Split into predictors and target
    X = df.drop(columns=[target])
    y = df[target]

    # Split categorical and numerical columns
    cat_cols_all = ['b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126', 'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87']
    cat_cols = [col for col in X.columns.str.lower() if col in cat_cols_all]
    num_cols = [col for col in X.columns.str.lower() if col not in cat_cols]
    
    # get dummies for categorical variables
    Xcat = pd.get_dummies(X[cat_cols], columns=cat_cols, drop_first=True)
    
    X = pd.concat([X[num_cols],Xcat], axis=1)
    X.columns = X.columns.str.lower()
    cols_list = X.columns.tolist()

    cat_cols = [col for col in cols_list if col not in num_cols]
   


    full_processor = ColumnTransformer(
        transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
        ]
    )


    
    # Apply preprocessing
    X_processed = full_processor.fit_transform(X)
    X_processed = pd.concat([pd.DataFrame(X_processed, index=df_index), statement_age_s], axis=1)
    print(X_processed.shape)

    y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
            y.values.reshape(-1, 1)
            )
    y_processed = pd.DataFrame(y_processed, index=df_index)

    
    return X_processed, y_processed, cols_list




def get_train_test(df_train, df_train_y, X_processed, y_processed, usefraction):
    n = 100
    ids = np.array(df_train_y.index)
    target = np.array(df_train_y['target'])
    
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=rand_state)
    skf.get_n_splits(ids, target)

    i = 0
    id_subsets = [None]*n
    for _, subset in skf.split(ids, target):
        id_subsets[i] =list(ids[subset])
        i += 1

    
    list1 = list(np.arange(0, int(usefraction[0]*100), 1))
    list2 = list(np.arange(int(usefraction[0]*100), int(usefraction[0]*100)+int(usefraction[1]*100), 1))


    train_ids = []
    for i in list1:
        train_ids.extend(id_subsets[i])
    test_ids = []
    for i in list2:
        test_ids.extend(id_subsets[i])


    X_train = X_processed[df_train.index.isin(train_ids)]
    y_train = y_processed[df_train.index.isin(train_ids)]
    X_test = X_processed[df_train.index.isin(test_ids)]
    y_test = y_processed[df_train.index.isin(test_ids)]


    print(f'Train data obs.: {len(X_train)}')
    print(f'Test data obs: {len(X_test)}')

    return X_train, X_test, y_train, y_test




def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)




def prep_test_df(df, selected_features):
    # save indices
    df_index = df.index
    # save statement_age column
    statement_age_s = df['statement_age']

    # Drop columns that shouldn't be scaled or imputed
    X = df.drop(columns=["s_2", 'statement_age'])

    # Missing values handling
    missing_props = df.isna().mean(axis=0)
    over_threshold = missing_props[missing_props >= 0.4]
    df.drop(over_threshold.index, axis=1, inplace=True)

    cols_list = list(df.columns.str.lower())


    # Split categorical and numerical columns
    cat_cols_all = ['b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126', 'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87']
    cat_cols = [col for col in X.columns if col in cat_cols_all]
    num_cols = [col for col in X.columns if col not in cat_cols]

    # get dummies for categorical variables
    Xcat = pd.get_dummies(X[cat_cols], columns=cat_cols, drop_first=True)
    X = pd.concat([X[num_cols],Xcat], axis=1)

    X.columns = X.columns.str.lower()
    cols_list = X.columns.tolist()

    cat_cols = [col for col in cols_list if col not in num_cols]


    # Split list of numerical features into sublist for imputing and scaling in chunks
    sublist_size = 20
    sublists_cols = [num_cols[x:x+sublist_size] for x in range(0, len(num_cols), sublist_size)]


    i = 0
    for sublist in sublists_cols:
        full_processor = ColumnTransformer(transformers=[("numeric", numeric_pipeline, sublist)])
        if i == 0:
            X_num = pd.DataFrame(full_processor.fit_transform(X[sublist]), index=df_index, columns=sublist)
        else:
            X_num = pd.concat([X_num, pd.DataFrame(full_processor.fit_transform(X[sublist]), index=df_index, columns=sublist)], axis=1)
        i +=1


    X_processed = pd.concat([X_num, X[cat_cols], statement_age_s], axis=1)
    X_processed = X_processed[selected_features + ['statement_age']]
    print(X_processed.shape)


    return X_processed

# train

#### Create initial train df to be further processed

In [5]:
df_train_x = pd.read_parquet('./../../ignore/train.parquet', columns=(['customer_ID', 'S_2'] + featurelist))
df_train_x.columns = df_train_x.columns.str.lower()
df_train_x = df_train_x.sort_values(['customer_id', 's_2'])
df_train_x = df_train_x.set_index('customer_id')

df_train_y = pd.read_csv('./../../ignore/train_labels.csv')
df_train_y.columns = df_train_y.columns.str.lower()
df_train_y = df_train_y.set_index('customer_id')



df_train = pd.merge(df_train_x, df_train_y, left_index=True, right_on='customer_id', how='left')

df_train['statement_age'] = (df_train.groupby(df_train.index)['s_2']
                      .rank(method='dense', ascending=False)
                      .astype(int)
                   )

df_train['last_statement_target'] = df_train['target']*df_train['statement_age'].apply(lambda x: 1 if x==1 else 0)

# join lag features (dfl = delta first last) (repeated identical values for all statements)
df_train = df_train.join(pd.read_parquet('./../../ignore/train_dfl.parquet', columns=lagfeaturelist), how='left')

#### Select which statements to use

In [6]:
use_statements = [1,2,3]
df_train = df_train[df_train['statement_age'].isin(use_statements)]

#### Process remaining data after selecting statements

In [7]:
# Prep the dataframe
# Note that the last column 'statement_age' is left in the dataframes for scoring, not for predicting!
X_processed, y_processed, cols_list = prep_df(df_train, target='target', target_to_drop='last_statement_target')

(1360401, 120)


#### Get train samples for training and testing with train sample

In [9]:
# First vale of "usefraction" specifies the train size and the second, the test size (fraction of total train data available)
X_train, X_test, y_train, y_test = get_train_test(df_train, df_train_y, X_processed, y_processed, usefraction = [0.05, 0.05])

Train data obs.: 68032
Test data obs: 68024


#### Feature selection loop with XGB using feature importances prior to model tuning

In [10]:
feature_reduction_rate = 0.2 # Attempt to remove 10% of remaining features in each loop iteration
accuracy_loss = 0.003 # accepted accuracy loss c.f. max accuracy
scoring_method = 'last_statement'


scores = []
feature_ind = [list(range(0,(X_train.shape[1]-1)))]
feature_names = [cols_list]
remove_n_features = [int(len(feature_ind[0])*feature_reduction_rate)]
i = 0


def get_reduced_features(xgbc, feature_names, remove_n_features):
    xgb_feature_imp = pd.DataFrame({'name':feature_names,
                                    'importances':[val[0] for val in xgbc.feature_importances_.reshape(-1,1)]})

    xgb_feature_imp = xgb_feature_imp.sort_values(by='importances', ascending=False)
    
    df_features = xgb_feature_imp.iloc[:-remove_n_features,:]
    feature_ind = df_features.index
    feature_names = df_features['name']


    return feature_ind, feature_names



def fit_predict(feature_ind, feature_names, remove_n_features, X_train, y_train, X_test, y_test):
    xgbc = xgb.XGBClassifier(use_label_encoder=False).fit(X_train[feature_ind], y_train, verbose=0, eval_metric='logloss')
    
    y_pred_a_xgb = pd.DataFrame({'customer_id':X_test.index.values,
                            'scoring_var':X_test.iloc[:,-1].values,
                            'prediction':[val[1] for val in xgbc.predict_proba(X_test[feature_ind])]})
    
    if scoring_method == 'last_statement':
        proba_xgb = y_pred_a_xgb[y_pred_a_xgb['scoring_var']==1].set_index('customer_id')

    elif scoring_method == 'delta0':
        proba_xgb = y_pred_a_xgb[y_pred_a_xgb['scoring_var']==0].set_index('customer_id')

    elif scoring_method == 'delta1':
        proba_xgb = y_pred_a_xgb[y_pred_a_xgb['scoring_var']==1].set_index('customer_id')

    elif scoring_method == 'average-deltas':
        proba_xgb = y_pred_a_xgb.groupby('customer_id')['prediction'].mean()
    
    score = [amex_metric(y_test.groupby(y_test.index).max().rename(columns={0:'target'}), proba_xgb)]
    feature_ind, feature_names = get_reduced_features(xgbc, feature_names, remove_n_features)

    return score, feature_ind, feature_names, xgbc




while True:
    start_time = time.time()

    cscore, cfeature_ind, cfeature_names, cxgb = fit_predict(feature_ind[i], feature_names[i], remove_n_features[i], X_train, y_train, X_test, y_test) # "c" denotes current
    scores += cscore # add score to list

    if i > 0:
        if (max(scores) - scores[i]) >= accuracy_loss: # Maximum residual between max and current score
            
            print(f'Iter {i+1}, First fit yielded too low score, trying again (number of features attempted: {len(feature_ind[i])}')
            cscore, cfeature_ind, cfeature_names, cxgb = fit_predict(feature_ind[i], feature_names[i], remove_n_features[i], X_train, y_train, X_test, y_test) # "c" denotes current

            if (max(scores) - cscore) >= accuracy_loss: # use same criterion again
                
                # print(f'Iter {i+1}, Duration: {round((time.time() - start_time),3)} s, Iteration failed, too large accuracy loss ({cscore-max(scores)}), removing fewer features')
                print(f'Iter {i+1}, Duration: {round((time.time() - start_time),3)} s, Score: {round(scores[i],5)} - Iteration failed, too large accuracy loss ({cscore-max(scores)})')
                feature_ind += [feature_ind[i-1]] # add list of feature indices to list
                feature_names += [feature_names[i-1]] # add list of feature names to list

                feature_reduction_rate = feature_reduction_rate*0.5 # decrease feature reduction rate if iteration failed

                if int(len(feature_ind[i])*feature_reduction_rate) >= 1: # Check that at least 1 feature can be removed
                    remove_n_features += [int(len(feature_ind[i])*feature_reduction_rate)]
                    print(f'Current feature reduction rate: {feature_reduction_rate}')
                    i += 1
                    continue
                else:
                    print('Completed')
                    break


            else:
                scores[i] = cscore # Overwrite first fit score with 2nd fit score
                print(f'Iter {i+1},  Retry successful')



    feature_ind += [cfeature_ind] # add list of feature indices to list
    feature_names += [cfeature_names] # add list of feature names to list
    remove_n_features += [int(len(feature_ind[i])*feature_reduction_rate)]
    
    print(f'Iter {i+1}, Duration: {round((time.time() - start_time),3)} s, Score: {round(scores[i],5)}, Number of features: {len(feature_ind[i])}')

    i += 1

Iter 1, Duration: 5.881 s, Score: 0.74494, Number of features: 119
Iter 2, Duration: 5.842 s, Score: 0.74716, Number of features: 96
Iter 3, Duration: 4.731 s, Score: 0.74833, Number of features: 73
Iter 4, Duration: 4.034 s, Score: 0.74538, Number of features: 54
Iter 5, First fit yielded too low score, trying again (number of features attempted: 40
Iter 5, Duration: 7.701 s, Score: 0.74163 - Iteration failed, too large accuracy loss ([-0.0066929])
Current feature reduction rate: 0.1
Iter 6, Duration: 4.152 s, Score: 0.74538, Number of features: 54
Iter 7, First fit yielded too low score, trying again (number of features attempted: 50
Iter 7, Duration: 11.096 s, Score: 0.73978 - Iteration failed, too large accuracy loss ([-0.00855058])
Current feature reduction rate: 0.05
Iter 8, Duration: 5.569 s, Score: 0.74538, Number of features: 54
Iter 9, First fit yielded too low score, trying again (number of features attempted: 52
Iter 9, Duration: 10.008 s, Score: 0.74397 - Iteration failed,

In [56]:
selected_features= list(feature_names[len(feature_names)-2])
ind_selected_features= feature_ind[len(feature_names)-2]

# f = open('features_and_lagfeatures_0.pickle', 'wb')
# pickle.dump(list(selected_features), f)
# f.close()

In [43]:
# Tune parameters of model with selected features
X_train, X_test, y_train, y_test = get_train_test(df_train, df_train_y, X_processed, y_processed, usefraction = [0.05, 0.05])

classifier = xgb.XGBClassifier(use_label_encoder=False)

params = {
    'learning_rate' : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'max_depth' : [5, 6, 8, 10, 12, 15],
    'min_child_weight' : [1, 3, 5, 7],
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
    'colsample_bytree' : [0.3, 0.4, 0.5, 0.7],
    'eval_metric': ['logloss']
}

model_tune=RandomizedSearchCV(classifier, param_distributions=params, n_iter=10, scoring='roc_auc', n_jobs=-1, cv=5,verbose=0)
model_tune.fit(X_train[ind_selected_features],y_train, verbose=1)

Train data obs.: 68032
Test data obs: 68024


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                                           scale_pos_weight=None,
                                           subsample=None, tree_method=None,
                                      

In [44]:
tuned_model = model_tune.best_estimator_ 
scoring_method == 'last_statement'

def post_predict(model, feature_ind, scoring_method, X_train, X_test, y_train, y_test):
    xgb_base = xgb.XGBClassifier(use_label_encoder=False).fit(X_train[feature_ind], y_train, verbose=0, eval_metric='logloss')
    y_pred_pre = pd.DataFrame({'customer_id':X_test.index.values,
                            'scoring_var':X_test.iloc[:,-1].values,
                            'prediction':[val[1] for val in xgb_base.predict_proba(X_test[feature_ind])]})
    
    xgb_tuned = model
    y_pred_post = pd.DataFrame({'customer_id':X_test.index.values,
                            'scoring_var':X_test.iloc[:,-1].values,
                            'prediction':[val[1] for val in xgb_tuned.predict_proba(X_test[feature_ind])]})
    
    if scoring_method == 'last_statement':
        proba_xgb_pre = y_pred_pre[y_pred_pre['scoring_var']==1].set_index('customer_id')
        proba_xgb_post = y_pred_post[y_pred_post['scoring_var']==1].set_index('customer_id')

    elif scoring_method == 'xxxx':
        print('scoring method not yet defined')

    
    score_pre = [amex_metric(y_test.groupby(y_test.index).max().rename(columns={0:'target'}), proba_xgb_pre)]
    score_post = [amex_metric(y_test.groupby(y_test.index).max().rename(columns={0:'target'}), proba_xgb_post)]
    
    print(f'Default xgb-model score: {score_pre}')
    print(f'Tuned xgb-model score: {score_post}')

    return [score_pre, score_post]


val_scores = post_predict(tuned_model, ind_selected_features, scoring_method, X_train, X_test, y_train, y_test)

Default xgb-model score: [0.7438977273160245]
Tuned xgb-model score: [0.761865571556257]


## Prepare test data sample and make predictions

In [45]:
df_test = pd.read_parquet('./../../ignore/test.parquet', columns=(['customer_ID', 'S_2'] + featurelist))
df_test.columns = df_test.columns.str.lower()
df_test = df_test.sort_values(['customer_id', 's_2'])
df_test = df_test.set_index('customer_id')

# join lag features (dfl = delta first last) (repeated identical values for all statements)
df_test = df_test.join(pd.read_parquet('./../../ignore/test_dfl.parquet', columns=lagfeaturelist), how='left')

df_test['statement_age'] = (df_test.groupby(df_test.index)['s_2']
                      .rank(method='dense', ascending=False)
                      .astype(int)
                   )

#### Selct which statements to use

In [46]:
use_statements = [1,2,3]
df_test = df_test[df_test['statement_age'].isin(use_statements)]

In [57]:
X_processed_test = prep_test_df(df_test, selected_features)

(2754035, 54)


In [71]:
# predict with the tuned classifier
# tuned_model = model_tune.best_estimator_ 
scoring_method == 'last_statement'

def predict_on_test(model, scoring_method, X):
    y_pred = pd.DataFrame({'customer_ID':X.index.values,
                            'scoring_var':X.iloc[:,-1].values,
                            'prediction':[val[1] for val in model.predict_proba(X.iloc[:,:-1])]})
    
    if scoring_method == 'last_statement':
        y_pred = y_pred[y_pred['scoring_var']==1].set_index('customer_ID').drop(columns='scoring_var')
        # y_pred = y_pred[y_pred['scoring_var']==1].drop(columns='scoring_var')

    elif scoring_method == 'xxxx':
        print('scoring method not yet defined')

    return y_pred

test_predictions = predict_on_test(tuned_model, 'last_statement', X_processed_test)

In [73]:
test_predictions.to_csv('./../../ignore/_my_submissions/submission1.csv')