In [1]:
import pandas as pd
import numpy as np

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer

from keras.models import Sequential
from keras.layers import Dense
import xgboost as xgb

rand_state = 1337

### Define pipelines and functions 
1. Preprocessing 2. Sampling 3. Scoring

In [2]:
# Pipelines: Defining the categorical imputation and one-hot encoder for categorical variables.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent"))
        # ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)), #Commented out because the categorical variables won't play nice with dummies between test/train. Retry when we do a full train model. Can impute values on test_data.csv if necessary.
    ]
)

# defining the numerical imputation and standard scaler for numerical variables.
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), 
           ("scale", StandardScaler())]
           #("scale", MinMaxScaler())]
)

# def_prep_df: Preparing the TRAINING data for creating and testing the model.
def prep_df(df, target, target_to_drop):

    # save indices
    df_index = df.index
    # save statement_age column
    statement_age_s = df['statement_age']

    # Drop columns that shouldn't be scaled or imputed
    df = df.drop(columns=["s_2", 'statement_age', target_to_drop])

    # Missing values handling
    missing_props = df.isna().mean(axis=0)
    

    over_threshold = missing_props[missing_props >= 0.4]
    over_threshold
    

    df.drop(over_threshold.index, 
            axis=1, 
            inplace=True)

    
    
    # Split into predictors and target
    X = df.drop(columns=[target])
    y = df[target]

    # Split categorical and numerical columns
    cat_cols_all = ['b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126', 'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87']
    cat_cols = [col for col in X.columns.str.lower() if col in cat_cols_all]
    num_cols = [col for col in X.columns.str.lower() if col not in cat_cols]
    
    # get dummies for categorical variables
    Xcat = pd.get_dummies(X[cat_cols], columns=cat_cols, drop_first=True)
    
    X = pd.concat([X[num_cols],Xcat], axis=1)
    X.columns = X.columns.str.lower()
    cols_list = X.columns.tolist()

    cat_cols = [col for col in cols_list if col not in num_cols]
   


    full_processor = ColumnTransformer(
        transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
        ]
    )


    
    # Apply preprocessing
    X_processed = full_processor.fit_transform(X)
    X_processed = pd.concat([pd.DataFrame(X_processed, index=df_index), statement_age_s], axis=1)
    print(X_processed.shape)

    y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
            y.values.reshape(-1, 1)
            )
    y_processed = pd.DataFrame(y_processed, index=df_index)

    
    return X_processed, y_processed, cols_list




def get_train_test(df_train, df_train_y, X_processed, y_processed, usefraction):
    n = 100
    ids = np.array(df_train_y.index)
    target = np.array(df_train_y['target'])
    
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=rand_state)
    skf.get_n_splits(ids, target)

    i = 0
    id_subsets = [None]*n
    for _, subset in skf.split(ids, target):
        id_subsets[i] =list(ids[subset])
        i += 1

    
    list1 = list(np.arange(0, int(usefraction[0]*100), 1))
    list2 = list(np.arange(int(usefraction[0]*100), int(usefraction[0]*100)+int(usefraction[1]*100), 1))


    train_ids = []
    for i in list1:
        train_ids.extend(id_subsets[i])
    test_ids = []
    for i in list2:
        test_ids.extend(id_subsets[i])


    X_train = X_processed[df_train.index.isin(train_ids)]
    y_train = y_processed[df_train.index.isin(train_ids)]
    X_test = X_processed[df_train.index.isin(test_ids)]
    y_test = y_processed[df_train.index.isin(test_ids)]


    print(f'Train data obs.: {len(X_train)}')
    print(f'Test data obs: {len(X_test)}')

    # also extract the statement dates for combining the predictions later on
    # train_statement_age = X_train['statement_age']
    # test_statement_age = X_test['statement_age']
    # X_train = X_train.drop(columns='statement_age')
    # X_test = X_test.drop(columns='statement_age')

    return X_train, X_test, y_train, y_test




def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

### Create initial df to be further processed

In [3]:
df_train_x = pd.read_parquet('./../../ignore/train.parquet')
df_train_x.columns = df_train_x.columns.str.lower()
df_train_x = df_train_x.sort_values(['customer_id', 's_2'])
df_train_x = df_train_x.set_index('customer_id')

df_train_y = pd.read_csv('./../../ignore/train_labels.csv')
df_train_y.columns = df_train_y.columns.str.lower()
df_train_y = df_train_y.set_index('customer_id')


df_train = pd.merge(df_train_x, df_train_y, left_index=True, right_on='customer_id', how='left')

df_train['last_statement_flag'] = (df_train.groupby(df_train.index)['s_2']
                      .rank(method='dense', ascending=False)
                      .astype(int)
                   )

df_train['last_statement_target'] = df_train['target']*df_train['last_statement_flag'].apply(lambda x: 1 if x==1 else 0)
df_train = df_train.rename(columns={'last_statement_flag':'statement_age'})

In [None]:
df_train.head(20)

### Select which statements to use

In [4]:
use_statements = [1,2,3]
df_train = df_train[df_train['statement_age'].isin(use_statements)]

### Process all the data after selecting statements

In [5]:
# Prep the dataframe
# Note that the last column 'statement_age' is left in the dataframes for scoring, not for predicting!
X_processed, y_processed, cols_list = prep_df(df_train, target='target', target_to_drop='last_statement_target')

(1360401, 204)


### Get samples for training and testing

In [6]:
# First vale of "usefraction" specifies the train size and the second, the test size (fraction of total train data available)
X_train, X_test, y_train, y_test = get_train_test(df_train, df_train_y, X_processed, y_processed, usefraction = [0.1, 0.1])

Train data obs.: 136056
Test data obs: 136031


## XGB

In [7]:
# Init classifier
xgb_a = xgb.XGBClassifier(use_label_encoder=False)

# Fit
xgb_a.fit(X_train.iloc[:,:-1], y_train, verbose=0, eval_metric='logloss')

# Predict
y_pred_a_xgb = pd.DataFrame({'customer_id':X_test.index.values,
                            'statement_age':X_test.iloc[:,-1].values,
                             'prediction':[val[1] for val in xgb_a.predict_proba(X_test.iloc[:,:-1])]})

# Score
last_proba_xgb = y_pred_a_xgb[y_pred_a_xgb['statement_age']==1].set_index('customer_id')
y_test_amexeval = y_test.groupby(y_test.index).max().rename(columns={0:'target'})
print('last_proba_score_xgb:', amex_metric(y_test_amexeval, last_proba_xgb))

last_proba_score_xgb: 0.7654475216038875


## NN

In [8]:
# Define the model
keras_a = Sequential()
keras_a.add(Dense(240, input_shape=(X_train.shape[1]-1,), activation='relu'))
keras_a.add(Dense(120, activation='relu'))
keras_a.add(Dense(40, activation='relu'))
keras_a.add(Dense(8, activation='relu'))
keras_a.add(Dense(1, activation='sigmoid'))

# Compile and fit
keras_a.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
keras_a.fit(X_train.iloc[:,:-1], y_train, epochs=4, batch_size=40, verbose=0)
# model.save('xxxxx')

# Predict
y_pred_a_keras = pd.DataFrame({'customer_id':X_test.index.values,
                        'statement_age':X_test.iloc[:,-1].values,
                        'prediction':[val[0] for val in list(keras_a.predict(X_test.iloc[:,:-1], verbose=0))]})

# Score
last_proba_keras = y_pred_a_keras[y_pred_a_keras['statement_age']==1].set_index('customer_id')
y_test_amexeval = y_test.groupby(y_test.index).max().rename(columns={0:'target'})
print('last_proba_score_keras:', amex_metric(y_test_amexeval, last_proba_keras))

last_proba_score_keras: 0.7502083314181278


## Secondary model to consolidate per statement predictions

In [15]:
# First make prediction on the train dataset
model_b_train_input = pd.DataFrame({'customer_id':X_train.index.values,
                            'statement_age':list(X_train.iloc[:,-1]), 
                            'prediction':[val[0] for val in list(keras_a.predict(X_train.iloc[:,:-1], verbose=0))],
                            'target':y_train[0].values})

# pivot, impute predictions & split into x and y
df_train_b = pd.pivot_table(model_b_train_input, values=['prediction', 'target'], index='customer_id', columns='statement_age')
df_train_b = df_train_b.iloc[:,0:4] # remove unnecessary columns
# df_train_b.head()

df_train_b.columns = ['pred1', 'pred2', 'pred3', 'target']

imparray = SimpleImputer(strategy='mean').fit_transform(df_train_b)
df_train_b = pd.DataFrame(imparray, index=df_train_b.index, columns = df_train_b.columns)

X_train_b = df_train_b.drop(columns=['target'])
y_train_b = df_train_b['target']

In [18]:
# define and fit a keras model
keras_b = Sequential()
keras_b.add(Dense(8, input_shape=(X_train_b.shape[1],), activation='relu'))
keras_b.add(Dense(4, activation='relu'))
keras_b.add(Dense(1, activation='sigmoid'))

# Compile and fit the keras model
keras_b.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
keras_b.fit(X_train_b, y_train_b, epochs=10, batch_size=40, verbose=1)

# score model on training data
# y_pred_train_b = pd.DataFrame({'customer_id':X_train_b.index.values, 'prediction':[val[0] for val in list(model_b.predict(X_train_b, verbose=0))]}).set_index('customer_id')
# print(amex_metric(y_train_amexeval, y_pred_train_b))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17e8dc91b08>

## retrain keras_b using both keras and xgb predictions
This creates something like an ensemble model

In [19]:
# First make prediction on the train dataset
model_b_train_input2 = pd.DataFrame({'customer_id':X_train.index.values,
                            'statement_age':list(X_train.iloc[:,-1]), 
                            'prediction':[val[1] for val in xgb_a.predict_proba(X_train.iloc[:,:-1])],
                            'target':y_train[0].values})

# pivot, impute predictions & split into x and y
df2_train_b = pd.pivot_table(model_b_train_input2, values=['prediction', 'target'], index='customer_id', columns='statement_age')
df2_train_b = df2_train_b.iloc[:,0:4] # remove unnecessary columns
df2_train_b.columns = ['pred1', 'pred2', 'pred3', 'target']

imparray = SimpleImputer(strategy='mean').fit_transform(df2_train_b)
df2_train_b = pd.DataFrame(imparray, index=df2_train_b.index, columns = df2_train_b.columns)

X_train_b = pd.concat([X_train_b, df2_train_b.drop(columns=['target'])], axis=0)
y_train_b = pd.concat([y_train_b, df2_train_b['target']],axis=0)


# define and fit a keras model
keras_b = Sequential()
keras_b.add(Dense(8, input_shape=(X_train_b.shape[1],), activation='relu'))
keras_b.add(Dense(4, activation='relu'))
keras_b.add(Dense(1, activation='sigmoid'))

# Compile and fit the keras model
keras_b.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
keras_b.fit(X_train_b, y_train_b, epochs=10, batch_size=40, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17e8e2daac8>

### Make predictions on test

In [36]:
model_b_test_keras = pd.DataFrame({'customer_id':X_test.index.values,
                            'statement_age':list(X_test.iloc[:,-1]), 
                            'prediction':[val[0] for val in list(keras_a.predict(X_test.iloc[:,:-1], verbose=0))],
                            'target':y_test[0].values})

df2_test_keras = pd.pivot_table(model_b_test_keras, values=['prediction', 'target'], index='customer_id', columns='statement_age')
df2_test_keras = df2_test_keras.iloc[:,0:4] # remove unnecessary columns
df2_test_keras.columns = ['pred1', 'pred2', 'pred3', 'target']

imparray = SimpleImputer(strategy='mean').fit_transform(df2_test_keras)
df2_test_keras = pd.DataFrame(imparray, index=df2_test_keras.index, columns = df2_test_keras.columns)

model_b_test_xgb = pd.DataFrame({'customer_id':X_test.index.values,
                            'statement_age':list(X_test.iloc[:,-1]), 
                            'prediction':[val[1] for val in xgb_a.predict_proba(X_test.iloc[:,:-1])],
                            'target':y_test[0].values})

df2_test_xgb = pd.pivot_table(model_b_test_xgb, values=['prediction', 'target'], index='customer_id', columns='statement_age')
df2_test_xgb = df2_test_keras.iloc[:,0:4] # remove unnecessary columns
df2_test_xgb.columns = ['pred1', 'pred2', 'pred3', 'target']

imparray = SimpleImputer(strategy='mean').fit_transform(df2_test_xgb)
df2_test_xgb = pd.DataFrame(imparray, index=df2_test_xgb.index, columns = df2_test_xgb.columns)


X_test_b = pd.concat([df2_test_keras.drop(columns=['target']), df2_test_xgb.drop(columns=['target'])], axis=0)
y_test_b = pd.concat([df2_test_keras['target'], df2_test_xgb['target']],axis=0)


# Predict
y_pred_model_b = pd.DataFrame({'customer_id':X_test_b.index.values,
                        'prediction':[val[0] for val in list(keras_b.predict(X_test_b, verbose=0))]})

# Score
average_proba_keras_b = y_pred_model_b.groupby(by='customer_id').mean()
y_test_b_amexeval = pd.DataFrame(y_test_b.groupby(y_test_b.index).max())
print('consolidated score:', amex_metric(y_test_b_amexeval, average_proba_keras_b))


consolidated score: 0.7429679512849126
