In [62]:
import pandas as pd
import numpy as np

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, accuracy_score

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.compose import ColumnTransformer

from keras.models import Sequential
from keras.layers import Dense


# Shared random seed; passed as random_state to StratifiedKFold when the folds are built.
rand_state = 1337

In [63]:
# Load the feature table and normalise its column names to lowercase.
df_train_x = pd.read_parquet('./git-main/DSBA_6156_SERJ/ignore/train.parquet')
df_train_x.columns = [name.lower() for name in df_train_x.columns]

# Load the labels, lowercase the column names and key the frame by customer_id.
df_train_y = (
    pd.read_csv('./git-main/DSBA_6156_SERJ/ignore/train_labels.csv')
    .rename(columns=str.lower)
    .set_index('customer_id')
)

In [64]:
# Order rows by customer and then by `s_2`, and use customer_id as the index.
df_train_x = (
    df_train_x
    .sort_values(['customer_id', 's_2'])
    .set_index('customer_id')
)

In [65]:
# Attach each customer's label to every one of their rows (left join keeps all feature rows).
df_train = pd.merge(df_train_x, df_train_y, left_index=True, right_on='customer_id', how='left')

# Mark the row(s) holding each customer's latest `s_2`: a descending dense rank of 1.
# Comparing the rank Series directly is vectorised and replaces a per-row Python
# lambda over the whole table; it also avoids a temporary flag column.
is_last_statement = (
    df_train.groupby(df_train.index)['s_2']
    .rank(method='dense', ascending=False)
    .eq(1)
)

# Keep the target only on the latest row(s); it is multiplied by 0 everywhere else.
df_train['last_statement_target'] = df_train['target'] * is_last_statement.astype(int)

df_train.head()

Unnamed: 0_level_0,s_2,p_2,d_39,b_1,b_2,r_1,s_3,d_41,b_3,d_42,...,d_138,d_139,d_140,d_141,d_142,d_143,d_144,d_145,target,last_statement_target
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,,...,-1,0,0,0.0,,0,0.00061,0,0,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,,...,-1,0,0,0.0,,0,0.005492,0,0,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,,...,-1,0,0,0.0,,0,0.006986,0,0,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,,...,-1,0,0,0.0,,0,0.006527,0,0,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,,...,-1,0,0,0.0,,0,0.008126,0,0,0


In [None]:
# check for unique categorical values in train data

# cat_features_list = ['b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126', 'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87'] # sakshi

# cat_unq_train = []
# for col in cat_features_list:
#     # cat_unq_train.append([col, 0, list(df_train[col].dropna().unique())])
#     cat_unq_train.append([col, 0, list(df_train[col].unique())])

# cat_unq_train

In [None]:
# Example

# n = 3
# X = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'])
# y = np.array([1, 0, 1, 1, 0, 0, 0, 0, 0])
# skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=rand_state)
# skf.get_n_splits(X, y)

# fold = 0
# ids = [None]*n
# for train_index, test_index in skf.split(X, y):
#     ids[fold] =list(X[test_index])
#     print(list(X[train_index]))
#     print(list(y[train_index]))
#     print(list(X[test_index])) # <-- customer_ids stratified by default =1
#     print(list(y[test_index]))
#     fold += 1

In [149]:
# Generate n stratified folds for training and testing
n = 10

# Folds are built per customer: X holds the customer ids taken from the labels
# index and y the matching targets used for stratification.
X = np.array(df_train_y.index)
y = np.array(df_train_y['target'])
skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=rand_state)

# ids[k] is the list of customer ids in the held-out part of split k.
ids = [list(X[test_index]) for _, test_index in skf.split(X, y)]

# test = df_train[df_train.index.isin(ids[0])]

In [67]:
# Categorical pipeline: fill missing values with the most frequent value only.
# One-hot encoding is left out of this pipeline because the categorical
# variables won't play nice with dummies between test/train. Retry when we do
# a full train model; values can be imputed on test_data.csv if necessary.
# Disabled step: ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False))
categorical_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
])

# Numeric pipeline: mean imputation followed by min-max scaling.
numeric_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", MinMaxScaler()),
])

In [68]:
# Preparing the TRAINING data for creating and testing the model.
def prep_df(df, target, target_to_drop, missing_threshold=0.4):
    """Turn the merged statement table into model-ready predictors and target.

    Steps, in order:
      1. Drop ``s_2`` and ``target_to_drop``.
      2. Separate ``target`` from the predictors and lowercase predictor names.
      3. Drop predictors whose share of missing values is at least
         ``missing_threshold``.
      4. Expand the known categorical columns into dummy columns (first level
         dropped) and place them after the numeric columns.
      5. Fit and apply the module-level ``numeric_pipeline`` and
         ``categorical_pipeline`` through a ``ColumnTransformer``.
      6. Impute the target with its most frequent value.

    The caller's ``df`` is not modified. Note that all fitting happens on the
    full frame passed in, before any train/test selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing ``s_2``, ``target`` and ``target_to_drop``.
    target : str
        Name of the label column to return as ``y_processed``.
    target_to_drop : str
        Name of an alternative label column to discard.
    missing_threshold : float, default 0.4
        Predictors with a missing-value share >= this value are removed.

    Returns
    -------
    X_processed : pandas.DataFrame
        Transformed predictors, indexed like ``df``, with integer column
        labels. Columns follow the order of ``cols_list`` (numeric columns
        first, then dummy columns), assuming the imputers drop no column.
    y_processed : pandas.DataFrame
        Single-column frame (column label 0) with the imputed target.
    cols_list : list of str
        Lowercased predictor names after dummy expansion.
    """
    # Remember the row labels so the outputs can be re-indexed like the input.
    row_index = df.index

    # `drop` returns a new frame, so nothing below touches the caller's data.
    df = df.drop(columns=["s_2", target_to_drop])

    # Split into predictors and target before filtering, so the target itself
    # can never be removed by the missing-value filter.
    X = df.drop(columns=[target])
    y = df[target]

    # Normalise names once so every later lookup uses the same spelling.
    X.columns = X.columns.str.lower()

    # Remove predictors that are missing too often.
    missing_props = X.isna().mean(axis=0)
    too_sparse = missing_props[missing_props >= missing_threshold].index
    X = X.drop(columns=too_sparse)

    # Split categorical and numerical columns
    cat_cols_all = ['b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126', 'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87']
    known_categoricals = set(cat_cols_all)
    cat_cols = [col for col in X.columns if col in known_categoricals]
    num_cols = [col for col in X.columns if col not in known_categoricals]

    # Dummy-encode the categorical columns and append them after the numeric ones.
    Xcat = pd.get_dummies(X[cat_cols], columns=cat_cols, drop_first=True)
    X = pd.concat([X[num_cols], Xcat], axis=1)
    X.columns = X.columns.str.lower()
    cols_list = X.columns.tolist()

    # From here on the "categorical" columns are the generated dummy columns.
    numeric_names = set(num_cols)
    cat_cols = [col for col in cols_list if col not in numeric_names]

    full_processor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_pipeline, num_cols),
            ("categorical", categorical_pipeline, cat_cols),
        ]
    )

    # Apply preprocessing
    X_processed = full_processor.fit_transform(X)
    X_processed = pd.DataFrame(X_processed, index=row_index)
    print(X_processed.shape)

    # SimpleImputer expects a 2-D array, hence the reshape to a single column.
    y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
        y.values.reshape(-1, 1)
    )
    y_processed = pd.DataFrame(y_processed, index=row_index)

    return X_processed, y_processed, cols_list
    

In [69]:
# Build the processed predictors, the target and the matching feature-name list.
X_processed, y_processed, cols_list = prep_df(
    df_train,
    target='target',
    target_to_drop='last_statement_target',
)

(5531451, 203)


In [223]:
# Rows of customers in the first fold are used for training, rows of customers
# in the second fold for testing. Each mask is positional over df_train's rows.
train_rows = df_train.index.isin(ids[0])
test_rows = df_train.index.isin(ids[1])

X_train, y_train = X_processed[train_rows], y_processed[train_rows]
X_test, y_test = X_processed[test_rows], y_processed[test_rows]

In [211]:
# define the keras model
# The input width is read from the prepared training matrix rather than being
# hard-coded, so the network keeps matching the data if prep_df ends up
# keeping a different number of columns.
n_features = X_train.shape[1]

model = Sequential([
    Dense(240, input_shape=(n_features,), activation='relu'),
    Dense(120, activation='relu'),
    Dense(40, activation='relu'),
    Dense(8, activation='relu'),
    # Single sigmoid unit: one probability-like output per row.
    Dense(1, activation='sigmoid'),
])

In [212]:
# Compile with binary cross-entropy loss, the Adam optimizer and accuracy tracking.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [213]:
# Train on the selected training rows.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ea28578148>

In [214]:
# Persist the trained model under the 'keras_model_ver0' path.
model.save('keras_model_ver0')

INFO:tensorflow:Assets written to: keras_model_ver0\assets


In [163]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions as the mean of two rank-based measures.

    Rows are ordered by ``prediction`` (highest first) and weighted 20 when
    ``target`` equals 0 and 1 otherwise. The score is the average of:

    * the normalized weighted Gini: the weighted Gini of the predictions
      divided by the weighted Gini obtained when the targets themselves are
      used as predictions;
    * the share of all ``target == 1`` rows found among the top-ranked rows
      whose cumulative weight stays within 4% of the total weight.

    Parameters
    ----------
    y_true : pandas.DataFrame
        Frame with a ``target`` column.
    y_pred : pandas.DataFrame
        Frame with a ``prediction`` column. The two frames are concatenated
        column-wise, so they are aligned on their index.

    Returns
    -------
    float
        ``0.5 * (normalized_gini + top_four_percent_capture_rate)``.
    """

    def _ranked(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Labels and scores side by side, best score first, plus row weights.
        ranked = (pd.concat([labels, scores], axis='columns')
                  .sort_values('prediction', ascending=False))
        ranked['weight'] = ranked['target'].eq(0).map({True: 20, False: 1})
        return ranked

    def _capture_rate(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(labels, scores)
        weight = ranked['weight']
        cutoff = int(0.04 * weight.sum())
        within_cutoff = weight.cumsum() <= cutoff
        is_positive = ranked['target'] == 1
        return (within_cutoff & is_positive).sum() / is_positive.sum()

    def _weighted_gini(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(labels, scores)
        weight = ranked['weight']
        # Cumulative share of total weight seen so far.
        random_share = (weight / weight.sum()).cumsum()
        # Cumulative share of weighted positives seen so far.
        weighted_positives = ranked['target'] * weight
        lorentz = weighted_positives.cumsum() / weighted_positives.sum()
        return ((lorentz - random_share) * weight).sum()

    # Using the targets as predictions gives the normalising Gini value.
    targets_as_scores = y_true.rename(columns={'target': 'prediction'})
    g = _weighted_gini(y_true, y_pred) / _weighted_gini(y_true, targets_as_scores)
    d = _capture_rate(y_true, y_pred)

    return 0.5 * (g + d)

In [215]:
# Score every test row and keep the customer id next to each prediction.
raw_scores = model.predict(X_test)
y_pred = pd.DataFrame({
    'customer_id': X_test.index.tolist(),
    'prediction': [row[0] for row in raw_scores],
})



In [216]:
# Collapse row-level values to one row per customer by taking the maximum,
# and label the target column so it matches what amex_metric reads.
y_pred_amexeval = y_pred.groupby('customer_id')[['prediction']].max()
y_test_amexeval = (
    y_test
    .groupby(level=0)
    .max()
    .rename(columns={0: 'target'})
)

In [217]:
# Evaluate the per-customer predictions against the per-customer targets.
fold_score = amex_metric(y_test_amexeval, y_pred_amexeval)
print(fold_score)


0.6830288664482157
