In [96]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

def set_col_types(df):
    if "target" in df.columns:
        categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68', 'target']
    else:
        categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    df['customer_ID'] = df['customer_ID'].astype("string")
    
    for col in categorical_cols:
        df[col] = df[col].astype("string")
    df["S_2"] = pd.to_datetime(df['S_2'], format=r'%Y-%m-%d').astype('datetime64[ns]')
    df["B_31"] = df["B_31"].astype(np.int8)
    return df

def sync_cols(train_df, pred_df):
    for col in train_df.columns:
      if col not in pred_df.columns:
        print(col, "not in pred_df so adding - should always be categorical!")
        pred_df[col] = 0
    for col in pred_df.columns:
      if col not in train_df.columns:
        print(col, "not in train_df so dropping")
        pred_df = pred_df.drop(col, axis=1)
    return pred_df

In [107]:
# Import necessary libraries
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# Read data from parquet file
df = pd.read_parquet(r"../../amex-default-prediction/train_data.parquet")
#reduce df for development !!!!! comment out line below for final model
#df = df[:100000]

# Set the data types for the columns
df = set_col_types(df)

#engineer statement num
df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8) 
df = df[df["statement_num"] == 1]
df.reset_index(inplace=True)
#engineer date cols
df["Month"] = df["S_2"].dt.month
df["Day"] = df["S_2"].dt.day
df["Year"] = df["S_2"].dt.year
df = df.drop(["S_2"], axis=1)

# Separate target variable and feature columns
target = df["target"]
labels = df['customer_ID']
features = df.drop(["customer_ID", "target"], axis=1)

# Impute missing values using mode for categorical columns and median for numerical columns
cat_columns = features.select_dtypes(include=["string"]).columns
num_columns = features.select_dtypes(include="number").columns

# Replace missing values in the categorical columns with the most frequent value
# for col in cat_columns:
#     features[col].fillna("NA", inplace=True)

# Replace missing values in the numerical columns with the median value
for col in num_columns:
    features[col].fillna(features[col].mean(), inplace=True)

features = pd.get_dummies(features, dummy_na=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Train a LightGBM model on the training data
model = LGBMClassifier()
model.set_params(**{'bagging_fraction': 0.9212862922324698, 'bagging_freq': 6, 'feature_fraction': 0.5071870813127417, 'learning_rate': 0.09393773349524241, 'num_leaves': 82, 'reg_alpha': 0.8316783754122955, 'reg_lambda': 0.1695940899747675})
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred, digits=5))


              precision    recall  f1-score   support

           0    0.93282   0.93183   0.93232    102185
           1    0.80431   0.80676   0.80553     35489

    accuracy                        0.89959    137674
   macro avg    0.86856   0.86929   0.86893    137674
weighted avg    0.89969   0.89959   0.89964    137674



In [104]:
amex_test = y_test.reset_index(drop=True).to_frame()
amex_test['target'] = amex_test['target'].astype('int')

amex_pred = model.predict_proba(X_test)
amex_pred = pd.DataFrame(amex_pred,columns=["proba-inv","proba"])
amex_pred = amex_pred.drop(columns=['proba-inv'])
amex_pred = amex_pred.rename(columns={"proba":"prediction"})


amex_metric(amex_test, amex_pred)

0.7798888718374688

In [106]:
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

train_data = lgb.Dataset(X_train, label=y_train.astype(int))

# Define the search space for the hyperparameters
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1.0),
    'bagging_freq': hp.quniform('bagging_freq', 5, 25, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
}

# Define the objective function that will be minimized
def objective(params):
    params = {
        'learning_rate': params['learning_rate'],
        'num_leaves': int(params['num_leaves']),
        'feature_fraction': params['feature_fraction'],
        'bagging_fraction': params['bagging_fraction'],
        'bagging_freq': int(params['bagging_freq']),
        'reg_alpha': params['reg_alpha'],
        'reg_lambda': params['reg_lambda'],
    }

    # Use the LightGBM model to train on the data with the given hyperparameters
    model = lgb.train(params, train_data)


    
    # Calculate the accuracy of the model on the validation data
    amex_test = y_test.reset_index(drop=True).to_frame()
    amex_test['target'] = amex_test['target'].astype(int)

    amex_pred = model.predict(X_test)
    amex_pred = pd.DataFrame(amex_pred,columns=["prediction"])
    #amex_pred = amex_pred.drop(columns=['proba-inv'])
    #amex_pred = amex_pred.rename(columns={"proba":"prediction"})


    accuracy = float(amex_metric(amex_test, amex_pred))


    # Return the negative accuracy since we want to minimize the objective
    return {'loss': -accuracy, 'status': STATUS_OK}

# Use the Tree-structured Parzen Estimator (TPE) to find the best set of hyperparameters
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=Trials())

# Print the best set of hyperparameters
print(best)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45010                     
[LightGBM] [Info] Number of data points in the train set: 321239, number of used features: 226
[LightGBM] [Info] Start training from score 0.259430   
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45010                                                
[LightGBM] [Info] Number of data points in the train set: 321239, number of used features: 226
[LightGBM] [Info] Start training from score 0.259430                              
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45010                                                
[LightGBM] [Info] Number of data points in the train set: 321239, number of used features: 226
[LightGBM] [Info] Start training from score 0.259430                              
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45010     

In [108]:
# Read test data from parquet file
df = pd.read_parquet(r"../../amex-default-prediction/test_data.parquet")
#df = df[:100000]

# Set the data types for the columns
df = set_col_types(df)

# Engineer statement num
df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8)
df = df[df["statement_num"] == 1]
df.reset_index(inplace=True)
# Engineer date cols
df["Month"] = df["S_2"].dt.month
df["Day"] = df["S_2"].dt.day
df["Year"] = df["S_2"].dt.year
df = df.drop(["S_2"], axis=1)

# Separate labels and feature columns
labels = df['customer_ID']
features = df.drop(["customer_ID"], axis=1)

# Impute missing values using mode for categorical columns and median for numerical columns
cat_columns = features.select_dtypes(include=["string"]).columns
num_columns = features.select_dtypes(include="number").columns

# Replace missing values in the categorical columns with the most frequent value
# for col in cat_columns:
#     features[col].fillna("NA", inplace=True)

# Replace missing values in the numerical columns with the median value
for col in num_columns:
    features[col].fillna(features[col].mean(), inplace=True)

features = pd.get_dummies(features, dummy_na=True)

features = sync_cols(X_train, features)

# Make predictions on the test data
y_pred = model.predict(features)
y_prob = model.predict_proba(features)



In [109]:
# Add the labels to the predictions
prediction_output = pd.concat([labels,pd.DataFrame(y_pred,columns=["pred"]),pd.DataFrame(y_prob,columns=["proba-inv","proba"])], axis=1)

prediction_output = prediction_output.drop(columns=['proba-inv','pred'])
prediction_output = prediction_output.rename(columns={"proba":"prediction"})

prediction_output.to_csv("train_last_pred_last_mean_tuned_but_int.csv", index=False)