In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, names in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Introduction

The objective of this competition is to predict the probability that a customer will not pay back their credit card balance in the future, based on their monthly customer profile. The binary target variable is calculated by observing an 18-month performance window after the latest credit card statement: if the customer does not pay the due amount within 120 days after their latest statement date, it is considered a default event.

The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories:

- `D_*` = Delinquency variables
- `S_*` = Spend variables
- `P_*` = Payment variables
- `B_*` = Balance variables
- `R_*` = Risk variables

The following features are categorical:

['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

Your task is to predict, for each customer_ID, the probability of a future payment default (target = 1).

Note that the negative class has been subsampled for this dataset at 5%, and thus receives a 20x weighting in the scoring metric.

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load the data

In [None]:
# Read the training data from a Feather file (memory-efficient format);
# the converted dataset is available on Kaggle: https://www.kaggle.com/datasets/munumbutt/amexfeather
train_df = pd.read_feather('../input/amexfeather/train_data.ftr')

In [None]:
# Preview the first rows of the freshly loaded training frame.
train_df.head()

In [None]:
# Print pandas' summary of the frame (index range, column dtypes, memory usage).
train_df.info()

In [None]:
# Count the missing values in every column.
train_df.isna().sum()

In [None]:
# Shape of the dataset as (rows, columns).
train_df.shape

In [None]:
# Number of distinct customers (unique customer_ID values) in the data.

train_df["customer_ID"].nunique()

In [None]:
# Target class distribution, expressed as a fraction of all rows.
# The first positional parameter of value_counts is the boolean `normalize`;
# it is passed explicitly here rather than relying on a truthy placeholder string.
train_df["target"].value_counts(normalize=True)

### EDA

In [None]:
# Handling missing values:
# collect the columns whose percentage of missing entries is greater than 70
# so that they can be dropped.

missing_cols = (
    train_df.isna()
    .sum()
    .mul(100)
    .div(len(train_df))
    .sort_values(ascending=False)
)
# Two-column frame: 'index' holds the column name, 0 holds its missing percentage.
missing_cols_df = missing_cols.to_frame().reset_index()
too_sparse = missing_cols_df[0] > 70
drop_cols = missing_cols_df.loc[too_sparse, 'index'].values
print(drop_cols)

In [None]:
# Preview the frame before the sparse columns are removed.
train_df.head()

In [None]:
# Remove the sparse columns collected in `drop_cols`.
# Rebinding instead of mutating in place, together with errors="ignore", keeps
# this cell safe to re-run: columns that are already gone are simply skipped
# instead of raising KeyError. (`axis` is not needed when `columns=` is given.)
train_df = train_df.drop(columns=drop_cols, errors="ignore")

In [None]:
# For categorical columns:
# every column that is not numeric/boolean is treated as categorical, except
# the statement date (S_2) and the customer identifier.

cols = train_df.columns
# Public select_dtypes API instead of the private _get_numeric_data();
# "bool" is included so boolean columns keep being classed as numeric.
num_cols = train_df.select_dtypes(include=["number", "bool"]).columns

# List comprehensions (rather than set arithmetic) keep the original column
# order, so the result is identical from run to run.
numeric_names = set(num_cols)
categorical_columns = [c for c in cols if c not in numeric_names]
filtered_categorical_columns = [
    c for c in categorical_columns if c not in {"S_2", "customer_ID"}
]

In [None]:
# Number of distinct values in each categorical feature.
train_df[filtered_categorical_columns].nunique()

In [None]:
# Percentage of missing values in each categorical feature.
train_df[filtered_categorical_columns].isna().sum().mul(100).div(len(train_df))

In [None]:
# Show how often each level occurs, one categorical feature at a time.
for column_name in filtered_categorical_columns:
    level_counts = train_df[column_name].value_counts()
    print(level_counts)

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing categorical values with each column's most frequent value.
imputer = SimpleImputer(strategy="most_frequent")
# fit_transform returns a bare array, so the frame is rebuilt with train_df's
# own index. Assigning it back into train_df aligns on index labels; without
# index=train_df.index the rows would only line up if train_df happened to
# carry a default 0..n-1 index.
transformed_df = pd.DataFrame(
    imputer.fit_transform(train_df[filtered_categorical_columns]),
    columns=filtered_categorical_columns,
    index=train_df.index,
)

In [None]:
# Write the imputed categorical columns back into the training frame
# (pandas aligns the assigned frame to train_df on index labels).
train_df[filtered_categorical_columns] = transformed_df[filtered_categorical_columns]

In [None]:
# For numeric columns: replace missing values with the mean of the column.
numeric_columns = train_df.select_dtypes(np.number).columns
column_means = train_df[numeric_columns].mean()
train_df[numeric_columns] = train_df[numeric_columns].fillna(column_means)

In [None]:
# Preview the frame after the missing values have been filled.
train_df.head()

In [None]:
# Handling date column:
# expose the day, month and year of the statement date S_2 as separate features.

statement_dates = train_df["S_2"].dt
for part in ("day", "month", "year"):
    train_df[f"S_2_{part}"] = getattr(statement_dates, part)

In [None]:
# Keep only one data point per customer: the most recent statement.
# - Sorting by customer and statement date makes "last row" mean "latest
#   statement" regardless of the order the rows were loaded in.
# - drop_duplicates(keep="last") behaves the same on every pandas version,
#   whereas groupby().nth(-1) changed in pandas 2.0 (it became a filter that
#   keeps the grouping column in the result).
# - customer_ID is dropped explicitly so the identifier never reaches the model
#   as a feature.
train_df = (
    train_df.sort_values(["customer_ID", "S_2"])
    .drop_duplicates(subset="customer_ID", keep="last")
    .drop(columns=["customer_ID"])
    .reset_index(drop=True)
)

In [None]:
# The raw statement date is no longer needed once its parts have been extracted.
train_df.drop(columns="S_2", inplace=True)

In [None]:
# Converting the categorical features to numeric values for the model.
def _encode_categorical(series: pd.Series) -> pd.Series:
    """Return a numeric version of one categorical column.

    Values that parse as numbers are converted directly. If any non-missing
    value fails to parse (i.e. the column holds text labels), the column is
    encoded as integer category codes instead, because
    ``pd.to_numeric(errors='coerce')`` would silently replace those labels
    with NaN and discard the feature. Missing values stay NaN in both cases.
    """
    numeric = pd.to_numeric(series.astype(object), errors="coerce")
    lost_values = numeric.isna() & series.notna()
    if not lost_values.any():
        return numeric
    codes = series.astype("category").cat.codes
    # cat.codes marks missing entries with -1; turn them back into NaN.
    return codes.where(codes >= 0)


cols = ["D_63", "D_64", "D_68", "B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126"]
# Skip any listed column that an earlier cleaning step has already removed.
cols = [c for c in cols if c in train_df.columns]
if cols:
    train_df[cols] = train_df[cols].apply(_encode_categorical)

### Modelling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBClassifier
import xgboost as xgb
from datetime import datetime, timedelta

In [None]:
# https://www.kaggle.com/code/inversion/amex-competition-metric-python

def amex_metric_official(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Competition metric: mean of the normalized weighted Gini and the
    default rate captured in the top 4% of weighted predictions.

    Parameters
    ----------
    y_true : frame with a ``target`` column (0 or 1).
    y_pred : frame with a ``prediction`` column (higher = more likely default).

    Both frames are joined column-wise with ``pd.concat``, which aligns rows on
    the index, so they must share the same index.

    Returns
    -------
    ``0.5 * (G + D)`` where ``G`` is the normalized weighted Gini and ``D`` is
    the share of positives found within the top 4% of total weight.
    """

    def _ranked(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by descending score and attach the row
        # weight: 20 for target == 0, 1 otherwise.
        joined = pd.concat([labels, scores], axis='columns')
        ranked = joined.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked(y_true, y_pred)
        weight_budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= weight_budget
        is_positive = ranked['target'] == 1
        return (within_budget & is_positive).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked(y_true, y_pred)
        weight = ranked['weight']
        random_curve = (weight / weight.sum()).cumsum()
        weighted_positives = ranked['target'] * weight
        total_pos = weighted_positives.sum()
        lorentz_curve = weighted_positives.cumsum() / total_pos
        return ((lorentz_curve - random_curve) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # The Gini of a perfect ranking (labels used as their own scores)
        # is the normalizer.
        perfect_scores = y_true.rename(columns={'target': 'prediction'})
        achieved = weighted_gini(y_true, y_pred)
        best_possible = weighted_gini(y_true, perfect_scores)
        return achieved / best_possible

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

In [None]:
# Separate the label column from the predictor columns.
y = train_df["target"]
X = train_df.drop(columns="target")

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=100,
)

In [None]:
# Baseline XGBoost classifier configuration.
# Options tried but currently disabled: early_stopping_rounds=10,
# tree_method='gpu_hist', enable_categorical=True.
xgb_params = dict(
    objective='binary:logistic',
    n_estimators=10,
    seed=123,
    use_label_encoder=False,
    eval_metric='aucpr',
)
xgb_classifier = XGBClassifier(**xgb_params)
xgb_classifier.fit(X_train, y_train)

In [None]:
# Predicted class labels for the hold-out set.
y_pred = xgb_classifier.predict(X_test)

In [None]:
# Predicted probabilities for the hold-out set; column 1 of predict_proba is kept.
y_pred_prob = xgb_classifier.predict_proba(X_test)[:,1]

In [None]:
# Wrap the labels and predictions in single-column frames for scoring.
# After train_test_split, y_test still carries the shuffled row labels of the
# original frame, while frames built from the prediction arrays get a fresh
# 0..n-1 index. amex_metric_official joins its two inputs with
# pd.concat(axis='columns'), which aligns on index labels, so y_test is
# re-indexed positionally; otherwise labels and predictions would be paired
# with the wrong rows (and padded with NaN).
y_test = pd.DataFrame(y_test, columns=["target"]).reset_index(drop=True)
y_pred = pd.DataFrame(y_pred, columns=["prediction"])
y_pred_prob = pd.DataFrame(y_pred_prob, columns=["prediction"])

In [None]:
# Compute the competition metric from the predicted probabilities.
amex_metric_official(y_test, y_pred_prob)

In [None]:
# Compute accuracy of the predicted class labels on the hold-out set
# and print it as a percentage with two decimals.
accuracy = metrics.accuracy_score(y_test["target"], y_pred["prediction"])
print(f'accuracy: {accuracy: .2%}')

In [None]:
import joblib
# Persist the fitted classifier to the working directory with joblib.
# NOTE(review): joblib.dump writes a joblib/pickle file, not HDF5, despite the
# ".h5" extension — consider renaming; only joblib.load files you trust.
joblib.dump(xgb_classifier, "xgb_classifier_v1.h5")