In [None]:
# Import libraries
import os
import warnings

import numpy as np
import pandas as pd

import gc  # Garbage collector

# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and chained-assignment
# warnings raised by the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')

#### For EDA, refer to: https://www.kaggle.com/code/awaldeep/first-look-eda/data

### Data pre-processing

In [None]:
# Read the training data from the feather-format copy of the dataset
# (memory efficient; available on Kaggle: https://www.kaggle.com/datasets/munumbutt/amexfeather).
# The path is relative to the Kaggle notebook working directory.
train_raw = pd.read_feather('../input/amexfeather/train_data.ftr')

In [None]:
# Preview the first two rows of the raw training frame.
train_raw.head(2)

In [None]:
# Print the summary produced by DataFrame.info() for the raw training frame.
train_raw.info()

In [None]:
# Percentage of missing values per column, sorted from most to least sparse
missing_counts = train_raw.isna().sum()
tmp = (missing_counts * 100 / len(train_raw)).sort_values(ascending=False)

### Handling missing values

In [None]:
# Collect the names of the columns whose missing-value percentage exceeds 70
missingDF = pd.DataFrame(tmp).reset_index()
too_sparse = missingDF[0] > 70
drop_cols = missingDF.loc[too_sparse, "index"].values
print(drop_cols)

In [None]:
# Display the frame as it stands before the sparse columns are dropped.
train_raw

In [None]:
# Remove the sparse columns listed in drop_cols.
# Re-assigning the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
train_raw = train_raw.drop(columns=drop_cols, errors="ignore")

In [None]:
# For categorical columns
cols = train_raw.columns
# Public replacement for the private `_get_numeric_data()` accessor: select the
# numeric and boolean columns.
# NOTE(review): assumed to match `_get_numeric_data()` for the dtypes in this
# frame — confirm if nullable/extension dtypes are ever introduced.
num_cols = train_raw.select_dtypes(include=["number", "bool"]).columns

# List comprehensions (rather than set arithmetic) keep the frame's column order,
# so the result is the same on every kernel restart, and avoid copying a
# sub-frame just to read its column names.
numeric_names = set(num_cols)
non_feature_columns = {"S_2", "customer_ID"}  # date and identifier, not features
categorical_columns = [name for name in cols if name not in numeric_names]
filtered_categorical_columns = [
    name for name in categorical_columns if name not in non_feature_columns
]

In [None]:
# Number of distinct values in each categorical feature.
train_raw[filtered_categorical_columns].nunique()

In [None]:
# Percentage of missing values in each categorical feature.
train_raw[filtered_categorical_columns].isna().sum().mul(100).div(len(train_raw))

In [None]:
# Print the frequency table of every categorical feature
for column_name in filtered_categorical_columns:
    print(train_raw[column_name].value_counts())

In [None]:
from sklearn.impute import SimpleImputer
# Fill missing categorical values with each column's most frequent value
imputer = SimpleImputer(strategy="most_frequent")
imputed_values = imputer.fit_transform(train_raw[filtered_categorical_columns])
transformed_df = pd.DataFrame(imputed_values, columns=filtered_categorical_columns)

In [None]:
# Write the imputed categorical columns back into the training frame.
# NOTE(review): the assignment aligns on index labels; transformed_df carries a
# default 0..n-1 index, so this presumably relies on train_raw having the same
# index at this point — confirm.
train_raw[filtered_categorical_columns] = transformed_df[filtered_categorical_columns]

In [None]:
# For numeric columns: replace missing values with the column mean
numeric_columns = train_raw.select_dtypes(np.number).columns
column_means = train_raw[numeric_columns].mean()
train_raw[numeric_columns] = train_raw[numeric_columns].fillna(column_means)

In [None]:
# Preview the frame after the missing-value handling above.
train_raw.head()

In [None]:
# Handling date column: split S_2 into separate day / month / year features
s2_accessor = train_raw["S_2"].dt
for new_column, part in (("S_2_day", "day"), ("S_2_month", "month"), ("S_2_year", "year")):
    train_raw[new_column] = getattr(s2_accessor, part)


In [None]:
# Keep only one data point per customer (the latest one), as the time series is
# not being used.
#
# Why not groupby(...).nth(-1): its result shape depends on the pandas version.
# Older releases move customer_ID into the index (so reset_index(drop=True)
# silently discards it), while pandas >= 2.0 treats nth as a filter and keeps
# customer_ID as a regular string column, which then leaks into the feature
# matrix. The steps below behave the same on both and make the drop explicit.
#
# Sorting only the two key columns is cheap and guarantees that "last" really is
# the most recent S_2 per customer, whatever the file's row order.
statement_order = train_raw[["customer_ID", "S_2"]].sort_values(["customer_ID", "S_2"])
latest_rows = statement_order.drop_duplicates(subset="customer_ID", keep="last").index
train_raw = (
    train_raw.loc[latest_rows]
    .drop(columns=["customer_ID"])  # identifier, not a feature
    .reset_index(drop=True)
)

In [None]:
# Drop S_2 now that its day / month / year parts have been extracted.
# Re-assigning with errors="ignore" (instead of inplace=True) lets the cell be
# re-run without raising KeyError once the column is already gone.
train_raw = train_raw.drop(columns=["S_2"], errors="ignore")

In [None]:
# Convert the listed categorical features to numeric dtype, one column at a time.
# With errors='coerce', any value that cannot be parsed as a number becomes NaN.
cols = ["D_68", "B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126"]
for feature in cols:
    train_raw[feature] = pd.to_numeric(train_raw[feature], errors='coerce')

## Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBClassifier
import xgboost as xgb
from datetime import datetime, timedelta

In [None]:
# https://www.kaggle.com/code/inversion/amex-competition-metric-python

def amex_metric_official(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column.

    The two frames are joined with ``pd.concat(axis='columns')``, which aligns
    rows on index labels, so both must share the same index.
    Rows with ``target == 0`` get weight 20; all other rows get weight 1.
    """

    def _ranked_with_weights(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by descending score, attach row weights.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].eq(0).map({True: 20, False: 1})
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Share of all positives found within the top 4% of cumulative weight.
        ranked = _ranked_with_weights(y_true, y_pred)
        weights = ranked['weight']
        four_pct_cutoff = int(0.04 * weights.sum())
        within_cutoff = ranked.loc[weights.cumsum() <= four_pct_cutoff, 'target']
        return (within_cutoff == 1).sum() / (ranked['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Weighted sum of the gap between the Lorentz curve and the random curve.
        ranked = _ranked_with_weights(y_true, y_pred)
        weights = ranked['weight']
        weighted_pos = ranked['target'] * weights
        random_curve = (weights / weights.sum()).cumsum()
        lorentz_curve = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz_curve - random_curve) * weights).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Scale by the Gini obtained when the labels themselves are the scores.
        perfect_scores = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect_scores)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

In [None]:
# Separate the label column from the feature matrix
y = train_raw["target"]
X = train_raw.drop(columns=["target"])

In [None]:
# Hold out 33% of the rows for evaluation; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33,random_state=100)

In [None]:
# label encoding
from sklearn.preprocessing import OrdinalEncoder

categorical_columns = ["D_63","D_64"]

# The encoder is fitted on the training split only; categories it has not seen
# there are encoded as -999 when transforming other data.
oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)

X_train_enc = oe.fit_transform(X_train[categorical_columns])
X_test_enc = oe.transform(X_test[categorical_columns])

# Replace the string columns with their integer codes
X_train[categorical_columns] = X_train_enc
X_test[categorical_columns] = X_test_enc

In [None]:
# X_train.to_csv("x_train.csv", index=False)
# X_test.to_csv("x_test.csv", index=False)
# y_train.to_csv("y_train.csv", index=False)
# y_test.to_csv("y_test.csv", index=False)

In [None]:
# Gradient-boosted tree classifier for the binary default target.
#
# The original call passed both eta=0.2 and learning_rate=0.02. They are two
# names for the same XGBoost parameter, so the effective step size was ambiguous.
# Only learning_rate (the documented scikit-learn-API name) is kept.
# NOTE(review): if 0.2 was the intended step size, change the value here.
#
# Options that were previously left commented out, kept for reference:
#   early_stopping_rounds=10, tree_method='gpu_hist', enable_categorical=True
xgb_classifier = XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    learning_rate=0.02,
    seed=12,
    use_label_encoder=False,
    eval_metric='aucpr',
)
xgb_classifier.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out rows (used for the accuracy score).
y_pred = xgb_classifier.predict(X_test)

In [None]:
# Second column of predict_proba — presumably the probability of target == 1
# (confirm against xgb_classifier.classes_).
y_pred_prob = xgb_classifier.predict_proba(X_test)[:,1]


In [None]:
# Wrap labels and predictions in single-column frames for amex_metric_official.
#
# The metric joins its two inputs with pd.concat(axis='columns'), which aligns
# rows on index labels. y_test still carries the shuffled row labels from
# train_test_split, whereas the predictions are plain arrays that would get a
# fresh 0..n-1 index — labels and scores would then be paired incorrectly.
# Building every frame from its raw values gives all three the same 0..n-1 index
# in prediction order. It also keeps the cell safe to re-run when the names
# already hold DataFrames.
y_test = pd.DataFrame({"target": np.asarray(y_test).ravel()})
y_pred = pd.DataFrame({"prediction": np.asarray(y_pred).ravel()})
y_pred_prob = pd.DataFrame({"prediction": np.asarray(y_pred_prob).ravel()})

In [None]:
# Compute the competition metric from the predicted probabilities.
# The metric aligns y_test and y_pred_prob on their index labels.
amex_metric_official(y_test, y_pred_prob)

In [None]:
# Compute accuracy of the predicted class labels against the true labels
accuracy = metrics.accuracy_score(y_test["target"], y_pred["prediction"])
# Printed as a percentage with two decimals
print(f'accuracy: {accuracy: .2%}')

In [None]:
import joblib
# Persist the fitted classifier with joblib.
# NOTE(review): despite the ".h5" suffix this is a joblib pickle, not HDF5; the
# file name is kept because the load cell below refers to it.
joblib.dump(xgb_classifier, "xgb_classifier_v1.h5")

In [None]:
import joblib
# Persist the fitted ordinal encoder so the same category codes can be applied
# when scoring new data.
# NOTE(review): despite the ".h5" suffix this is a joblib pickle, not HDF5.
joblib.dump(oe, "oe.h5")

In [None]:
# # load the model
# import joblib
# xgb_classifier = joblib.load("../input/01-starter-xgboost-implementation/xgb_classifier_v1.h5")

In [None]:
# submission in 02. xgboost implementation


## DO UPVOTE !