Cloned from
- [First submission using CatBoost](https://www.kaggle.com/code/aninda/first-submission-using-catboost)
- The original dataset: [https://www.kaggle.com/code/aninda/creating-smaller-train-test-data](https://www.kaggle.com/code/aninda/creating-smaller-train-test-data)


This version replaces the input data with the [AE Credit ID Encoded Dataset [FP16]](https://www.kaggle.com/competitions/amex-default-prediction/discussion/327228) dataset.

The whole purpose here is to cross-validate different dataset compression methods to see if we have done something wrong.

# Creating a basic submission using CatBoost Model

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

In [None]:
# Load the FP16, ID-encoded training rows and the matching label table.
# NOTE(review): read_pickle unpickles arbitrary objects — only load files from a trusted source.
train = pd.read_pickle("../input/ae-credit-id-encoded-dataset-fp16/id_encoded_fp16_train_data.pkl")
labels = pd.read_pickle("../input/ae-credit-id-encoded-dataset-fp16/id_encoded_train_labels.pkl")

In [None]:
# Approach taken from
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327094
# Keep one row per customer: the last row of each customer_ID group, in the
# frame's current row order.
train = train.groupby('customer_ID').tail(1)
# Index by customer and order the rows by that index.
train = train.set_index('customer_ID', drop=True).sort_index()
# The S_2 column is not used as a feature.
train = train.drop(columns=['S_2'])

In [None]:
# Attach the label to each customer row; the left join keeps every row of `train`.
train = train.merge(labels, on="customer_ID", how="left")

In [None]:
# Display (rows, columns) of the merged training frame.
train.shape

In [None]:
# The 11 columns that are treated as categorical features.
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

all_cols = train.columns.to_list()

# Every remaining column that is neither the label nor the customer key is
# treated as numeric. The exclusion set is built once, outside the comprehension.
non_numeric = {*cat_cols, "target", "customer_ID"}
num_cols = [name for name in all_cols if name not in non_numeric]

In [None]:
# Display the list of numeric feature names.
num_cols

In [None]:
# Feature matrix: categorical columns first, then the numeric ones.
train_X = train[[*cat_cols, *num_cols]]
# Labels as a one-column frame named `target`, the shape amex_metric expects.
train_y = train["target"].to_frame()

In [None]:
# The categorical columns are passed to CatBoost through `cat_features`, so
# store them as strings.
# astype with a column -> dtype mapping returns a new frame in one step; this
# avoids assigning column by column into `train_X`, which was selected out of
# `train` and can trigger pandas' SettingWithCopyWarning for that pattern.
train_X = train_X.astype({col: str for col in cat_cols})

**Competition metric**

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Rows are ranked by ``prediction`` in descending order. Rows whose
    ``target`` equals 0 get weight 20, every other row gets weight 1.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column. The two frames are
            concatenated column-wise, so their rows are matched by index.

    Returns:
        ``0.5 * (g + d)`` where ``g`` is the weighted Gini of ``y_pred``
        divided by the weighted Gini obtained when ``target`` itself is used
        as the prediction, and ``d`` is the share of all ``target == 1`` rows
        found within the first 4% of cumulative weight.
    """

    def rank_by_prediction(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Shared preparation for both sub-metrics: align, rank, attach weights.
        ranked = (pd.concat([labels, scores], axis='columns')
                  .sort_values('prediction', ascending=False))
        # Vectorized form of "20 if target == 0 else 1".
        ranked['weight'] = np.where(ranked['target'] == 0, 20, 1)
        return ranked

    def top_four_percent_captured(ranked: pd.DataFrame) -> float:
        four_pct_cutoff = int(0.04 * ranked['weight'].sum())
        within_cutoff = ranked['weight'].cumsum() <= four_pct_cutoff
        is_positive = ranked['target'] == 1
        return (within_cutoff & is_positive).sum() / is_positive.sum()

    def weighted_gini(ranked: pd.DataFrame) -> float:
        weight = ranked['weight']
        weighted_pos = ranked['target'] * weight
        random = (weight / weight.sum()).cumsum()
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    # Rank once per prediction source; both sub-metrics only read the result.
    ranked_pred = rank_by_prediction(y_true, y_pred)
    ranked_ideal = rank_by_prediction(
        y_true, y_true.rename(columns={'target': 'prediction'}))

    g = weighted_gini(ranked_pred) / weighted_gini(ranked_ideal)
    d = top_four_percent_captured(ranked_pred)

    return 0.5 * (g + d)

**Testing the competition metric**

In [None]:
# Baseline for exercising the metric: a constant-zero `prediction` column
# sharing train_y's index.
y_pred = pd.DataFrame({"prediction": 0}, index=train_y.index)
y_pred.head()

In [None]:
# Score the constant baseline to check that the metric runs end to end.
# NOTE(review): every prediction is tied here, so the ranking inside the metric
# depends on how the sort orders ties — treat the value as a smoke test only.
amex_metric(train_y,y_pred)

**Using CatBoostClassifier to make a basic submission**

In [None]:
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Hold out a validation split, stratified on the label (default split size).
# The seed is fixed so the same rows are held out on every run; otherwise
# validation scores from different runs or input datasets are not comparable.
X_train, X_test, y_train, y_test = train_test_split(
    train_X, train_y, stratify=train_y, random_state=42
)
clf = CatBoostClassifier(iterations=1000)

# Fit with the held-out split as the evaluation set; `cat_cols` are handled
# by CatBoost as categorical features, and progress is logged every 100 iterations.
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], cat_features=cat_cols, verbose=100)

In [None]:
# Second column of predict_proba for every validation row, stored as a
# `prediction` frame that shares y_test's index so the metric can align them.
val_proba = clf.predict_proba(X_test)[:, 1]
y_pred = pd.DataFrame({"prediction": val_proba}, index=y_test.index)

In [None]:
# Competition metric for the model's predictions on the held-out split.
amex_metric(y_test,y_pred) # Metric calculation on validation set

# Making prediction on Competition Test data

In [None]:
# Drop the references to all training-side frames before the test set is
# loaded, then run a garbage-collection pass. `clf` and `cat_cols`/`num_cols`
# are kept because the prediction cells below still use them.
del train, train_X, train_y, X_train, X_test, y_train, y_test
gc.collect()

In [None]:
# Load the FP16, ID-encoded competition test rows.
# NOTE(review): read_pickle unpickles arbitrary objects — only load files from a trusted source.
test = pd.read_pickle("../input/ae-credit-id-encoded-dataset-fp16/id_encoded_fp16_test_data.pkl")

In [None]:
# Same reduction as for the training data: keep the last row of each
# customer_ID group, in the frame's current row order.
test = test.groupby('customer_ID').tail(1)
# Index by customer and order the rows by that index.
test = test.set_index('customer_ID', drop=True).sort_index()
# The S_2 column is not used as a feature.
test = test.drop(columns=['S_2'])

In [None]:
# Cast the categorical columns to strings, matching the conversion applied
# to the training features before the model was fitted.
for col in cat_cols:
    test[col] = test[col].astype(str)

In [None]:
# Score the test rows using the same column order the model was trained on,
# and keep the second predict_proba column as the submitted prediction.
test_proba = clf.predict_proba(test[[*cat_cols, *num_cols]])
test["prediction"] = test_proba[:, 1]
test.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Rebuild the customer_ID encoder by restoring its saved `classes_` array.
# NOTE(review): allow_pickle=True unpickles object arrays — only load trusted files.
loaded_encoder = LabelEncoder()
loaded_encoder.classes_ = np.load(
    "../input/ae-credit-id-encoded-dataset-fp16/id_encodings.npy",
    allow_pickle=True,
)

In [None]:
# Move customer_ID out of the index and back into a regular column.
test = test.reset_index()

In [None]:
# Map the encoded customer_ID values back through the restored encoder's
# classes, then preview the result.
test["customer_ID"] = loaded_encoder.inverse_transform(test["customer_ID"])
test.head()

In [None]:
# Write only the customer_ID and prediction columns, without the row index.
test[["customer_ID", "prediction"]].to_csv("submission_first.csv",index=False) #Creating submission file