# AMEX-Default-Prediction

This version tries a baseline approach using LightGBM.
The data currently uses only the last row for each user.

In [None]:
# -i --- input file location
# -o --- output file location
# Uncomment the line below to convert the train and test datasets.
# I advise converting each dataset separately, because converting both in one go can cause an OOM (Out of Memory) error.
# !python /kaggle/usr/lib/amex_dataset_prep/amex_dataset_prep.py -i ../input/amex-default-prediction/test_data.csv -o test.csv 

In [None]:
# A cell to clear off variables in case you continue to make predictions within the same notebook
# `%reset` is an IPython magic that clears the interactive namespace; `-f` skips the
# confirmation prompt and `-s` requests IPython's "soft" reset (see the %reset docs).
%reset -sf
# The reset above also removes previously imported names, so `gc` is imported afterwards.
import gc

# Ask the garbage collector to release memory held by the objects that were just dropped.
gc.collect()

## Importing the necessary libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


from catboost import CatBoostClassifier
import lightgbm as lgb
import xgboost as xgb

## Preprocessing the dataset

In [None]:
# Feature table for training. The file name suggests it holds only the last row
# per customer — TODO confirm against the dataset-prep script.
df = pd.read_csv("../input/d/datasets/bhavesjain/amex-default-prediction/train-last-rows.csv")
# Training labels; joined onto the feature table in the following cell.
df1 = pd.read_csv("../input/amex-default-prediction/train_labels.csv")

In [None]:
# Attach the labels to the features. No join key is given, so pandas joins on
# every column name the two frames share; only rows present in both are kept.
df = df.merge(df1, how="inner")
# Impute every missing value with its column's most frequent value
# (`mode()` can return several rows on ties; the first row is used).
df = df.fillna(df.mode().iloc[0])

In [None]:
# Hold out 10% of the rows for validation; `random_state` fixes the shuffle so the split is reproducible.
# Despite the `X_` prefix, both frames still carry the target column (it is read from them later on).
X_train, X_val= train_test_split(df,test_size=0.1, random_state=42)

In [None]:
# Name of the label column.
y_col = "target"
# Model features: every column except the first two and the last one, minus D_63 and D_64.
# NOTE(review): presumably the skipped leading columns are identifiers/dates, the last one is
# the target, and D_63/D_64 are non-numeric — confirm against the CSV header.
x_cols = list(filter(lambda name: name not in ('D_63', 'D_64'), X_val.columns[2:-1]))

In [None]:
# Min-max scaler for the feature columns; it is fitted on the training split in a later cell.
scaler = MinMaxScaler()

In [None]:
# Learn the per-feature min/max from the training split only, then scale it.
X_train[x_cols] = scaler.fit_transform(X_train[x_cols])
# Apply the *same* fitted scaling to the validation split. Without this the model is
# trained on scaled features but validated (and early-stopped / scored) on raw ones,
# so its learned split thresholds would not line up with the validation values.
X_val[x_cols] = scaler.transform(X_val[x_cols])

## Metrics and Model Configuration

In [None]:
def amex_metric(y_true, y_pred):
    """Compute the competition metric from binary labels and predicted scores.

    The result is the mean of two quantities:

    * the share of all positives found among the highest-scored rows whose
      cumulative weight stays within 4% of the total weight, and
    * a weighted Gini of the predictions divided by the weighted Gini obtained
      when the rows are ordered by the true labels instead.

    Rows with label 0 carry weight 20; all other rows carry weight 1.

    Parameters
    ----------
    y_true : array-like of labels (0 marks a negative).
    y_pred : array-like of scores, same length as ``y_true``; higher = more likely positive.

    Returns
    -------
    float
        ``0.5 * (gini_of_predictions / gini_of_labels + top_four_percent_capture)``.
    """
    # Column 0 holds the labels, column 1 the scores.
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _weighted_gini(sort_col):
        # Order rows by the chosen column, largest first.
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        target = ranked[:, 0]
        row_weight = np.where(target == 0, 20, 1)
        # Cumulative weight share = what a random ordering would achieve.
        random_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_hits = target * row_weight
        lorentz_curve = np.cumsum(weighted_hits) / np.sum(weighted_hits)
        return np.sum((lorentz_curve - random_share) * row_weight)

    # Capture rate inside the top 4% (by cumulative weight) of the score ranking.
    by_score = pairs[pairs[:, 1].argsort()[::-1]]
    score_weight = np.where(by_score[:, 0] == 0, 20, 1)
    inside_cut = np.cumsum(score_weight) <= int(0.04 * np.sum(score_weight))
    capture_rate = np.sum(by_score[inside_cut, 0]) / np.sum(by_score[:, 0])

    # Sorting by column 1 ranks by prediction; sorting by column 0 gives the ideal ranking.
    normalized_gini = _weighted_gini(1) / _weighted_gini(0)

    return 0.5 * (normalized_gini + capture_rate)


In [None]:
def lgb_amex_metric(y_pred, y_true):
    """Adapter exposing ``amex_metric`` as a LightGBM custom evaluation function.

    Parameters
    ----------
    y_pred : predicted scores for the evaluated dataset.
    y_true : object providing ``get_label()`` (the dataset being evaluated),
        from which the ground-truth labels are read.

    Returns
    -------
    tuple
        ``('amex_metric', score, True)`` — metric name, metric value, and a flag
        marking the metric as "higher is better".
    """
    score = amex_metric(y_true.get_label(), y_pred)
    return ('amex_metric', score, True)

# LightGBM hyper-parameters for the binary default-prediction baseline.
params = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': 42,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40
        }

# Wrap the splits as LightGBM datasets (features, label).
lgb_train = lgb.Dataset(X_train[x_cols], X_train[y_col])#, categorical_feature = cat_features)
lgb_valid = lgb.Dataset(X_val[x_cols], X_val[y_col])#, categorical_feature = cat_features)

# Early stopping and periodic logging are passed as callbacks: the older
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments are not accepted by
# `lgb.train` in current LightGBM releases, while the callbacks work on old and new ones.
# NOTE(review): LightGBM reports that early stopping is not available in 'dart' mode, so
# with the params above training likely runs all 2500 rounds — confirm in the training log.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=2500,
    valid_sets=[lgb_valid],
    feval=lgb_amex_metric,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        lgb.log_evaluation(period=500),           # print validation metrics every 500 rounds
    ],
    )

## Inferring on test set or validation set

In [None]:
# Score the held-out validation rows with the trained booster.
# NOTE(review): the features passed here must have gone through the same preprocessing
# as the features the model was trained on — confirm.
pred = model.predict(X_val[x_cols])
# Ground-truth labels for the same rows, used to compute the validation metric.
y = X_val[y_col]

## Current CV

In [None]:
# Competition metric on the 10% hold-out; as the last expression it is shown as the cell output.
amex_metric(y,pred)

## Save the model for future use

In [None]:
# Persist the trained booster so a separate notebook can load it for inference.
# NOTE(review): `save_model` is believed to write LightGBM's own text model format, so the
# `.json` extension may be misleading — confirm, and keep the name in sync with the loader.
model.save_model("lgb_model.json")

## Prediction Notebook

I've prepared a separate inference notebook at: https://www.kaggle.com/bhavesjain/amex-default-pred-infer