In [None]:
!pip install deepctr_torch

In [None]:
import os
import numpy as np
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import cudf
import cupy
import gc


import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
import math
# Apply the 'ggplot' matplotlib style to figures drawn after this point.
plt.style.use('ggplot')
import warnings as w
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn.
w.filterwarnings(action='ignore')

In [None]:
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [None]:
# Columns treated as categorical (sparse) features in the rest of the notebook.
categorical_feature = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_68', 'D_64', 'D_66']

# Keep only the last row of each customer_ID and index the frame by customer.
# NOTE(review): presumably rows are in chronological order per customer, so
# the last row is the latest statement -- confirm against the data source.
train = (
    pd.read_feather('../input/amexfeather/train_data.ftr')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)
print(f"The training data begins on {train['S_2'].min().strftime('%m-%d-%Y')} and ends on {train['S_2'].max().strftime('%m-%d-%Y')}.")
print(f"There are {train.shape[0]:,.0f} customers in the training set and {train.shape[1]} features.")

# Same reduction for the test set.
test = (
    pd.read_feather('../input/amexfeather/test_data.ftr')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)
print(f"\nThe test data begins on {test['S_2'].min().strftime('%m-%d-%Y')} and ends on {test['S_2'].max().strftime('%m-%d-%Y')}.")
print(f"There are {test.shape[0]:,.0f} customers in the test set and {test.shape[1]} features.")

# The date column was only needed for the summary above; drop it from both frames.
del train['S_2']
del test['S_2']
gc.collect()

In [None]:
# Print the column dtypes and memory usage of the reduced training frame.
train.info()

In [None]:
# Force a garbage-collection pass; the cell output is the number of
# unreachable objects found.
gc.collect()

### Feature explanation
 1. D_* = Delinquency variables (late or missed payments)
 2. S_* = Spend variables
 3. P_* = Payment variables
 4. B_* = Balance variables
 5. R_* = Risk variables
 
### Categorical Variable
   * 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'

# EDA is skipped in this notebook
#### Link: https://www.kaggle.com/code/leejunseok97/amex-default-eda-prediction

### Missing-value imputation and label encoding

In [None]:
# Replace missing values in every float16 non-categorical column with the
# sentinel -99.0, in both train and test, then report remaining NaN counts.
numeric_feature = [cols for cols in train.columns if cols not in categorical_feature]
for feature in numeric_feature:
    # Check the dtype of the column itself. Indexing with `[0]` on a Series
    # indexed by customer_ID relies on a deprecated positional fallback.
    if train[feature].dtype == np.float16:
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection, which may operate on a temporary copy.
        train[feature] = train[feature].fillna(-99.0)
        # Columns that exist only in train must not be looked up in test.
        if feature in test.columns:
            test[feature] = test[feature].fillna(-99.0)
train.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns of train and test.
# The encoder is fitted ONCE per feature on the combined train+test values so
# that a given category receives the same integer code in both frames.
# Fitting separately on each frame can assign different codes to the same
# category whenever the two frames do not contain identical category sets.
encoder = LabelEncoder()
categorical_feature = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_68', 'D_64', 'D_66']
columns = train.columns.values
n_train = len(train)
for feature in categorical_feature:
    if feature in columns:
        combined = pd.concat([train[feature], test[feature]], ignore_index=True)
        codes = encoder.fit_transform(combined)
        # The first n_train codes belong to train, the remainder to test
        # (positional split, matching the concat order above).
        train[feature] = codes[:n_train]
        test[feature] = codes[n_train:]

In [None]:
# Split the columns into sparse (categorical) and dense (everything else)
# feature lists; the label column is not a model input.
target = ['target']
sparse_feature = categorical_feature
test_sparse_feature = categorical_feature

dense_feature = [name for name in train.columns if name not in categorical_feature]
dense_feature.remove('target')
test_dense_feature = [name for name in test.columns if name not in categorical_feature]

# Print each list followed by a 58-character separator line.
for label, names in (('Dense Feature:', dense_feature), ('sparse_feature:', sparse_feature)):
    print(label, names)
    print('-' * 58)
print('Target:', target)

In [None]:
# Sanity check: list any column that appears in BOTH feature lists (expected
# to be empty). `dense_feature in sparse_feature` only tested whether the
# whole list object is an element of the other list, which is always False.
sorted(set(dense_feature) & set(sparse_feature))

In [None]:
# Build the DeepCTR feature-column descriptors.
# Each sparse feature's vocabulary size must be strictly greater than the
# largest integer code the model will ever see, in train or test. Using
# train[feat].nunique() under-sizes the embedding table whenever a code
# >= nunique occurs, causing an out-of-range embedding lookup.
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=int(max(train[feat].max(), test[feat].max())) + 1)
    for feat in sparse_feature
] + [DenseFeat(feat, 1) for feat in dense_feature]
fixlen_feature_columns

In [None]:
# The linear part and the DNN part of the model share the same feature columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

In [None]:
# Names of the input features declared by the linear and DNN feature columns.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [None]:
# Map each feature name to its column, once for train and once for test.
train_model_input = {column: train[column] for column in feature_names}
test_model_input = {column: test[column] for column in feature_names}

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Instantiate an xDeepFM binary classifier on the selected device.
model = xDeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    device=device,
)
model

In [None]:
# Configure training: AdamW optimizer, binary cross-entropy loss, and
# log-loss + AUC as reported metrics.
model.compile(
    optimizer=torch.optim.AdamW(model.parameters(), lr=3e-4),
    loss='binary_crossentropy',
    metrics=['binary_crossentropy', 'auc'],
)

# Train for 50 epochs, passing validation_split=0.2 to fit.
history = model.fit(
    x=train_model_input,
    y=train[target].values,
    batch_size=1024,
    epochs=50,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Score the test set in batches of 1024 rows.
pred = model.predict(test_model_input, batch_size=1024)

In [None]:
# Load the competition's sample submission file to use as the output template.
submission = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')
submission

In [None]:
# Write the model scores into the template's 'prediction' column.
# NOTE(review): the assignment is positional -- it assumes the rows of `pred`
# (i.e. the row order of `test`) match the row order of the sample
# submission. TODO confirm the customer ordering is identical.
submission['prediction'] = pred
submission.head()

In [None]:
# Save the submission file without the DataFrame index column.
submission.to_csv('submission.csv',index=False)