In [None]:
import os
import numpy as np
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import gc

from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
import math
plt.style.use('ggplot')
import warnings as w
w.filterwarnings(action='ignore')

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns',None)

In [None]:
# Load the training statements and keep only the last row of each customer,
# indexed by customer_ID.
train = (
    pd.read_feather('../input/amexfeather/train_data.ftr')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)
train_first, train_last = train['S_2'].min(), train['S_2'].max()
print("The training data begins on {} and ends on {}.".format(train_first.strftime('%m-%d-%Y'),train_last.strftime('%m-%d-%Y')))
n_train_rows, n_train_cols = train.shape
print("There are {:,.0f} customers in the training set and {} features.".format(n_train_rows,n_train_cols))

# Same reduction for the test statements.
test = (
    pd.read_feather('../input/amexfeather/test_data.ftr')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)
test_first, test_last = test['S_2'].min(), test['S_2'].max()
print("\nThe test data begins on {} and ends on {}.".format(test_first.strftime('%m-%d-%Y'),test_last.strftime('%m-%d-%Y')))
n_test_rows, n_test_cols = test.shape
print("There are {:,.0f} customers in the test set and {} features.".format(n_test_rows,n_test_cols))

# The statement date is only needed for the report above on the test side.
del test['S_2']
gc.collect()

#### The training set covers 2018-03-01 ~ 2018-03-31, but the test set covers 2019-04-01 ~ 2019-10-31 (dates of each customer's last statement).
#### Because the two periods do not overlap, it is difficult to predict credit default perfectly for the test customers.

### Feature Explain
 1. D_* = Delinquency Variable (late or missed payments, not criminal activity)
 2. S_* = Spend Variable
 3. P_* = Payment Variable
 4. B_* = Balance Variable
 5. R_* = Risk variable
 
### Categorical Variable
   * 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'
   
#### All features are anonymized and normalized to protect personal information.
#### So the original meaning of each feature is unknown; only its broad category (Spend, Payment, Balance, etc.) is known.

# EDA

## Describe

In [None]:
# Summary statistics for the numeric columns of the per-customer training frame.
train.describe()

## Check Null Ratio

In [None]:
# Percentage of missing values per column, highest first.
# The ratio is kept at two decimals: casting it to int (as was done before)
# truncated the value, so e.g. 0.9% of missing rows was reported as 0.
feature_null_ratio = (
    train.isna()
    .mean()
    .mul(100)
    .round(2)
    .sort_values(ascending=False)
    .to_frame('Null Ratio(%)')
)
feature_null_ratio.head(20)

#### Among the top 20 features by missing-value ratio, the D (Delinquency) features stand out with the highest null ratios.

## Target count plot

In [None]:
# Number of distinct customers per statement date (S_2).
ex = (
    train.reset_index()
    .groupby('S_2')['customer_ID']
    .nunique()
    .reset_index()
)

# Single line trace; the empty hovertemplate keeps plotly's default hover text.
statement_trace = go.Scatter(x=ex['S_2'], y=ex['customer_ID'], hovertemplate='', mode='lines')
layout_options = dict(
    title='Frequncy of customer statements',
    xaxis_title='Date',
    yaxis_title='satements update',
    hovermode='x unified',
)

fig = go.Figure()
fig.add_trace(statement_trace)
fig.update_layout(**layout_options)
fig.show()

### March 3, 10, 17 and 24 are all Saturdays, so we can see that customer statements increase sharply on Saturdays.

In [None]:
# Drop the temporary per-date frame and ask the garbage collector to reclaim it.
del ex
gc.collect()

## Target ValueCounts

In [None]:
# Share of each target class, drawn as a bar chart and printed as numbers.
target_share = train['target'].value_counts(normalize=True)
target_share.plot(kind='bar', figsize=(10, 8), legend=True)
print(target_share)

In [None]:
# Same class shares as a pie chart, followed by the numeric values.
target_share = train['target'].value_counts(normalize=True)
target_share.plot(kind='pie', figsize=(10, 8), legend=True)
print(target_share)

### Target feature : Default customer is more than Normal customer

In [None]:
# Free memory held by objects that are no longer referenced.
gc.collect()

### Categorical Columns preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

categorical_feature = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_68', 'D_64', 'D_66']
columns = train.columns.values
for feature in categorical_feature:
    if feature not in columns:
        continue
    # Cast to str so that missing values become one ordinary 'nan' class and
    # the values of both frames can be sorted together by the encoder.
    train_values = train[feature].astype(str)
    test_values = test[feature].astype(str)
    # Fit ONE encoder on the union of train and test values. Fitting a separate
    # encoder per frame can give the same category different integer codes in
    # train and test, which silently corrupts the test predictions.
    encoder = LabelEncoder().fit(pd.concat([train_values, test_values], ignore_index=True))
    train[feature] = encoder.transform(train_values)
    test[feature] = encoder.transform(test_values)


In [None]:
# Free memory held by objects that are no longer referenced.
gc.collect()

## Distribution

#### Delinquency

In [None]:
# Continuous delinquency features (D_*); 'target' is added only as the hue column.
n_grid_cols = 5
feature_cols = [col for col in train.columns
                if col.startswith('D') and col not in categorical_feature]
cols = feature_cols + ['target']
ex = train[cols]

# Size the grid from the number of plotted features (not counting 'target').
# squeeze=False keeps `ax` two-dimensional even when a single row is enough.
total_row = math.ceil(len(feature_cols) / n_grid_cols)
fig,ax = plt.subplots(total_row,n_grid_cols,figsize=(16,54),squeeze=False)
fig.suptitle('Distribution of Delinquency Variable',fontsize=16)
for i,feature in enumerate(feature_cols):
    row, col = divmod(i, n_grid_cols)
    # The `label` kwarg is kept as-is: the figure-level legend below is built
    # from the handles that ax[0,0] reports.
    sns.kdeplot(x=feature,hue='target',data=ex,label=['Normal','Overdue'],fill=True,ax=ax[row,col])
    ax[row,col].tick_params(left=False,bottom=False)
    ax[row,col].set(title='\n\n{}'.format(feature),ylabel=('Density' if col==0 else ''))

# Hide exactly the grid cells that received no feature, however many there are.
for unused_ax in ax.flat[len(feature_cols):]:
    unused_ax.set_visible(False)
handles, _ = ax[0,0].get_legend_handles_labels()
fig.legend(labels=['Default','Paid'], handles=reversed(handles), ncol=2, bbox_to_anchor=(0.18, 0.983))
sns.despine(bottom=True, trim=True)
plt.tight_layout(rect=[0, 0.2, 1, 0.99])


#### For most D features the distributions of default and normal customers are nearly the same, but for D_127 and D_123 the default density is higher than the normal density.
#### So I think D_127 and D_123 are more helpful for predicting credit default than the other features.

In [None]:
# Drop the plotting subset and ask the garbage collector to reclaim it.
del ex
gc.collect()

### Spend

In [None]:
# Continuous spend features (S_*), excluding the statement date S_2;
# 'target' is added only as the hue column.
n_grid_cols = 5
feature_cols = [col for col in train.columns
                if col.startswith('S') and col not in categorical_feature and col != 'S_2']
cols = feature_cols + ['target']
ex = train[cols]

# Size the grid from the number of plotted features (not counting 'target').
# squeeze=False keeps `ax` two-dimensional even when a single row is enough.
total_row = math.ceil(len(feature_cols) / n_grid_cols)
fig,ax = plt.subplots(total_row,n_grid_cols,figsize=(16,20),squeeze=False)
fig.suptitle('Distribution of Spend Variable',fontsize=16)
for i,feature in enumerate(feature_cols):
    row, col = divmod(i, n_grid_cols)
    # The `label` kwarg is kept as-is: the figure-level legend below is built
    # from the handles that ax[0,0] reports.
    sns.kdeplot(x=feature,hue='target',data=ex,label=['Normal','Overdue'],fill=True,ax=ax[row,col])
    ax[row,col].tick_params(left=False,bottom=False)
    ax[row,col].set(title='\n\n{}'.format(feature),ylabel=('Density' if col==0 else ''))

# Hide exactly the grid cells that received no feature, however many there are.
for unused_ax in ax.flat[len(feature_cols):]:
    unused_ax.set_visible(False)
handles, _ = ax[0,0].get_legend_handles_labels()
fig.legend(labels=['Default','Normal'], handles=reversed(handles), ncol=2, bbox_to_anchor=(0.18, 0.983))
sns.despine(bottom=True, trim=True)
plt.tight_layout(rect=[0, 0.2, 1, 0.99])


#### Among the Spend features, S_16, S_26 and S_24 have a higher default density than normal density.

In [None]:
# Drop the plotting subset and ask the garbage collector to reclaim it.
del ex
gc.collect()

### Risk

In [None]:
# Continuous risk features (R_*); 'target' is added only as the hue column.
n_grid_cols = 5
feature_cols = [col for col in train.columns
                if col.startswith('R') and col not in categorical_feature]
cols = feature_cols + ['target']
ex = train[cols]

# Size the grid from the number of plotted features (not counting 'target').
# squeeze=False keeps `ax` two-dimensional even when a single row is enough.
total_row = math.ceil(len(feature_cols) / n_grid_cols)
fig,ax = plt.subplots(total_row,n_grid_cols,figsize=(16,24),squeeze=False)
fig.suptitle('Distribution of Risk Variable',fontsize=16)
# Iterate over the features only: looping over every column of `ex` would also
# draw 'target' against itself and shift which grid cells are left empty.
for i, feature in enumerate(feature_cols):
    row, col = divmod(i, n_grid_cols)
    sns.kdeplot(x=feature,hue='target',label=['Normal','Overdue'],fill=True,legend=False,
                ax=ax[row,col],data=ex)
    ax[row,col].tick_params(left=False,bottom=False)
    ax[row,col].set(title='\n\n{}'.format(feature),ylabel=('Density' if col==0 else ''))

# Hide exactly the grid cells that received no feature; a fixed range here can
# hide real feature plots when the last row holds more than one of them.
for unused_ax in ax.flat[len(feature_cols):]:
    unused_ax.set_visible(False)
handles, _ = ax[0,0].get_legend_handles_labels()
fig.legend(labels=['Default','Paid'], handles=reversed(handles), ncol=2, bbox_to_anchor=(0.18, 0.984))
sns.despine(bottom=True, trim=True)
plt.tight_layout(rect=[0, 0.2, 1, 0.99])

#### Among the Risk features, R_20 has a very high density.

In [None]:
# Drop the plotting subset and ask the garbage collector to reclaim it.
del ex
gc.collect()

## Target Correlation

In [None]:
# Remove the statement date before modelling. errors='ignore' keeps the cell
# idempotent: re-running it after S_2 is already gone no longer raises KeyError.
train = train.drop(columns='S_2', errors='ignore')

In [None]:
# Replace missing values in the float16 feature columns with the sentinel -99.0.
numeric_feature = [name for name in train.columns if name not in categorical_feature]
for feature in numeric_feature:
    # Check the column dtype directly: `train[feature][0]` relied on positional
    # fallback on a string (customer_ID) index, which newer pandas deprecates.
    if train[feature].dtype != np.float16:
        continue
    # Assign the filled column back; a chained `fillna(inplace=True)` may act
    # on a temporary copy and leave the frame unchanged.
    train[feature] = train[feature].fillna(-99.0)
    if feature in test.columns:
        test[feature] = test[feature].fillna(-99.0)
# Remaining number of missing cells in the training frame.
train.isna().sum().sum()

### Let's check top 20 positive & negative high correlation

In [None]:
# Alias only: the correlation step never mutates the frame, so no copy is needed.
corr_data = train
cmap = plt.cm.PuBu
# Compute the full correlation matrix ONCE and reuse its 'target' column for
# both rankings (it was previously recomputed for each ranking).
target_corr = corr_data.corr()['target']
cols_positive = target_corr.nlargest(20).index
cols_negative = target_corr.nsmallest(20).index
cols = cols_positive.append(cols_negative)
# Pairwise correlations between the 40 selected columns for the heatmap.
cm = np.corrcoef(corr_data[cols].values.T)
fig,ax = plt.subplots(figsize=(25,20))
sns.heatmap(cm,vmax=1,vmin=-1,square=True,annot=True,cmap=cmap,xticklabels=cols.values,yticklabels=cols.values)

#### Correlation stronger than 0.5 (negative): B_18
#### Correlation stronger than 0.5 (positive): B_9, B_23, D_75, D_58, B_7
#### Generally, a feature whose absolute correlation with the target exceeds 0.5 is an important (good) feature.

In [None]:
# Drop the correlation intermediates and ask the garbage collector to reclaim them.
del cm,corr_data,cols,cols_positive,cols_negative
gc.collect()

# Model & train & valid split

#### I tried two training approaches: SMOTE (oversampling) and the normal (original) data.
#### I applied SMOTE because the target feature is very imbalanced.

In [None]:
# Preview the first rows of the training frame before it is split.
train.head()

### Split the train & validation data before applying oversampling (SMOTE),
### because I don't want the validation data to be affected by SMOTE.

In [None]:
# 80/20 split sizes: the first 80% of rows train, the remainder validates.
n_rows = len(train)
train_idx = int(n_rows * 0.8)
valid_idx = n_rows - train_idx
print(train_idx, valid_idx)

In [None]:
# Positional split: leading rows for training, trailing rows for validation.
train_set = train.iloc[:train_idx]
valid_set = train.iloc[len(train) - valid_idx:]
print(train_set.shape, valid_set.shape)

In [None]:
# The full frame is no longer needed once the two slices exist.
del train
gc.collect()

In [None]:
# pop() removes 'target' from train_set in place and returns it as the label Series;
# x_train is the same object as train_set and now holds only the feature columns.
y_train = train_set.pop('target')
x_train = train_set
print(x_train.shape,y_train.shape)

In [None]:
# Same label/feature separation for the validation slice (in-place pop, then alias).
y_val = valid_set.pop('target')
x_val = valid_set

In [None]:
# Remove the now-redundant names; the data stays reachable through x_train / x_val.
del train_set,valid_set
gc.collect()

In [None]:
from imblearn.over_sampling import SMOTE

# Seed the sampler so the synthetic minority rows, and every score derived
# from them, are identical on each Restart & Run All.
smote = SMOTE(random_state=21)

# Oversample the training split only; the validation split is left untouched.
x_over_train, y_over_train = smote.fit_resample(x_train.values,y_train.values)
print(x_over_train.shape, y_over_train.shape)

In [None]:
# Class shares after oversampling, as a bar chart and as numbers.
resampled_share = pd.DataFrame(y_over_train).value_counts(normalize=True)
resampled_share.plot(kind='bar')
print(resampled_share)

In [None]:
from lightgbm import LGBMClassifier,Dataset,early_stopping,log_evaluation
from lightgbm import plot_importance,plot_metric
from sklearn.metrics import accuracy_score,roc_auc_score,r2_score

In [None]:
def metrics(y_true: pd.DataFrame, pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a 'target' column holding the 0/1 labels.
    pred : pd.DataFrame
        Frame with a 'prediction' column holding the scores, aligned with
        ``y_true`` on the index (the two frames are concatenated column-wise).

    Returns
    -------
    float
        ``0.5 * (G + D)`` where G is the normalized weighted Gini and D is the
        share of positives captured within the top 4% of total weight.

    Notes
    -----
    Rows with ``target == 0`` get weight 20, all other rows weight 1.
    """

    def _weighted_frame(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Labels and scores side by side, best score first, with row weights.
        df = pd.concat([labels, scores], axis='columns').sort_values('prediction', ascending=False)
        df['weight'] = df['target'].apply(lambda x: 20 if x == 0 else 1)
        return df

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Share of all positives found among the rows whose cumulative weight
        # stays within 4% of the total weight.
        df = _weighted_frame(y_true, y_pred)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Weighted area between the Lorentz curve of the ranking and the
        # random-ordering diagonal.
        df = _weighted_frame(y_true, y_pred)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Normalize by the Gini of a perfect ranking, i.e. the labels used as
        # scores. Dividing the model's Gini by itself always yields 1.0.
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    G = normalized_weighted_gini(y_true, pred)
    D = top_four_percent_captured(y_true, pred)

    return 0.5 * (G + D)

    

## Grid Search

In [None]:
# n_estimators = [1000,2000]
# max_depth = [1,100,200,300]
# learning_rate = [0.03,0.05,0.08]
# reg_alpha = [0.001,0.01,0.1]
# reg_lambda = [0.001,0.01,0.1]
# subsample = [0.88]


In [None]:
# params =  {
#     'n_estimators' : n_estimators,
#     'max_depth': max_depth,
#     'learning_rate' : learning_rate,
#     'reg_alpha' : reg_alpha,
#     'reg_lambda' : reg_lambda,
#     'subsample' : subsample,
#     'n_jobs' : [-1]
# }

In [None]:
# %%time
# gsc = GridSearchCV(LGBMClassifier(device='gpu',objective='binary',boosting_type='gbdt')
#                    ,param_grid=params,verbose=10,return_train_score=True,
#                    scoring='roc_auc',cv=3,n_jobs=-1)
# gsc.fit(x_over_train,y_over_train)

In [None]:
# print(gsc.best_params_)
# params = gsc.best_params_

In [None]:
def scoreing(fold,y_true,y_pred):
    """Print accuracy, R2, AUC and the competition score for one fold.

    Parameters
    ----------
    fold : int
        Fold number, used only in the printed line.
    y_true : array-like
        True 0/1 labels.
    y_pred : array-like
        Predicted probabilities: either a 1-D array of positive-class
        probabilities or the 2-D output of ``predict_proba`` (the last column
        is then taken as the positive class).

    Returns
    -------
    float
        The value returned by ``metrics`` for these labels and probabilities.
    """
    proba = np.asarray(y_pred, dtype=float)
    if proba.ndim == 2:
        # predict_proba returns one column per class; keep the positive class.
        proba = proba[:, -1]
    # Accuracy needs hard labels, not probabilities; 0.5 is the decision threshold.
    labels = (proba >= 0.5).astype(int)
    acc = accuracy_score(y_true,labels)
    auc_score = roc_auc_score(y_true,proba)
    R2 = r2_score(y_true,proba)
    # metrics() joins the two frames on the index, so both get a fresh 0..n-1 index.
    target_frame = pd.DataFrame(data={'target':pd.Series(y_true).reset_index(drop=True)})
    prediction_frame = pd.DataFrame(data={'prediction':proba})
    gini_score = metrics(target_frame,prediction_frame)
    print('Fold{}\tAccuracy:{:.3f}\tR2:{:.3f}\tAUC:{:.3f}\tGini:{:.3f}'.format(fold,acc,R2,auc_score,gini_score))
    return gini_score
    

## Kfold training

In [None]:
 # LightGBM hyper-parameters passed to LGBMClassifier(**params) for every fold.
 params = dict(
     boosting_type='gbdt',
     n_estimators=5000,
     num_leaves=50,
     learning_rate=0.05,
     colsample_bytree=0.9,
     min_child_samples=2000,
     reg_alpha=2,
     objective='binary',
     random_state=21,
     device='gpu',
     n_jobs=-1,
     subsample=0.88,
     max_depth=100,
 )

### SMOTE dataset(train)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Seeded so the fold assignment (and therefore the chosen model) is reproducible.
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=21)
lgb_over_models = []
best_over_fold = None
best_over_gini_score = None
# NOTE(review): the early-stopping folds are drawn from the SMOTE-resampled data,
# so they contain synthetic rows; only x_val / y_val are untouched by SMOTE.
for fold,(train_idx,valid_idx) in enumerate(skf.split(x_over_train,y_over_train),start=1):
    print('-'*58)
    print(f'Fold:{fold}')
    train_x, valid_x = x_over_train[train_idx], x_over_train[valid_idx]
    train_y, valid_y = y_over_train[train_idx], y_over_train[valid_idx]
    model = LGBMClassifier(**params)
    # Periodic logging goes through the log_evaluation callback; the `verbose`
    # argument of fit() is not accepted by current LightGBM releases.
    model.fit(train_x,train_y,eval_set=[(valid_x,valid_y)],
              eval_metric=['binary_logloss','auc'],
              callbacks=[early_stopping(200),log_evaluation(200)])
    # Score on the held-out validation split with the positive-class probability
    # only; .values matches the plain arrays the model was fitted on.
    pred = model.predict_proba(x_val.values)[:, 1]
    lgb_over_models.append(model)
    gini_score = scoreing(fold,y_val,pred)
    # Keep the 1-based number of the best fold (later cells index with fold-1).
    if best_over_fold is None or gini_score > best_over_gini_score:
        best_over_fold = fold
        best_over_gini_score = gini_score
    plot_metric(model)
print(f'Best Fold:{best_over_fold}\tBest Gini score:{best_over_gini_score}')
print(f'So We using {best_over_fold} model')

In [None]:
# The oversampled arrays are not used after the SMOTE training run.
del x_over_train,y_over_train
gc.collect()

### Normal dataset

In [None]:
from sklearn.model_selection import StratifiedKFold

# Seeded so the fold assignment (and therefore the chosen model) is reproducible.
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=21)
lgb_models = []
best_fold = None
best_gini_score = None
# Plain arrays: the fold indices are positions, while y_train is indexed by
# customer_ID, so label-based `y_train[train_idx]` is not a safe lookup.
x_train_values, y_train_values = x_train.values, y_train.values
for fold,(train_idx,valid_idx) in enumerate(skf.split(x_train_values,y_train_values)):
    print('-'*58)
    print(f'Fold:{fold+1}')
    train_x, valid_x = x_train_values[train_idx], x_train_values[valid_idx]
    train_y, valid_y = y_train_values[train_idx], y_train_values[valid_idx]
    model = LGBMClassifier(**params)
    # Periodic logging goes through the log_evaluation callback; the `verbose`
    # argument of fit() is not accepted by current LightGBM releases.
    model.fit(train_x,train_y,eval_set=[(valid_x,valid_y)],
              eval_metric=['binary_logloss','auc'],
              callbacks=[early_stopping(200),log_evaluation(200)])
    # Score on the held-out validation split with the positive-class probability only.
    pred = model.predict_proba(x_val.values)[:, 1]
    lgb_models.append(model)
    # Report the same 1-based fold number as the header printed above.
    gini_score = scoreing(fold+1,y_val,pred)
    # best_fold stays 0-based because a later cell uses it to index lgb_models.
    if best_fold is None or gini_score > best_gini_score:
        best_fold = fold
        best_gini_score = gini_score
    plot_metric(model)
print(f'Best Fold:{best_fold+1}\tBest Gini score:{best_gini_score}')
print(f'So We using {best_fold+1} model')

## Feature Importance

In [None]:
def feature_importance(model,train):
    """Draw a horizontal bar chart of the 30 largest feature importances.

    ``model`` must expose ``feature_importances_`` and ``train`` must expose
    ``columns`` with one name per importance value. Returns the Axes produced
    by ``sns.barplot``.
    """
    plt.figure(figsize=(15,10))
    importances = pd.Series(model.feature_importances_, index=train.columns)
    # Largest first, keep the leading 30 entries.
    top_features = importances.sort_values(ascending=False).iloc[:30]
    return sns.barplot(x=top_features, y=top_features.index)

In [None]:
# Display the number of the best fold from the SMOTE run (counted from 1).
best_over_fold

In [None]:
# Oversampling result
best_over_model = lgb_over_models[best_over_fold-1]
feature_importance(best_over_model,x_train)

In [None]:
# Result for the models trained on the original (non-oversampled) data.
# best_fold comes from enumerate() and is 0-based, so it indexes the list directly.
best_model = lgb_models[best_fold]
feature_importance(best_model,x_train)

In [None]:
# Test predictions of the SMOTE-trained model. This must use best_over_model
# (not best_model), and only the positive-class column: predict_proba returns
# one column per class, which cannot be stored in one submission column.
pred_over = best_over_model.predict_proba(test.values)[:, 1]

In [None]:
# Test predictions of the model trained on the original data. Keep only the
# positive-class column: predict_proba returns one column per class, which
# cannot be stored in one submission column.
pred = best_model.predict_proba(test.values)[:, 1]

In [None]:
# The test frame is no longer needed once both prediction arrays exist.
del test 
gc.collect()

In [None]:
# Load the sample submission file; its 'prediction' column is overwritten below.
submission = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')
submission

In [None]:
# Store the predictions held in `pred` (computed from best_model).
# The assignment is positional — presumably the sample file lists customers in
# the same order as the test frame; TODO confirm.
submission['prediction'] = pred
submission.head()

In [None]:
# NOTE(review): a later cell writes to the same 'submission.csv' path, so this
# file is overwritten — confirm which submission is meant to be kept.
submission.to_csv('submission.csv',index=False)

In [None]:
# Replace the 'prediction' column with the values held in `pred_over`.
submission['prediction'] = pred_over
submission.head()

In [None]:
# NOTE(review): this writes to the same 'submission.csv' path as the earlier
# export and therefore replaces it — confirm that this is the intended final file.
submission.to_csv('submission.csv',index=False)

In [None]:
# Free memory held by objects that are no longer referenced.
gc.collect()