In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, train_test_split
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

### Dataset Description

General Categories:

* D_* : Delinquency variables
* S_* : Spend variables
* P_* : Payment variables
* B_* : Balance variables
* R_* : Risk variables

Categorical Variables: 
    B_30, B_38, D_63, D_64, D_66, D_68, D_114, D_116, D_117, D_120, D_126


In [24]:
# Load the Feather copies of the training and testing data from ./dataset.
train, test = [
    pd.read_feather(feather_path)
    for feather_path in ("./dataset/train_data.ftr", "./dataset/test_data.ftr")
]

In [25]:
# Show a truncated preview (first/last rows only) of each frame.
with pd.option_context("display.min_rows", 4):
    for frame in (train, test):
        display(frame)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938477,0.001734,0.008728,1.006836,0.009224,0.124023,0.008774,0.004707,...,,,0.002426,0.003706,0.003819,,0.000569,0.000610,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936523,0.005775,0.004925,1.000977,0.006153,0.126709,0.000798,0.002714,...,,,0.003956,0.003166,0.005032,,0.009575,0.005493,0.009216,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5531449,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-02-06,0.969727,0.442627,0.009857,1.003906,0.005116,0.101807,0.009933,0.008575,...,,,0.005543,0.006565,0.009880,,0.008125,0.001168,0.003983,0
5531450,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-03-14,0.981934,0.002474,0.000077,0.992676,0.000809,0.119141,0.003286,0.014091,...,,,0.007317,0.002888,0.006207,,0.005112,0.003183,0.001914,0


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.631348,0.001912,0.010727,0.814453,0.007545,0.168701,0.009972,0.002348,...,,,,,0.004669,,,,0.008278,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.586914,0.005276,0.011024,0.811035,0.001817,0.241333,0.000166,0.009132,...,,,,0.000142,0.004940,0.009018,,0.003695,0.003754,0.001460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11363760,fffffa7cf7e453e1acc6a1426475d5cb9400859f82ff61...,2019-03-19,0.460449,0.002663,0.009529,0.816406,0.009026,,0.008530,0.006851,...,,,,0.007195,0.008453,0.004776,,0.006199,0.005604,0.005970
11363761,fffffa7cf7e453e1acc6a1426475d5cb9400859f82ff61...,2019-04-18,0.454590,0.004402,0.000346,0.810059,0.008736,,0.000937,0.003494,...,,,,0.003399,0.009590,0.006187,,0.007675,0.006443,0.003143


In [26]:
# Reduce each frame to one row per customer -- the last row of every
# customer_ID group, in the frame's existing row order -- and index by customer_ID.
def _last_row_per_customer(frame):
    """Return the final row of each customer_ID group, indexed by customer_ID."""
    per_customer = frame.groupby("customer_ID")
    return per_customer.tail(1).set_index("customer_ID")


train = _last_row_per_customer(train)
test = _last_row_per_customer(test)

In [27]:
# Report the dimensions of both frames.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print(f"There are {train_rows} rows and {train_cols} columns in the training data")
print(f"There are {test_rows} rows and {test_cols} columns in the testing data\n")

# Report the earliest and latest S_2 date found in each frame.
date_format = '%m-%d-%Y'
train_first = train.S_2.min().strftime(date_format)
train_last = train.S_2.max().strftime(date_format)
print(f"Training data period began {train_first} and ended {train_last}")
test_first = test.S_2.min().strftime(date_format)
test_last = test.S_2.max().strftime(date_format)
print(f"Testing data period began {test_first} and ended {test_last}")

# Flag whether any S_2 value is missing in either frame.
train_has_missing_dates = train.S_2.isna().any()
print(
    "\nThere are empty values in the training set during period S_2 "
    if train_has_missing_dates
    else "\nThere are no empty values in the training set during period S_2"
)

test_has_missing_dates = test.S_2.isna().any()
print(
    "There are empty values in the test set during period S_2 "
    if test_has_missing_dates
    else "There are no empty values in the test set during period S_2"
)

There are 458913 rows and 190 columns in the training data
There are 924621 rows and 189 columns in the testing data

Training data period began 03-01-2018 and ended 03-31-2018
Testing data period began 04-01-2019 and ended 10-31-2019

There are no empty values in the training set during period S_2
There are no empty values in the test set during period S_2


In [28]:
# Group the training columns by their category prefix (D_, S_, P_, B_, R_).
def _columns_with_prefix(prefix):
    """Return the training-frame column names that begin with `prefix`."""
    return list(filter(lambda name: name.startswith(prefix), train.columns))


deliquencyTotal = _columns_with_prefix('D_')
spendTotal = _columns_with_prefix('S_')
paymentTotal = _columns_with_prefix('P_')
balanceTotal = _columns_with_prefix('B_')
riskTotal = _columns_with_prefix('R_')

In [46]:
# One label per feature category, paired with the number of columns in it.
labels = ['Deliquency', 'Spend', 'Payment', 'Balance', 'Risk']
values = list(map(len, (deliquencyTotal, spendTotal, paymentTotal, balanceTotal, riskTotal)))

# Donut chart showing how the features break down across categories.
category_pie = go.Pie(
    values=values,
    labels=labels,
    hole=0.5,
    hoverinfo='label+percent',
    showlegend=False,
    textinfo='label',
    title='Category Distribution',
)
fig1 = go.Figure(data=[category_pie])

fig1.show()


In [61]:
# Per-column missing-value summary: absolute count and fraction of rows.
# Note that 'Percent' is a fraction in [0, 1], not a value out of 100.
null_mask = train.isnull()
nullTotal = null_mask.sum()
nullPercent = nullTotal / null_mask.count()
pd.concat([nullTotal, nullPercent], axis=1, keys=['Total', 'Percent']).T

Unnamed: 0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
Total,0.0,2969.0,0.0,0.0,31.0,0.0,84970.0,31.0,31.0,399003.0,...,442518.0,442518.0,2830.0,0.0,2830.0,378598.0,2830.0,0.0,2830.0,0.0
Percent,0.0,0.00647,0.0,0.0,6.8e-05,0.0,0.185155,6.8e-05,6.8e-05,0.869452,...,0.964274,0.964274,0.006167,0.0,0.006167,0.824989,0.006167,0.0,0.006167,0.0


In [81]:
# Count each target class and give the classes readable names for the chart.
targetCount = train.target.value_counts().rename(index={1: 'Default', 0: 'Paid'})

# Donut chart of the class balance.
target_pie = go.Pie(
    values=targetCount,
    labels=targetCount.index,
    hole=0.5,
    hoverinfo='label+percent',
    showlegend=False,
    title='Target Distribution',
)
fig2 = go.Figure(data=[target_pie])
fig2



In [72]:
# Class balance of the target: raw counts and share of all training rows (in %).
class_counts = train['target'].value_counts()
target_class = pd.DataFrame({
    'count': class_counts,
    'percentage': class_counts / train.shape[0] * 100,
})


Int64Index([0, 1], dtype='int64')

In [3]:
# Columns treated as categorical features (label-encoded later in the notebook).
caterogicalCols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

In [2]:
# Metric for performance measurement provided by Amex
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions as the mean of two rank-based measures.

    The result is ``0.5 * (G + D)`` where

    * ``G`` is the weighted Gini of the predictions divided by the weighted
      Gini obtained when the true labels themselves are used as the scores;
    * ``D`` is the fraction of all ``target == 1`` rows that fall inside the
      highest-scored rows whose cumulative weight stays within 4% of the
      total weight.

    Rows with ``target == 0`` carry weight 20; all other rows carry weight 1.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column. It is joined to ``y_true``
        column-wise, so the two frames are aligned on their index.

    Returns
    -------
    float
        The combined metric.
    """

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        """Join labels and scores, order by score (highest first), add weights."""
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = np.where(ranked['target'] == 0, 20, 1)
        return ranked

    def _capture_rate(ranked: pd.DataFrame) -> float:
        """Share of positives found before the cumulative weight exceeds 4%."""
        weight_budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= weight_budget
        is_positive = ranked['target'] == 1
        return (within_budget & is_positive).sum() / is_positive.sum()

    def _gini(ranked: pd.DataFrame) -> float:
        """Weighted sum of the gap between the cumulative-positive and random curves."""
        weight = ranked['weight']
        weighted_positive = ranked['target'] * weight
        random_curve = (weight / weight.sum()).cumsum()
        found_curve = weighted_positive.cumsum() / weighted_positive.sum()
        return ((found_curve - random_curve) * weight).sum()

    # Ranking by the labels themselves gives the Gini used as the normaliser.
    labels_as_scores = y_true.rename(columns={'target': 'prediction'})

    ranked_by_model = _ranked(y_true, y_pred)
    gini_ratio = _gini(ranked_by_model) / _gini(_ranked(y_true, labels_as_scores))
    capture = _capture_rate(ranked_by_model)

    return 0.5 * (gini_ratio + capture)

In [5]:
# Integer-encode the categorical columns so they can be fed to the model.
#
# The encoder is fitted ONCE per column on the values of both frames and then
# applied to each frame. Calling fit_transform separately on train and test
# re-fits the encoder the second time, so whenever the two frames do not
# contain exactly the same set of categories the same category would receive
# a different integer code in each frame.
enc = LabelEncoder()

for col in caterogicalCols:
    # fit() discards any previously learned classes, so reusing `enc` is safe.
    enc.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = enc.transform(train[col])
    test[col] = enc.transform(test[col])

NameError: name 'train' is not defined

In [7]:
# Separate the features (X) from the label (y), then hold out 20% of the
# training rows for evaluation. Despite their names, X_test / y_test are a
# slice of the *training* frame, not the `test` frame loaded earlier.
# train_test_split is imported from sklearn.model_selection in the imports cell.
# NOTE(review): X still contains the S_2 date column -- confirm the downstream
# model accepts it or drop it before fitting.
X = train.drop('target', axis=1)
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'train' is not defined