In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [None]:
# Raw inputs: one row per applicant, and one row per (applicant, month) of credit history.
application = pd.read_csv('../input/credit-card-approval-prediction/application_record.csv')
credit = pd.read_csv('../input/credit-card-approval-prediction/credit_record.csv')

# Model only the customers that are present in both datasets.
ids = set(application['ID']) & set(credit['ID'])
application, credit = (frame[frame['ID'].isin(ids)] for frame in (application, credit))

# **WOE**

# **Feature Engineering**

OCCUPATION TYPE

Because the job categories in OCCUPATION_TYPE are generic, they are less prone to overfitting. Besides, dropping any value may cause a noticeable loss of information.

In [None]:
def impute_occupation_type(application):
    """
    Fill missing OCCUPATION_TYPE values by sampling from the observed jobs.

    Each missing entry is replaced by a job drawn at random (global numpy
    random state) with probability equal to that job's relative frequency
    among the non-missing rows.

    - application: DataFrame with an 'OCCUPATION_TYPE' column; it is not modified

    Returns a copy with the missing jobs filled in and an extra
    'IMPUTED_OCCUPATION_TYPE' column (1 where the job was missing, else 0).
    If no job is observed at all there is nothing to sample from, so the
    missing values are left as they are (the flag is still set).
    """
    x = application.copy()

    missing = x['OCCUPATION_TYPE'].isnull()
    # Relative frequency of every observed job, computed once.
    job_frequency = x['OCCUPATION_TYPE'].value_counts(normalize=True)

    x['IMPUTED_OCCUPATION_TYPE'] = missing.astype('int64')

    # np.random.choice cannot sample from an empty population, so skip the
    # draw when every row is missing (or when there is nothing to fill).
    if missing.any() and not job_frequency.empty:
        job_list = job_frequency.index.to_numpy()
        random_index = np.random.choice(a=len(job_list), size=int(missing.sum()), p=job_frequency.to_numpy())
        x.loc[missing, 'OCCUPATION_TYPE'] = job_list[random_index]

    return x

DAYS_EMPLOYED

In [None]:
def create_unemployed_column(application):
    """
    Add an 'UNEMPLOYED' flag derived from the sign of DAYS_EMPLOYED.

    Rows with DAYS_EMPLOYED <= 0 get 0, rows with DAYS_EMPLOYED > 0 get 1.
    The input frame is left untouched; a flagged copy is returned.
    """
    flagged = application.copy()
    days = flagged['DAYS_EMPLOYED']

    # Same two masked assignments, in the same order: first the 0s, then the 1s.
    for flag, rows in ((0, days <= 0), (1, days > 0)):
        flagged.loc[rows, 'UNEMPLOYED'] = flag

    return flagged

Transform Skewed Data

In [None]:
# Each entry: (label, how to select the series, candidate transform).
# DAYS_EMPLOYED only looks at values < 0 (customers currently being employed).
skew_checks = (
    ('CNT_FAM_MEMBERS', lambda: application['CNT_FAM_MEMBERS'], np.log),
    ('CNT_CHILDREN', lambda: application['CNT_CHILDREN'], lambda s: np.power(s, 1/7)),
    ('DAYS_EMPLOYED',
     lambda: application.loc[application['DAYS_EMPLOYED']<0,'DAYS_EMPLOYED'],
     lambda s: -1*np.sqrt(-1*s)),
)

print('Skewness coefficient')
for label, select, transform in skew_checks:
    before = select().skew()
    after = transform(select()).skew()
    print(f'{label} ------')
    print(f'Before: {before}')
    print(f'After:  {after}')

In [None]:
def transform_skewed_data(application):
    """
    Reduce the skew of the continuous application columns.

    - CNT_FAM_MEMBERS: natural log
    - CNT_CHILDREN: 7th root
    - DAYS_EMPLOYED: values < 0 become -sqrt(-value); other values are kept

    - application: DataFrame holding the three columns above; it is not modified

    Returns a transformed copy. The three columns are returned as floats.
    """
    x = application.copy()

    # Assign whole columns instead of writing through .loc[:, col]: the results
    # are floats, and pushing them into an existing integer column relies on an
    # implicit upcast that newer pandas versions deprecate.
    x['CNT_FAM_MEMBERS'] = np.log(x['CNT_FAM_MEMBERS'])
    x['CNT_CHILDREN'] = np.power(x['CNT_CHILDREN'], 1/7)

    days = x['DAYS_EMPLOYED'].astype('float64')
    employed = days < 0
    # abs() keeps sqrt away from positive inputs being negated (no NaN warnings);
    # mask() only swaps in the transformed value where `employed` is True.
    x['DAYS_EMPLOYED'] = days.mask(employed, -np.sqrt(days.abs()))

    return x

Encode dataset

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Columns that are one-hot encoded before modelling.
categorical_columns = ['CODE_GENDER','FLAG_OWN_CAR', 'FLAG_OWN_REALTY','NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','OCCUPATION_TYPE']

# scikit-learn renamed `sparse` to `sparse_output` and later removed the old
# name, so try the new keyword first and fall back for older installations.
# Either way the encoder returns a dense array and ignores unseen categories.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fitted on complete rows only: rows with any missing categorical value are skipped.
encoder.fit(application[categorical_columns].dropna())

In [None]:
def encode(application):
    """
    One-hot encode the categorical columns with the fitted module-level `encoder`.

    - application: DataFrame containing every column in `categorical_columns`

    Returns a copy with a fresh 0..n-1 index in which the categorical columns
    are replaced by one indicator column per known category, named
    '<column>_<category>'.
    """
    # drop=True: the previous row index is discarded. A plain reset_index()
    # would keep it as an 'index' column, which then ends up among the model
    # features.
    x = application.copy().reset_index(drop=True)

    # String names keep every column label a str; integer labels mixed with
    # string ones are rejected by recent scikit-learn estimators.
    encoded_names = [
        f'{column}_{category}'
        for column, categories in zip(categorical_columns, encoder.categories_)
        for category in categories
    ]
    encoded = pd.DataFrame(encoder.transform(x[categorical_columns]), columns=encoded_names, index=x.index)

    x = x.join(encoded)
    x = x.drop(categorical_columns,axis=1)
    return x

# **Labels**

You may need to modify the function below for your way of classifying good/bad customers

In [None]:
# this method reduced our dataset to 1/3 (36457 -> 15168)
def get_credit_status(credit):
    """
    Label every monthly credit record of customers with a long enough history.

    - credit: DataFrame with 'ID', 'MONTHS_BALANCE' and 'STATUS' columns;
      it is not modified

    For each ID the observation window is max(MONTHS_BALANCE) - min(MONTHS_BALANCE).
    Only records of IDs whose window is greater than 15 are kept. The returned
    frame has the original columns plus 'open_month', 'end_month', 'window' and
    'status', where status is 1 when STATUS is '2', '3', '4' or '5' and 0 otherwise.
    """
    # First/last month per customer straight from a groupby; no pivot table is
    # needed for this (a pivot also fails on duplicated (ID, MONTHS_BALANCE) rows).
    history = (
        credit.groupby('ID')['MONTHS_BALANCE']
        .agg(open_month='min', end_month='max')
        .reset_index()
    )
    history['window'] = history['end_month'] - history['open_month']

    credit0 = pd.merge(credit, history, on = 'ID', how = 'left')
    # .copy() so the new column below is written to an independent frame
    # rather than to a filtered view.
    credit0 = credit0[credit0['window']>15].copy()
    credit0['status'] = np.where(credit0['STATUS'].isin(['2', '3', '4', '5']), 1, 0)

    return credit0

# **Processing Data**

In [None]:
def oversampling(data, factor=40):
    """
    Oversample the positive class by duplicating its rows.

    - data: DataFrame with a 'status' column (1 = positive case)
    - factor: number of extra copies of every positive row to add

    Returns a new, randomly ordered DataFrame holding the original rows plus
    `factor` additional copies of each positive row.
    """
    positive = data[data['status']==1]

    # One concat instead of repeated DataFrame.append calls: append is gone in
    # pandas 2.0 and re-copied the growing frame on every iteration.
    data = pd.concat([data] + [positive] * factor)

    # Row shuffle through pandas itself, so this cell does not rely on an
    # import that only happens in a later cell.
    return data.sample(frac=1)

def downsampling(data, remove_amount=0):
    """
    Downsample the negative class by discarding randomly chosen rows.

    - data: DataFrame with a 'status' column (0 = negative case, 1 = positive case)
    - remove_amount: how many negative rows to drop; must be >= 0 and smaller
      than the number of negative rows (checked with an assert)

    Returns a new, randomly ordered DataFrame with all positive rows and the
    negative rows that were kept.
    """
    positive = data[data['status']==1]
    negative = data[data['status']==0]
    assert remove_amount>=0 and remove_amount < len(negative)

    # Random subset of the negatives, drawn without replacement.
    kept_negative = negative.sample(n=len(negative) - remove_amount)

    # pd.concat replaces DataFrame.append (removed in pandas 2.0); sample(frac=1)
    # shuffles the rows without relying on an import from a later cell.
    return pd.concat([positive, kept_negative]).sample(frac=1)

In [None]:
def _prepare_features(features, transform_skewed):
    """Impute jobs, flag unemployment, optionally de-skew, then one-hot encode an application frame."""
    features = impute_occupation_type(features)
    features = create_unemployed_column(features)
    if transform_skewed:
        features = transform_skewed_data(features)
    return encode(features)


def _prepare_labels(records):
    """Reduce credit records to one row per ID whose 'status' is True if any record was flagged."""
    labels = get_credit_status(records)[['ID','status']]
    return labels.groupby('ID').any().reset_index()


def process_datasets(x_train, x_test, y_train, y_test, oversampling_factor=30, down_sampling_amount=0, transform_skewed = True):
    """
    Wrapper that runs every preprocessing step on a train/test split.

    - x_train: unprocessed train application dataset
    - x_test: unprocessed test application dataset
    - y_train: unprocessed train credit dataset
    - y_test: unprocessed test credit dataset
    - oversampling_factor: oversample the positive cases by this factor (because the current label method only has 3% as positive)
    - down_sampling_amount: remove this amount of negative cases to balance positive/negative cases
    - transform_skewed: if True, transform any skewed continuous data in the application dataset

    Returns (x_train, x_test, y_train, y_test): feature frames without the
    'ID'/'status' columns and the matching 'status' label series. Only the
    training split is resampled.
    """
    # Features: train first, then test (same order as the random imputation draws).
    train_features = _prepare_features(x_train, transform_skewed)
    test_features = _prepare_features(x_test, transform_skewed)

    # Labels: one status per customer.
    train_labels = _prepare_labels(y_train)
    test_labels = _prepare_labels(y_test)

    # Inner-join on ID so that features and labels are guaranteed to line up.
    merged_train = train_features.merge(train_labels, on='ID')
    merged_test = test_features.merge(test_labels, on='ID')

    # Rebalance the training split only.
    merged_train = oversampling(merged_train, factor=oversampling_factor)
    merged_train = downsampling(merged_train, remove_amount=down_sampling_amount)

    non_feature_columns = ['ID', 'status']
    return (
        merged_train.drop(non_feature_columns, axis=1),
        merged_test.drop(non_feature_columns, axis=1),
        merged_train['status'],
        merged_test['status'],
    )

# **Cross Validation**

In [None]:
from sklearn.utils import shuffle

# 80/20 split sizes; the cross-validation fold is as large as the test split.
train_size = (80 * len(application)) // 100
test_size = len(application) - train_size
fold_size = test_size
print('Train size: {}, Test size: {}'.format(train_size, test_size))

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def cross_validation(model, application, credit, epochs=10, oversampling_factor=30, down_sampling_amount=0, transform_skewed = True, threshold=0.5):
    """
    Perform 5-fold cross validation; wraps the preprocessing, fitting and evaluating steps.

    - model: machine learning model
    - application: application dataset
    - credit: credit dataset
    - epochs: this only applies to the Neural Network, number of epochs to train
    - oversampling_factor: oversample the positive cases by this factor (because the current label method only has 3% as positive)
    - down_sampling_amount: remove this amount of negative cases to balance positive/negative cases
    - transform_skewed: if True, transform any skewed continuous data in the application dataset
    - threshold: this only applies to the Neural Network, threshold for the decision boundary

    Returns (accuracy, f1, precision, recall), each averaged over the folds.
    """
    n_folds = 5
    application = shuffle(application)

    # Ceil division, so the folds together cover every row of the frame that
    # was actually passed in (no dependency on a module-level fold size).
    fold_size = -(-len(application) // n_folds)

    # Keras models are recognised by the package their class comes from; the
    # full class path differs between TensorFlow/Keras releases.
    is_keras = type(model).__module__.split('.')[0] in ('tensorflow', 'keras')

    total_acc = 0
    total_f1 = 0
    total_precision = 0
    total_recall = 0

    for i in range(n_folds):
        start, stop = fold_size*i, fold_size*(i+1)

        # Train on everything outside [start, stop): no held-out row may also
        # appear in the training data.
        x_test = application[start:stop].copy()
        x_train = pd.concat([application[:start], application[stop:]]).copy()

        y_train = credit[credit['ID'].isin(x_train['ID'])].copy()
        y_test = credit[credit['ID'].isin(x_test['ID'])].copy()

        x_train, x_test, y_train, y_test = process_datasets(x_train, x_test, y_train, y_test, oversampling_factor=oversampling_factor,
                                                            down_sampling_amount=down_sampling_amount, transform_skewed=transform_skewed)

        # NOTE(review): the same model object is fitted in every fold. For Keras
        # this presumably continues from the previous fold's weights - confirm
        # whether a fresh model per fold is wanted.
        if is_keras:
            model.fit(x_train, y_train, epochs=epochs)
        else:
            model = model.fit(x_train, y_train)

        predictions = model.predict(x_test)
        if is_keras:
            # Flatten the model output to 1-D, then cut at the threshold.
            predictions = np.ravel(predictions) > threshold

        total_acc = total_acc + accuracy_score(y_test, predictions)
        total_f1 = total_f1 + f1_score(y_test, predictions)
        total_precision = total_precision + precision_score(y_test, predictions)
        total_recall = total_recall + recall_score(y_test,predictions)

    return total_acc/n_folds, total_f1/n_folds, total_precision/n_folds, total_recall/n_folds

In [None]:
# TODO: Implement AUC_PR

In [None]:
# TODO: Research SMOTE

# **Train-Test Split**

Regular Train-Test Split (This is used before Cross Validation to make it easier to tune the model)

In [None]:
application = shuffle(application)

# First `train_size` shuffled rows form the train split, the rest the test split.
x_train, x_test = application[:train_size].copy(), application[train_size:].copy()

# Credit records follow their applicant into the same split.
y_train = credit[credit['ID'].isin(x_train['ID'])].copy()
y_test = credit[credit['ID'].isin(x_test['ID'])].copy()

x_train, x_test, y_train, y_test = process_datasets(
    x_train, x_test, y_train, y_test,
    oversampling_factor=30,
    down_sampling_amount=0,
    transform_skewed=True,
)

In [None]:
y_train.value_counts() # class counts of the training labels, i.e. the ratio after oversampling/downsampling

# **Models Selection**

> **Logistic Regression**

Hyperparameter
* max_iter = 400
* threshold = 0.5
* oversampling_factor = 30
* down_sampling_amount = 0
* transform_skewed = True

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline; max_iter=400 matches the hyperparameter list above.
lr_model = LogisticRegression(max_iter=400)

In [None]:
# Cross-validated scores of the logistic regression model.
acc, f1, precision, recall = cross_validation(
    lr_model, application, credit,
    oversampling_factor=30, down_sampling_amount=0, transform_skewed=True,
)

for metric_name, metric_value in (('Acc', acc), ('f1', f1), ('Precision', precision), ('Recall', recall)):
    print(f'{metric_name}: {metric_value}')

K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# n_neighbors: how many nearby samples vote on each prediction.
# weights='uniform': every neighbor counts equally ('distance' would give the closest ones more influence).
modelK = KNeighborsClassifier(weights = 'uniform', n_neighbors = 2000)

acc, f1, precision, recall = cross_validation(modelK, application, credit)

for metric_name, metric_value in (('Acc', acc), ('f1', f1), ('Precision', precision), ('Recall', recall)):
    print(f'{metric_name}: {metric_value}')


# classify(modelK, x, y)

# modelK.fit(x_train, y_train)
#y_predK = modelK.predict(x_test)
#cmK = confusion_matrix(y_test, y_predK)
#cmK

#sns.heatmap(cmK, annot = True)

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier 
# splitter='random': choose the best random split instead of the overall best split.
# max_depth: deeper trees fit the training data more closely but generalize less.
# max_features: number of features considered when looking for a split.
modelTree = DecisionTreeClassifier(max_depth=300, max_features=10, splitter='random')

acc, f1, precision, recall = cross_validation(modelTree, application, credit)

for metric_name, metric_value in (('Acc', acc), ('f1', f1), ('Precision', precision), ('Recall', recall)):
    print(f'{metric_name}: {metric_value}')

#classify(modelTree, x, y)

#modelTree.fit(x_train, y_train)
#from sklearn.metrics import confusion_matrix
#y_predTree = modelTree.predict(x_test)
#cmTree = confusion_matrix(y_test, y_predTree)
#cmTree

#sns.heatmap(cmTree, annot = True)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# n_estimators: number of trees in the forest.
# max_depth: deeper trees fit the training data more closely but generalize less.
# bootstrap=True: each tree is built from a bootstrap sample of the rows;
# with bootstrap=False the whole dataset is used to build every tree.
modelForest = RandomForestClassifier(bootstrap = True, max_depth=12, n_estimators=100)

acc, f1, precision, recall = cross_validation(modelForest, application, credit)

for metric_name, metric_value in (('Acc', acc), ('f1', f1), ('Precision', precision), ('Recall', recall)):
    print(f'{metric_name}: {metric_value}')


#classify(modelForest, x, y)

#modelForest.fit(x_train, y_train)
#from sklearn.metrics import confusion_matrix
#y_predForest = modelForest.predict(x_test)
#cmForest = confusion_matrix(y_test, y_predForest)
#cmForest

#sns.heatmap(cmForest, annot = True)

> **Neural Network**

Hyperparameter
* epochs = 10
* threshold = 0.5

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

# Three ReLU hidden layers (64, 128, 128 units) followed by a single sigmoid output unit.
hidden_units = (64, 128, 128)
nn_model = keras.Sequential(
    [layers.Dense(units, activation='relu') for units in hidden_units]
    + [layers.Dense(1, activation='sigmoid')]
)

nn_model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[keras.metrics.BinaryAccuracy()],
)

# Hyperparameters passed on to cross_validation.
epochs = 10
threshold = 0.5

In [None]:
# Cross-validate the neural network with the epochs/threshold set in the cell that builds nn_model.
acc, f1, precision, recall = cross_validation(nn_model, application, credit, epochs=epochs, threshold=threshold)

In [None]:
# Report the averaged cross-validation metrics of the neural network.
for metric_name, metric_value in (('Acc', acc), ('f1', f1), ('Precision', precision), ('Recall', recall)):
    print(f'{metric_name}: {metric_value}')