In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import classification_report
import sklearn.metrics as metrics
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

import os





# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
df=pd.read_csv("/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv")
df.head()

In [None]:
df.isna().sum()

In [None]:
# Remove the row identifier and any rows with missing values.
# `df` is re-bound instead of mutated in place, and a missing "ID" column is
# tolerated, so running this cell a second time no longer raises KeyError.
df = df.drop(columns=["ID"], errors="ignore").dropna(axis=0)

In [None]:
print("Describe Dataset","\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after the report.
df.info()
print(df.head(10))
print(df.describe())

**Categorical Variables**

The categorical variables are SEX, MARRIAGE, EDUCATION and AGE. We examine how the dataset is distributed across their categories and whether there are sparse classes, which can cause our models to overfit.

In [None]:
df["EDUCATION"].value_counts()

In [None]:
# Fold the EDUCATION codes 5, 6 and 0 into code 4 in a single pass.
df['EDUCATION'] = df['EDUCATION'].replace({5: 4, 6: 4, 0: 4})

In [None]:
df["EDUCATION"].value_counts()

In [None]:
df["MARRIAGE"].value_counts()

In [None]:
df['MARRIAGE']=np.where(df['MARRIAGE'] == 0, 3, df['MARRIAGE'])

In [None]:
df["MARRIAGE"].value_counts()

In [None]:
def age(x):
    """Map an age in years to an ordinal age-group code.

    Returns:
        1 for 21 <= x < 41,
        2 for 41 <= x < 61,
        3 for x >= 61,
        None for anything else (below 21, NaN or None).

    The bins are expressed as comparisons rather than ``range`` membership so
    that ages of 80 and above, and non-integer ages, are still assigned a group
    instead of silently becoming None (NaN once stored in the column).
    """
    if x is None:
        return None
    if 21 <= x < 41:
        return 1
    if 41 <= x < 61:
        return 2
    if x >= 61:
        return 3
    # Below 21 or NaN (every comparison with NaN is False).
    return None

df['AGE']=df['AGE'].apply(age)

In [None]:
# List of numerical features
numeric = [
    'LIMIT_BAL',
    'BILL_AMT1',
    'BILL_AMT2',
    'BILL_AMT3',
    'BILL_AMT4',
    'BILL_AMT5',
    'BILL_AMT6',
    'PAY_0',
    'PAY_2',
    'PAY_3',
    'PAY_4',
    'PAY_5',
    'PAY_6',
    'PAY_AMT1',
    'PAY_AMT2',
    'PAY_AMT3',
    'PAY_AMT4',
    'PAY_AMT5',
    'PAY_AMT6',
]

# List of categorical features
categorical = ['SEX', 'EDUCATION', 'MARRIAGE','AGE']

In [None]:
print("Continuous Variables Visualization","\n")
df.hist(column=numeric,figsize=(16,16))
plt.show()

In [None]:
print("Categorical Variables Visualization","\n")
# One panel per categorical column. The panel count is derived from the list
# itself, so the two cannot drift apart (the previous `i > 4` guard was off by
# one and would have indexed past the end of `categorical` for a fifth panel).
fig, axes = plt.subplots(1, len(categorical), figsize=(25, 5), squeeze=False)
fig.subplots_adjust(wspace=0.7, hspace=0.3)
for column, ax in zip(categorical, axes.ravel()):
    sns.countplot(y=column, data=df, ax=ax)
plt.show()

In [None]:
print("Correlation Matrix","\n")    
correlation=df.corr(method="pearson")
plt.figure(figsize=(8,7))
sns.heatmap(correlation,vmax=1, cmap="coolwarm", cbar = True,  square = True, annot = False, fmt= '.1f',xticklabels= True, yticklabels= True,linewidths=.5)
plt.show()

In [None]:
X = df.iloc[:, :-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

In [None]:
def print_confusion_matrix(y_test, prediction):
    """Plot the confusion matrix of ``prediction`` against ``y_test`` as a heatmap.

    The accuracy score, rounded to three decimals, is shown in the plot title.

    Args:
        y_test: true labels.
        prediction: predicted labels, aligned with ``y_test``.
    """
    print("Confusion Matrix","\n")
    score = round(accuracy_score(y_test, prediction),3)
    counts = cm(y_test, prediction)
    # The cells are integer counts, so annotate them as integers rather than
    # as floats with one decimal ("4687" instead of "4687.0").
    sns.heatmap(counts, annot=True, fmt="d", linewidths=.3,
                square=True, cmap='PuBu')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('Accuracy Score: {0}'.format(score), size = 12)
    plt.show()
    print("\n")

In [None]:
def print_roc_auc (y_test, y_pred,model):
    """Draw the ROC curve of ``y_pred`` against ``y_test`` and show its AUC in the legend.

    Args:
        y_test: true binary labels.
        y_pred: predicted values handed to ``metrics.roc_curve``.
        model: string used as the prefix of the plot title.
    """
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(y_test, y_pred)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    legend_text = 'AUC= {:0.5f}'.format(area)

    # The curve itself, then the red dashed chance diagonal.
    plt.plot(false_pos_rate, true_pos_rate, 'b', label=legend_text)
    plt.plot([0,1],[0,1], 'r--')

    plt.title(model + ' ROC')
    plt.legend(loc='lower right')
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

In [None]:
class Columns(BaseEstimator, TransformerMixin):
    """Pipeline step that passes through only the configured columns of its input."""

    def __init__(self, names=None):
        # Kept under the same attribute name as the constructor argument.
        self.names = names

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.names``."""
        selected = X[self.names]
        return selected

# OneHotEncoder's dense-output flag was renamed from `sparse` to `sparse_output`
# (scikit-learn 1.2) and the old keyword was later removed. Try the new name
# first and fall back to the old one so the notebook runs on either side.
try:
    _dense_one_hot = OneHotEncoder(sparse_output=False)
except TypeError:
    _dense_one_hot = OneHotEncoder(sparse=False)

# Feature matrix = standardised numeric columns + dense one-hot categorical columns.
features = FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric),StandardScaler())),
        ('categorical', make_pipeline(Columns(names=categorical),_dense_one_hot))
        ])

In [None]:
def _positive_class_scores(pipe, X, fallback):
    """Return continuous scores for the positive class, for ROC/AUC purposes.

    Uses ``predict_proba`` when the fitted pipeline exposes it, otherwise
    ``decision_function``; if neither exists, ``fallback`` is returned as is.
    """
    if hasattr(pipe, "predict_proba"):
        return pipe.predict_proba(X)[:, 1]
    if hasattr(pipe, "decision_function"):
        return pipe.decision_function(X)
    return fallback


def modelcomparison(X_train, X_test, y_train,y_test):
    """Fit each candidate classifier behind the shared ``features`` step and compare them.

    For every model this prints the estimator, a classification report, the
    confusion matrix and the ROC curve, and records one row of metrics.

    Args:
        X_train, X_test: feature frames holding the columns ``features`` selects.
        y_train, y_test: binary targets aligned with the feature frames.

    Returns:
        pandas.DataFrame with one row per model and the columns Models, Accuracy,
        Precision, Recall, AreaUnderCurve, MeanSquaredError, RootMeanSquaredError
        and MeanAbsoluteError.

    Notes:
        * ROC/AUC is computed from continuous scores (probabilities or decision
          values). Hard 0/1 predictions reduce the ROC curve to a single
          operating point and under-state the AUC.
        * The three error columns are computed on the predicted class labels.
    """
    tested_models = {
        'LogisticRegression': LogisticRegression(solver='liblinear'),
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'KNeighbors': KNeighborsClassifier(),
        'XGB': XGBClassifier(),
        'SVC': SVC(),
        'MLPClassifier' : MLPClassifier(),
        'GaussianNB' : GaussianNB(),
        'GradientBoosting' :  GradientBoostingClassifier(random_state=0),
        'LGBMClassifier'   : LGBMClassifier(random_state=5)
    }

    columns = ['Models', 'Accuracy',  'Precision', 'Recall', 'AreaUnderCurve',
               'MeanSquaredError', 'RootMeanSquaredError', 'MeanAbsoluteError']
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    rows = []

    for name, estimator in tested_models.items():
        pipe = Pipeline([
            ("features", features),
            ('model', estimator)
        ])
        print(estimator)
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        y_score = _positive_class_scores(pipe, X_test, y_pred)

        fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
        roc_auc = metrics.auc(fpr, tpr)
        mse = mean_squared_error(y_test, y_pred)

        rows.append({
            'Models'                : name,
            'Accuracy'              : round(accuracy_score(y_test, y_pred),5),
            'Precision'             : round(precision_score(y_test, y_pred, average='macro'),5),
            'Recall'                : round(recall_score(y_test, y_pred, average='macro'),5),
            'AreaUnderCurve'        : roc_auc,
            'MeanSquaredError'      : mse,
            'RootMeanSquaredError'  : np.sqrt(mse),
            'MeanAbsoluteError'     : mean_absolute_error(y_test,y_pred)
        })

        print('======================')
        print('Tested Model: ', name)
        print('======================')
        print(classification_report(y_test, y_pred))
        print_confusion_matrix(y_test, y_pred)
        print_roc_auc(y_test, y_score, name)

    return pd.DataFrame(rows, columns=columns)


In [None]:
ModelsResults=modelcomparison(X_train,X_test, y_train, y_test)

# Model Comparison of the Data Set According to Accuracy, Precision and Other Metrics

In [None]:
ordered_model_results=ModelsResults.sort_values(by=['Accuracy'], ascending=False)
ordered_model_results

# **K-FOLD CROSS VALIDATION**

As shown below, the models with the highest accuracy are selected and then evaluated with 10-fold cross-validation.

In [None]:
def ModelValidation(clf, k=10, displayscores=True):
    """Return the per-fold accuracy of ``clf`` under stratified k-fold cross-validation.

    The folds are taken from the notebook-level ``X`` and ``y`` (shuffled,
    ``random_state=42``); ``clf`` is re-fitted on every training split.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        k: number of folds.
        displayscores: accepted for backward compatibility; it has no effect
            (the printing it once controlled was already disabled).

    Returns:
        numpy.ndarray of length ``k`` holding the accuracy of each fold.
    """
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    accarr = np.zeros(k)

    # Only accuracy is returned, so the precision/recall bookkeeping and the
    # DataFrame.append-based `result` table (never used, and broken on
    # pandas >= 2.0 where DataFrame.append no longer exists) are dropped.
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        clf.fit(X.iloc[train_index], y.iloc[train_index])
        y_pred = clf.predict(X.iloc[test_index])
        accarr[fold] = accuracy_score(y.iloc[test_index], y_pred)

    return accarr


In [None]:
def EvaluateModels(clflist=None, k=10):
    """Cross-validate every classifier in ``clflist`` and tabulate the fold accuracies.

    Args:
        clflist: iterable of estimators; ``None`` (the default) means no models.
        k: number of folds passed to ``ModelValidation``.

    Returns:
        pandas.DataFrame with one row per classifier and the columns Models,
        Fold1..Fold<k> and AverageAccuracy.
    """
    if clflist is None:
        clflist = []

    # Fold columns follow `k` instead of being hard-coded to ten, so any fold
    # count produces a table of matching width.
    fold_columns = ['Fold{}'.format(i) for i in range(1, k + 1)]
    columns = ['Models'] + fold_columns + ['AverageAccuracy']

    # Build the frame once from a list of rows (DataFrame.append was removed
    # in pandas 2.0).
    rows = []
    for clf in clflist:
        acc = ModelValidation(clf, k)
        row = {'Models': clf.__class__.__name__}
        row.update(zip(fold_columns, acc))
        row['AverageAccuracy'] = np.average(acc)
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

**10 FOLD COMPARISON TABLE AND AVERAGE ACCURACY OF FOLDS**

In [None]:
# Build the table before displaying it: nothing in the notebook assigned
# KFoldComparisonTable, so a fresh Restart & Run All stopped here with NameError.
# NOTE(review): the models originally cross-validated are not recorded; this uses
# the same four estimators as the feature-selection section below - confirm.
KFoldComparisonTable = EvaluateModels([
    GradientBoostingClassifier(random_state=0),
    XGBClassifier(),
    LGBMClassifier(),
    LogisticRegression(solver='liblinear'),
])
KFoldComparisonTable

# **FEATURE SELECTION**
In this section, feature selection is applied to standardized data from df.

In [None]:
data=df.copy()
data_X = data.iloc[:, :-1]
data_y = data.iloc[:,-1]

Data Standardization

In [None]:
# Standardise the numeric columns of data_X, then one-hot encode each
# categorical column and join everything into a single frame.
scaler = StandardScaler()
data_X[numeric] = scaler.fit_transform(data_X[numeric])

dmmy_col1, dmmy_col2, dmmy_col3, dmmy_col4 = (
    pd.get_dummies(data_X[column], prefix=column, dummy_na=False)
    for column in ('SEX', 'EDUCATION', 'MARRIAGE', 'AGE')
)

encoded_parts = [data_X[numeric], dmmy_col1, dmmy_col2, dmmy_col3, dmmy_col4]
std_X = pd.concat(encoded_parts, axis=1)
std_X

Train&Test Data Splitting

In [None]:
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(std_X, data_y, test_size=.2, random_state=123)

In [None]:
def bestFeatureSelection(clf,data_X,data_y, n_features=5):
    """Select the ``n_features`` top-ranked features for ``clf`` with RFE.

    Args:
        clf: estimator handed to ``RFE``.
        data_X: feature DataFrame; its column names label the selected features.
        data_y: target values aligned with ``data_X``.
        n_features: number of features to keep (defaults to the previous
            hard-coded value of 5).

    Returns:
        Tuple ``(selected_feature_names, score)`` where the names are a numpy
        array and ``score`` is the mean of ``RFE.score`` on the same data the
        selector was fitted on.
    """
    # n_features_to_select is keyword-only in current scikit-learn releases.
    rfe_stand = RFE(clf, n_features_to_select=n_features)
    # Fit on the data passed in rather than on the notebook-level std_X, so the
    # function honours its own arguments.
    fit_stand = rfe_stand.fit(data_X, data_y)
    print("Std Model Feature Ranking:", fit_stand.ranking_)

    score_stand = rfe_stand.score(data_X, data_y)
    # np.mean/np.std accept a plain Python float as well as a numpy scalar,
    # whereas calling .mean()/.std() on the score fails for a plain float.
    score_mean = np.mean(score_stand)
    score_std = np.std(score_stand)
    print("Standardized Model Score with selected features is: %f (%f)" % (score_mean, score_std))

    feature_names = np.array(data_X.columns)
    selected = feature_names[rfe_stand.support_]
    print('Most important features (RFE): %s'% selected)
    return selected, score_mean

In [None]:
def modelsFeatureSelection(clflist, std_X, data_y):
    """Run ``bestFeatureSelection`` for every classifier and tabulate the outcome.

    Args:
        clflist: iterable of estimators.
        std_X: feature DataFrame passed through to ``bestFeatureSelection``.
        data_y: target values aligned with ``std_X``.

    Returns:
        pandas.DataFrame with one row per classifier and the columns Models,
        Feature1..Feature5 and ScorewithSelectedFeatures.
    """
    columns = ['Models', 'Feature1',  'Feature2','Feature3','Feature4','Feature5',
               'ScorewithSelectedFeatures']

    # Rows are gathered first and the frame is built once, because
    # DataFrame.append was removed in pandas 2.0.
    rows = []
    for clf in clflist:
        selected, score = bestFeatureSelection(clf, std_X, data_y)
        row = {'Models': clf.__class__.__name__,
               'ScorewithSelectedFeatures': score}
        for position, feature in enumerate(selected, start=1):
            row['Feature{}'.format(position)] = feature
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [None]:

# Estimators to run recursive feature elimination with.
# The LightGBM and logistic-regression instances get names of their own:
# binding them to `LGBMClassifier` / `LogisticRegression` overwrote the imported
# classes, so re-running this cell (or `modelcomparison`) failed with
# "object is not callable".
GradientBoosting      = GradientBoostingClassifier(random_state=0)
XGB                   = XGBClassifier()
LGBM                  = LGBMClassifier()
LogisticReg           = LogisticRegression(solver='liblinear')
# Not included in this run: SVC(), AdaBoostClassifier(),
# DecisionTreeClassifier(criterion='gini', max_depth=None),
# RandomForestClassifier(), KNeighborsClassifier(n_neighbors=5, weights='uniform'),
# GaussianNB().

clflist = [GradientBoosting, XGB, LGBM, LogisticReg]


FeatureSelectionResult=modelsFeatureSelection(clflist,std_X, data_y)

Scores obtained with the five most important features selected for each model.
Compared with the accuracy scores from the first model comparison, only the LogisticRegression classifier showed an improvement.

In [None]:
FeatureSelectionResult

# MODEL USING NEURAL NETWORK

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [None]:
# Feed-forward binary classifier over the standardised / one-hot encoded features.
model = Sequential()
# Each sample is a flat vector of n features, so the per-sample shape is (n,).
# The previous (None, n) declared a rank-3 input that did not match the 2-D
# training matrix.
model.add(Dense(100, input_shape=(data_X_train.shape[1],)))
model.add(Dense(50, activation="relu"))
model.add(Dense(25, activation="relu"))
model.add(Dense(10, activation="relu"))
# Sigmoid keeps the single output inside [0, 1] so it can be read as a
# probability of the positive class; a linear unit is unbounded.
model.add(Dense(1, activation="sigmoid"))

In [None]:
model.compile(optimizer="adam", loss= "mse", metrics=["accuracy"],)
# The features are cast to float32 because recent pandas emits boolean dummy
# columns, and a mixed bool/float frame cannot be converted to a tensor.
# The held-out split is passed as validation data so the history also records
# val_loss / val_accuracy; the existing "loss" and "accuracy" keys still
# describe the training data only.
history=model.fit(
    data_X_train.astype("float32"), data_y_train,
    epochs=100,
    validation_data=(data_X_test.astype("float32"), data_y_test),
)

In [None]:
# Training loss (left axis) and training accuracy (right axis) per epoch.
fig, host = plt.subplots()
fig.subplots_adjust(right=1)

par1 = host.twinx()

p1, = host.plot(history.history["loss"], "b-", label="Loss")
p2, = par1.plot(history.history["accuracy"], "r-", label="Accuracy")
host.set_xlabel("Epoch")
host.set_ylabel("Loss")
par1.set_ylabel("Accuracy")
host.set_title("Neural network training history")

# The two lines were labelled but no legend was ever drawn. It is attached to
# the twin axis, which is rendered on top, and lists both handles.
par1.legend(handles=[p1, p2], loc="center right")

plt.show()

# RESULTS
As a result, comparing the neural network with the other classifiers, the artificial neural network is the only one that reaches the maximum accuracy and can estimate the real probability of default. After the neural network, the GradientBoosting, XGB and LGBM classifiers follow in order of accuracy.