In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
train_data = pd.read_csv("data/UCI_Credit_Card.csv")

In [3]:
dummy_columns = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

for i in range(0, len(dummy_columns)):
    colname = "df_" + str(i)
    train_data[dummy_columns[i]] = pd.Categorical(train_data[dummy_columns[i]])
    train_data[dummy_columns[i]] = train_data[dummy_columns[i]].cat.codes
    colname = pd.get_dummies(train_data[dummy_columns[i]])
    train_data = pd.concat([train_data, colname], axis = 1)

train_data = train_data.drop(dummy_columns, axis = 1)

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 93 columns):
ID                            30000 non-null int64
LIMIT_BAL                     30000 non-null float64
AGE                           30000 non-null int64
BILL_AMT1                     30000 non-null float64
BILL_AMT2                     30000 non-null float64
BILL_AMT3                     30000 non-null float64
BILL_AMT4                     30000 non-null float64
BILL_AMT5                     30000 non-null float64
BILL_AMT6                     30000 non-null float64
PAY_AMT1                      30000 non-null float64
PAY_AMT2                      30000 non-null float64
PAY_AMT3                      30000 non-null float64
PAY_AMT4                      30000 non-null float64
PAY_AMT5                      30000 non-null float64
PAY_AMT6                      30000 non-null float64
default.payment.next.month    30000 non-null int64
0                             30000 non-null uint

In [5]:
train_data.set_index(['ID'], inplace = True)

In [6]:
train_data.rename(columns = {'default.payment.next.month': 'DEFAULT'}, inplace = True)
train_data.DEFAULT = train_data.DEFAULT.apply(str)

In [7]:
target = train_data.DEFAULT
train_data = train_data.drop(['DEFAULT'], axis = 1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(train_data, target ,test_size=0.3)

In [9]:
X_train.shape

(21000, 91)

In [10]:
#Create a Gaussian Classifier
model = GaussianNB()

In [11]:
# Train the model using the training sets 
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [12]:
y_pred = model.predict(X_test)

In [13]:
y_pred_proba = model.predict_proba(X_test)

In [14]:
confusion_matrix(y_test, y_pred)

array([[1553, 5433],
       [ 249, 1765]])

In [15]:
y_pred_proba

array([[0.13496671, 0.86503329],
       [0.16298285, 0.83701715],
       [0.54363044, 0.45636956],
       ...,
       [0.89683061, 0.10316939],
       [0.18084387, 0.81915613],
       [0.29540498, 0.70459502]])

In [16]:
roc_auc_score(y_test, y_pred_proba[:,1])

0.6242806225303494

In [17]:
accuracy_score(y_test, y_pred)

0.36866666666666664