# **Importing Modules**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Dataset

In [None]:
# Read the credit-card CSV from the Kaggle input directory and preview the first rows.
csv_path = '../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Display the column labels of the loaded DataFrame.
df.columns

In [None]:
# Display the DataFrame dimensions as (number of rows, number of columns).
df.shape

In [None]:
# Print each column's dtype and non-null count to check whether any column is non-numeric.
df.info()

That's pretty great, we have columns of datatype int64 and float64 only. There is no object type data feature. Now let's check whether our dataset has missing values or not.

In [None]:
# Count the missing entries in each column (isna is the alias of isnull).
df.isna().sum()

# Data Analysis

In [None]:
# Bar chart of how many rows fall into each value of the target column.
target_counts = df['default.payment.next.month'].value_counts()
target_counts.plot(kind='bar')

From the above plot, you can see that the target is imbalanced: most clients fall into class 0 (no default on next month's payment), and only a minority fall into class 1 (default). Now let's go through some quick data analysis and look at the distributions of the other features.

In [None]:
# Bar chart of the number of clients per SEX category.
sex_counts = df['SEX'].value_counts()
sex_counts.plot(kind='bar')


The plot shows that there are fewer male credit card holders than female ones.



In [None]:
# Histogram of client age with a kernel-density overlay.
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
# stat="density" and kde_kws=dict(cut=3) keep the y-axis scale and KDE extent that
# distplot produced, so the figure reads the same as before.
sns.histplot(df['AGE'], kde=True, bins=30, stat="density", kde_kws=dict(cut=3))


A large number of clients are between 25 and 40 years old.



In [None]:
# Bar chart of the number of clients per EDUCATION category.
education_counts = df['EDUCATION'].value_counts()
education_counts.plot(kind='bar')


It looks like most clients' education levels fall into categories 2, 1, and 3.



In [None]:
# Bar chart of the number of clients per MARRIAGE category.
marriage_counts = df['MARRIAGE'].value_counts()
marriage_counts.plot(kind='bar')


We notice that there are very few values in categories 3 and 0.



In [None]:
# Client counts per SEX category, split by the value of the target column.
sns.countplot(
    data=df,
    x='SEX',
    hue="default.payment.next.month",
    palette="muted",
)


For females, the count of default.payment.next.month = 0 is higher than for males.



In [None]:
# Client counts per EDUCATION category, split by the value of the target column.
sns.countplot(
    data=df,
    x='EDUCATION',
    hue="default.payment.next.month",
    palette="muted",
)


In [None]:
# Client counts per MARRIAGE category, split by the value of the target column.
sns.countplot(
    data=df,
    x='MARRIAGE',
    hue="default.payment.next.month",
    palette="muted",
)


# Data PreProcessing

Let's scale the numeric features.



In [None]:
# Lower-case every column label; the modelling cell below refers to the "id" column in lower case.
df.columns = df.columns.map(str.lower)

# Columns that get standardised in place.
col_to_norm = [
    'limit_bal', 'age',
    'bill_amt1', 'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6',
    'pay_amt1', 'pay_amt2', 'pay_amt3', 'pay_amt4', 'pay_amt5', 'pay_amt6',
]


def _zscore(column):
    """Return `column` centred on its mean and divided by its population std (ddof=0)."""
    return (column - column.mean()) / column.std(ddof=0)


# Hand-rolled z-score scaling; scikit-learn's StandardScaler() or MinMaxScaler() could be used instead.
# NOTE(review): the mean/std are computed over all rows of df, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
df[col_to_norm] = df[col_to_norm].apply(_zscore)
df.head()

# Correlation

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap on a wide figure.
correlation = df.corr()
fig, ax = plt.subplots(figsize=(30, 10))
sns.heatmap(correlation, annot=True, fmt=".1f", square=True, ax=ax)

# Predictive Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Drop the row-identifier column so it is not used as a feature.
# `columns=` replaces the positional axis argument (`df.drop(["id"], 1)`), which
# pandas 2.x rejects with a TypeError. errors="ignore" makes the cell safe to
# re-run after "id" has already been removed.
df = df.drop(columns=["id"], errors="ignore")

# Select the target by name rather than by position, so the features/target
# separation does not depend on the column order of the CSV.
target_col = "default.payment.next.month"
X = df.drop(columns=[target_col]).values
y = df[target_col].values

# Split the data into train (0.75) and test (0.25) sets; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)


In [None]:
# Start with a logistic regression model as the baseline classifier.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

logmodel = LogisticRegression(random_state=1)
logmodel.fit(X_train, y_train)

# Hard class labels: used by the threshold-based metrics below and by the
# confusion-matrix cell that follows.
y_pred = logmodel.predict(X_test)

# ROC AUC measures how well the model ranks positives above negatives, so it must be
# given a continuous score. Feeding it the 0/1 predictions collapses the ROC curve to a
# single operating point and understates the AUC. Column 1 of predict_proba is the
# probability of the class with the greater label.
y_proba = logmodel.predict_proba(X_test)[:, 1]

roc = roc_auc_score(y_test, y_proba)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# One-row summary table, so further models can be appended for comparison.
results = pd.DataFrame(
    [['Logistic Regression', acc, prec, rec, f1, roc]],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC'],
)
results

In [None]:
# Plot the confusion matrix of the test-set predictions.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# fmt="d" prints the integer counts in full; the heatmap's default annotation format
# ('.2g') would render counts in the thousands in scientific notation (e.g. 5.7e+03).
ax = sns.heatmap(cm, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 16})
# confusion_matrix puts true labels on the rows and predicted labels on the columns.
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix");