# CREDIT CARD APPROVAL

# Explore data in the two tables

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
application_record = pd.read_csv('/kaggle/input/credit-card-approval-prediction/application_record.csv')
application_record.head()

In [None]:
application_record.shape

In [None]:
application_record['ID'].nunique()

* There are 438,510 unique customers on whom we have demographic data

In [None]:
credit_record = pd.read_csv('/kaggle/input/credit-card-approval-prediction/credit_record.csv')
credit_record.head()

In [None]:
credit_record.shape

In [None]:
credit_record['ID'].nunique()

* There are 45,985 unique customers on whom we have credit score/status data
* On average we have 22.8 months worth of records/score/status on each of these 46 thousand customers (see below cell)

In [None]:
credit_record.shape[0]/credit_record['ID'].nunique()

In [None]:
credit_record.dtypes

In [None]:
credit_record.isnull().sum()

In [None]:
application_record.head().T

In [None]:
application_record.dtypes

# Deal with the missing values

In [None]:
application_record.isnull().sum()

In [None]:
application_record.isnull().sum()/application_record.shape[0]

* 30% of values in OCCUPATION_TYPE are missing

In [None]:
application_record['OCCUPATION_TYPE'].value_counts()

In [None]:
application_record['OCCUPATION_TYPE'].value_counts().sort_values().plot(kind='barh', figsize=(9,12), alpha=0.7)

* OCCUPATION_TYPE is a categorical column with many distinct values, so there is no meaningful median-style value to fill the missing entries with. Since about 30% of its values are missing, we drop the column.

In [None]:
application_record.drop('OCCUPATION_TYPE', axis=1, inplace=True)

In [None]:
application_record.head().T

# Decide the cut-off criterion in the credit_record table for treating customers as defaulters or non-defaulters

In [None]:
credit_record.head()

In [None]:
# fraction of customer-month rows whose STATUS is 'X', i.e. the customer had no loan that month (20% approx)
int((credit_record['STATUS'] == 'X').sum()) / len(credit_record)

In [None]:
# fraction of customer-month rows whose STATUS is 'C', i.e. the customer had paid off that month (42% approx)
int((credit_record['STATUS'] == 'C').sum()) / len(credit_record)

* So we are left with approx 40% of customer-months in which the customer's STATUS code is 0 to 5. That looks like fairly balanced data.
* It is balanced only if all codes 0 to 5 are treated as belonging to one class (default). If you select just codes 3 to 5, for example, the dataset becomes highly imbalanced.

In [None]:
credit_record['STATUS'].value_counts()/credit_record.shape[0]*100

In [None]:
# code for all defaulters in credit_record. lets say that those who defaulted on their installment by more than 0 days is a defaulter
defaulter_codes= ['0','1','2','3','4','5']

In [None]:
# Row-level labelling: Label is 1 when the row's STATUS is one of defaulter_codes, otherwise 0
credit_record['Label'] = credit_record['STATUS'].isin(defaulter_codes).astype(int)

In [None]:
credit_record.head(10)

In [None]:
credit_record.drop('STATUS', axis=1, inplace=True)
# since we do not need this column now

In [None]:
credit_record.head()

In [None]:
credit_record['Label'].value_counts()

# Join credit_record with application_record. Name the new table record

In [None]:
print('application_record', application_record.shape)
print('credit_record',credit_record.shape)

In [None]:
# Left join on ID: every credit_record row is kept; rows whose ID has no match in
# application_record come back with NaN in the application columns.
record = credit_record.merge(application_record, how='left', on='ID')
record.head()

In [None]:
record.shape

In [None]:
record.sample(5).T

In [None]:
record.isnull().sum()
# We need to drop rows where columns have null values

In [None]:
record.dropna(inplace=True)

In [None]:
record.head()

In [None]:
record.shape

In [None]:
record['Label'].value_counts()

* So this is how our dataset is labelled: about 0.3 million rows with label 1 (default status) and about 0.47 million rows with label 0. That is roughly a 60:40 split between non-defaulters and defaulters.

# VISUALISATION

In [None]:
chart=sns.countplot(x='Label', data = record, palette = 'hls')

In [None]:
pd.crosstab(record.CODE_GENDER,record.Label).plot(kind='bar')
plt.title('gender vs Label')
plt.xlabel('gender')
plt.ylabel('Number of Defaulters')

In [None]:
pd.crosstab(record.FLAG_OWN_CAR,record.Label).plot(kind='bar')
plt.title('car ownership vs Label')
plt.xlabel('car owner')
plt.ylabel('Number of Defaulters')

In [None]:
# Grouped bar chart: number of rows per Label value for each FLAG_OWN_REALTY category
pd.crosstab(record.FLAG_OWN_REALTY,record.Label).plot(kind='bar')
plt.title('property ownership vs Label')
# axis label previously read 'property ownder owner' (typo)
plt.xlabel('property owner')
plt.ylabel('Number of Defaulters')

In [None]:
pd.crosstab(record.CNT_CHILDREN, record.Label).plot(kind='bar', figsize=(12,6))
plt.title('#children Vs Label')
plt.xlabel('number of children')
plt.ylabel('Number of Defaulters')

In [None]:
pd.crosstab(record.NAME_INCOME_TYPE, record.Label).plot(kind='bar', figsize=(12,6))
plt.title('Label by income type')
plt.xlabel('income type')
plt.ylabel('number of defaulters')

In [None]:
pd.crosstab(record.NAME_INCOME_TYPE, record.Label)

In [None]:
pd.crosstab(record.NAME_EDUCATION_TYPE, record.Label).plot(kind='barh', figsize=(9,9))
plt.title('education type vs label')

In [None]:
record.columns

In [None]:
pd.crosstab(record.NAME_FAMILY_STATUS, record.Label).plot(kind='bar', figsize=(9,6))
plt.title('FAMILY_STATUS versus Label')

In [None]:
pd.crosstab(record.NAME_HOUSING_TYPE, record.Label).plot(kind='bar', figsize=(9,6))
plt.title('HOUSING_TYPE versus Label')

In [None]:
pd.crosstab(record.CNT_FAM_MEMBERS,record.Label).plot(kind='bar', figsize=(9,6))

plt.title('Label by number of family')
plt.xlabel('number of family members')
plt.ylabel('number of defaulters')

In [None]:
plt.figure(figsize=(10,6))

# Overlay one AMT_INCOME_TOTAL histogram per Label value on the same axes.
# Label=1 is drawn first and Label=0 on top of it, with a lighter alpha so both stay visible.
for label_value, colour, opacity in ((1, 'blue', 0.7), (0, 'green', 0.3)):
    record.loc[record['Label'] == label_value, 'AMT_INCOME_TOTAL'].hist(
        bins=15, color=colour, alpha=opacity, label='Label={}'.format(label_value))

plt.title('defaulters across various income groups')
plt.legend()
plt.xlabel('AMT_INCOME')
plt.ylabel('Number of Defaulters')

In [None]:
plt.figure(figsize=(12,6))

record[record['Label'] == 1]['DAYS_BIRTH'].hist(alpha=0.7,color='blue', bins=5,label='Label=1')
record[record['Label'] == 0]['DAYS_BIRTH'].hist(alpha=0.3,color='green', bins=5,label='Label=0')

plt.title('days since birth versus number of defaulters')
plt.legend()
plt.xlabel('days since birth')
plt.ylabel('number of defaulters')

In [None]:
plt.figure(figsize=(12,6))

record[record['Label']==1]['DAYS_EMPLOYED'].hist(bins=5, alpha=0.7, color='blue', label='Label=1')
record[record['Label']==0]['DAYS_EMPLOYED'].hist(bins=5, alpha=0.2, color='green', label='Label=0')

plt.legend()
plt.title('days employed versus label')
plt.xlabel('days of employment')
plt.ylabel('number of defaulters')


In [None]:
# Rewrite ID as the string "<ID>__<MONTHS_BALANCE>" so the key identifies a customer-month row
# (presumably unique per row; not checked here).
# NOTE(review): from this point on ID is a string column and no longer identifies a customer on
# its own; the original customer ID is the part before '__'.
record['ID'] = record['ID'].astype(str) + '__' + record['MONTHS_BALANCE'].astype(str)

In [None]:
record.head()

# Some attribute construction 
* Adjust DAYS_BIRTH and DAYS_EMPLOYED. We want to know the age and the length of employment AT THE TIME the credit status for a given month was assigned.
* Add income per family member.

In [None]:
# Shift the day counts from the reference date to the month each row refers to.
# MONTHS_BALANCE * 30 approximates that month offset in days (30 days per month).
days_offset = record['MONTHS_BALANCE'] * 30
record['DAYS_BIRTH'] = record['DAYS_BIRTH'] - days_offset

# Only shift real employment durations. Per the dataset description a positive DAYS_EMPLOYED
# marks a person who is not currently employed (a placeholder, not a day count) -- TODO confirm
# against the data. Shifting it would turn one "not employed" marker into many different values.
not_employed = record['DAYS_EMPLOYED'] > 0
record['DAYS_EMPLOYED'] = record['DAYS_EMPLOYED'].where(not_employed, record['DAYS_EMPLOYED'] - days_offset)

In [None]:
record.head()

In [None]:
plt.figure(figsize=(12,6))

record[record['Label'] == 1]['DAYS_BIRTH'].hist(alpha=0.7,color='blue', bins=5,label='Label=1')
record[record['Label'] == 0]['DAYS_BIRTH'].hist(alpha=0.3,color='green', bins=5,label='Label=0')

plt.title('days since birth versus number of defaulters')
plt.legend()
plt.xlabel('days since birth')
plt.ylabel('number of defaulters')

In [None]:
record['INCOME_PER_MEMBER'] = record['AMT_INCOME_TOTAL']/record['CNT_FAM_MEMBERS']

In [None]:
plt.figure(figsize=(12,6))

record[record['Label'] == 1]['INCOME_PER_MEMBER'].hist(alpha=0.7,color='blue', bins=15,label='Label=1')
record[record['Label'] == 0]['INCOME_PER_MEMBER'].hist(alpha=0.3,color='green', bins=15,label='Label=0')

plt.title('income per member versus number of defaulters')
plt.legend()
plt.xlabel('income per member')
plt.ylabel('number of defaulters')

# Dummify categorical data

In [None]:
record.columns

In [None]:
# Columns to one-hot encode. drop_first=False keeps one indicator column per category level.
categorical_cols = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
]
record = pd.get_dummies(record, columns=categorical_cols, drop_first=False)

In [None]:
record.head().T   

* The MONTHS_BALANCE column should be dropped because it gives a false ordinal signal. One issue if we do that: there could be multiple labels for the same example. We have resolved that by re-working employment and age to the time the credit status was assigned, so the MONTHS_BALANCE column can now be dropped.

In [None]:
record.drop('MONTHS_BALANCE', axis=1, inplace=True)

In [None]:
record.head().T

# Scaling

In [None]:
record_before_scaling = record.copy()

In [None]:
scalable_cols = ['AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'INCOME_PER_MEMBER']

In [None]:
from sklearn import preprocessing

In [None]:
#create RobustScaler object
scaler = preprocessing.RobustScaler()

In [None]:
# Scale the numeric columns in place, reusing scalable_cols so this cell and the
# before/after plots below always refer to the same set of columns.
# NOTE(review): the scaler is fitted on the full table, i.e. before the train/test split,
# so test rows influence the scaling statistics.
record[scalable_cols] = scaler.fit_transform(record[scalable_cols])

In [None]:
# For each scaled column, draw its distribution before (left) and after (right) scaling.
# sns.distplot is deprecated; sns.histplot with kde=True and stat='density' is the documented
# replacement for a histogram + KDE overlay. bins=50 mirrors distplot's upper limit on bin count.
for variable in scalable_cols:
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(9, 5))

    ax1.set_title('BEFORE Scaling')
    sns.histplot(record_before_scaling[variable], bins=50, kde=True, stat='density', ax=ax1)

    ax2.set_title('AFTER Scaling')
    sns.histplot(record[variable], bins=50, kde=True, stat='density', ax=ax2, color='b')

    plt.show()

In [None]:
record.head().T

In [None]:
record.isnull().sum()

In [None]:
# Per-Label mean of every feature. ID was rewritten earlier as an "<ID>__<MONTHS_BALANCE>" string,
# so it is dropped first: recent pandas versions raise when asked for the mean of a string column
# instead of silently skipping it.
record.drop(columns=['ID']).groupby('Label').mean().transpose()

In [None]:
record.columns

In [None]:
sns.pairplot(record[['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS', 'INCOME_PER_MEMBER','Label']])

In [None]:
#Using Pearson Correlation
plt.figure(figsize=(12,10))

corr = record[['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS', 'INCOME_PER_MEMBER','Label']].corr()
sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns,
        annot=True, cmap=plt.cm.Reds)

In [None]:
# Pearson correlation of every feature with Label, strongest positive first.
# The string ID column is dropped first: recent pandas versions raise on non-numeric columns
# in corr() instead of silently skipping them.
record.drop(columns=['ID']).corr()['Label'].sort_values(ascending=False)

In [None]:
# Horizontal bar chart of each feature's Pearson correlation with Label (ascending).
# The string ID column is dropped first: recent pandas versions raise on non-numeric columns
# in corr() instead of silently skipping them.
record.drop(columns=['ID']).corr()['Label'].sort_values(ascending=True).plot(kind='barh', figsize=(9,12))

## variable selection not done yet

# Model

In [None]:
record.columns

In [None]:
# Model input columns: the numeric/flag columns plus the indicator columns produced by
# pd.get_dummies above. ID and Label are deliberately not listed.
# NOTE(review): the names are hard-coded, presumably copied from the record.columns output;
# a category level missing from the data would make train[Features] raise a KeyError.
Features = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE',
       'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'INCOME_PER_MEMBER',
       'CODE_GENDER_F', 'CODE_GENDER_M', 'FLAG_OWN_CAR_N', 'FLAG_OWN_CAR_Y',
       'FLAG_OWN_REALTY_N', 'FLAG_OWN_REALTY_Y',
       'NAME_INCOME_TYPE_Commercial associate', 'NAME_INCOME_TYPE_Pensioner',
       'NAME_INCOME_TYPE_State servant', 'NAME_INCOME_TYPE_Student',
       'NAME_INCOME_TYPE_Working', 'NAME_EDUCATION_TYPE_Academic degree',
       'NAME_EDUCATION_TYPE_Higher education',
       'NAME_EDUCATION_TYPE_Incomplete higher',
       'NAME_EDUCATION_TYPE_Lower secondary',
       'NAME_EDUCATION_TYPE_Secondary / secondary special',
       'NAME_FAMILY_STATUS_Civil marriage', 'NAME_FAMILY_STATUS_Married',
       'NAME_FAMILY_STATUS_Separated',
       'NAME_FAMILY_STATUS_Single / not married', 'NAME_FAMILY_STATUS_Widow',
       'NAME_HOUSING_TYPE_Co-op apartment',
       'NAME_HOUSING_TYPE_House / apartment',
       'NAME_HOUSING_TYPE_Municipal apartment',
       'NAME_HOUSING_TYPE_Office apartment',
       'NAME_HOUSING_TYPE_Rented apartment', 'NAME_HOUSING_TYPE_With parents']

### using about 100k examples from the dataset

In [None]:
record['Label'].value_counts()

In [None]:
# Work on a random subsample of at most 100000 rows.
# random_state makes the subsample (and every metric computed from it) reproducible across runs;
# the min() keeps sample() from raising when the table has fewer than 100000 rows.
record_sample = record.sample(min(100000, len(record)), random_state=21)

In [None]:
record_sample['Label'].value_counts()

# Train Test split

In [None]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split

# Split by customer, not by row. Each customer contributes many monthly rows with almost identical
# features, so a plain row-level split puts rows of the same customer in both train and test and
# inflates the test score. The customer ID is the part of the composite "<ID>__<MONTHS_BALANCE>"
# key before '__'.
customer_ids = record_sample['ID'].astype(str).str.split('__').str[0]

# test_size is the share of customers (groups) held out, so the share of rows is only roughly 30%.
splitter = model_selection.GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=21)
train_idx, test_idx = next(splitter.split(record_sample, groups=customer_ids))

train = record_sample.iloc[train_idx].reset_index(drop=True)
test = record_sample.iloc[test_idx].reset_index(drop=True)

features_train = train[Features]
label_train = train['Label']
features_test = test[Features]
label_test = test['Label']

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded like the DecisionTreeClassifier(random_state=0) cell so repeated runs give the same forest
# and therefore the same reported scores.
clf = RandomForestClassifier(random_state=0)

clf.fit(features_train,label_train)

pred_train = clf.predict(features_train)
pred_test = clf.predict(features_test)

from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred); the value is the same either way, but the documented
# order keeps this cell consistent with the metric cells below.
accuracy_train = accuracy_score(label_train, pred_train)
accuracy_test = accuracy_score(label_test, pred_test)


# train accuracy, then test accuracy
print("{:.2f}".format(accuracy_train),"{:.2f}".format(accuracy_test))

In [None]:
pd.crosstab(label_test,pd.Series(pred_test),rownames=['ACTUAL'],colnames=['PRED'])

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Each metric is printed as a heading line, the value, then a blank line
# (end=' \n\n' reproduces the trailing space + empty line of print(value, '\n')).
print('Accuracy Score', accuracy_score(label_test, pred_test), sep='\n', end=' \n\n')

# average=None returns one precision value per class instead of a single aggregate
print('Precision Score', precision_score(label_test, pred_test, average=None), sep='\n', end=' \n\n')

print('Confusion Matrix')
array = confusion_matrix(label_test, pred_test)
columns = ['Non Defaulter', 'Defaulter']
# rows are the actual classes, columns the predicted classes
print(pd.DataFrame(array, index=columns, columns=columns), end=' \n\n')

print('Classification Report', classification_report(label_test, pred_test), sep='\n', end=' \n\n')

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree on the training features; fit() returns the fitted estimator itself.
tree = DecisionTreeClassifier(random_state=0).fit(features_train, label_train)

# Predictions for the train set and the test set, in that order
pred_train, pred_test = (tree.predict(part) for part in (features_train, features_test))

from sklearn.metrics import accuracy_score
accuracy_train = accuracy_score(label_train, pred_train)
accuracy_test = accuracy_score(label_test, pred_test)

# train accuracy, then test accuracy, both to two decimals
print("{:.2f} {:.2f}".format(accuracy_train, accuracy_test))

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Same report layout as for the other model: heading line, value, blank line.
# end=' \n\n' reproduces the trailing space + empty line of print(value, '\n').
print('Accuracy Score', accuracy_score(label_test, pred_test), sep='\n', end=' \n\n')

# one precision value per class (average=None)
print('Precision Score', precision_score(label_test, pred_test, average=None), sep='\n', end=' \n\n')

print('Confusion Matrix')
array = confusion_matrix(label_test, pred_test)
columns = ['Non Defaulter', 'Defaulter']
# rows are the actual classes, columns the predicted classes
print(pd.DataFrame(array, index=columns, columns=columns), end=' \n\n')

print('Classification Report', classification_report(label_test, pred_test), sep='\n', end=' \n\n')

In [None]:
# One row per training feature with the fitted tree's importance for it, sorted ascending
# so the most important feature ends up at the top of the horizontal bar chart.
dfz = pd.DataFrame({
    'features': features_train.columns,
    'importances': tree.feature_importances_,
}).sort_values('importances')
dfz.plot(x='features', y='importances', kind='barh', color='brown', figsize=(12,22),
         title = 'Decision Tree Classifier\nFeature Importances')