In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Configuration: values that later cells read and that a reader may want to tune.

# Relative paths to the two input CSV files (application records and credit records).
app_path_str = "../input/credit-card-approval-prediction/application_record.csv"
credit_path_str = "../input/credit-card-approval-prediction/credit_record.csv"
# Number of trees in the main random forest (passed as n_estimators).
# A higher number gives a more thorough calculation, but takes longer to run.
number_of_trees = 200
# Name of the column the random forest is trained to predict.
target_column_name = 'high_risk'
# Maximum depth of the small tree that is drawn for illustration.
# Decision trees can be large; a depth of 3 or 4 keeps the rendered tree easy to see and interpret.
tree_depth = 3

In [None]:
# Load data: read the application records CSV (path set in app_path_str) into a DataFrame.
# The last line previews the first rows so the parsed columns can be inspected.
df_app = pd.read_csv(app_path_str)
df_app.head()

In [None]:
# Load data: read the credit records CSV (path set in credit_path_str) into a DataFrame.
# The last line displays (rows, columns) as a quick size check.

df_credit = pd.read_csv(credit_path_str)
df_credit.shape

In [None]:
# The STATUS codes 'X' and 'C' are folded into the 0 group (treated as 0-29 days past due)
# so that every delinquency status can later be handled as a number.
status_placeholders = {'X': 0, 'C': 0}
df_credit['STATUS'] = df_credit['STATUS'].replace(status_placeholders)

# Display (rows, columns) of the application table.
df_app.shape

In [None]:
# Convert STATUS to numeric and keep the maximum STATUS for each unique ID.
# This is used as a proxy for whether an applicant would be approved, since the data set
# has no yes/no approval flag.
# pd.to_numeric is applied to the whole column at once (vectorized) rather than
# element by element through Series.apply, which is much slower on a large table.
df_credit['STATUS'] = pd.to_numeric(df_credit['STATUS'])
# Select the highest status, i.e. the highest level of delinquency, for each customer ID.
df_credit = df_credit.groupby('ID')['STATUS'].max().reset_index()
# Export the per-customer table to a CSV file in the working directory.
df_credit.to_csv('df_credit.csv', index=False)

# Sanity check: after the aggregation each ID should appear exactly once (count of 1).
df_credit.groupby('ID')['STATUS'].count().reset_index()

In [None]:
# Attach each applicant's worst STATUS to the application data.
# Inner join on ID: only IDs present in both tables are kept.
df_consol = df_app.merge(df_credit, on='ID', how='inner')
df_consol.shape

In [None]:
# Convert status to a binary target: 0 when the worst STATUS is below 1, otherwise 1 (high risk).
df_consol['high_risk'] = np.where(df_consol['STATUS'] < 1, 0, 1)

# Convert day counts to whole years. The values are divided by -365, i.e. the day
# counts are expected to be negative (counted backwards), giving positive years.
df_consol['age_years'] = (df_consol['DAYS_BIRTH'] / -365).round().astype(int)

# Positive DAYS_EMPLOYED values would turn into large negative "years employed"
# (for example 365243 days becomes -1001 years), which distorts correlations and tree splits.
# NOTE(review): the dataset description says a positive value means the person is currently
# unemployed, so such rows are clipped to 0 years here -- confirm against the data source.
days_employed = df_consol['DAYS_EMPLOYED'].clip(upper=0)
df_consol['years_employed'] = (days_employed / -365).round().astype(int)


df_consol.head()

In [None]:
# One-hot encode the categorical columns.
# Maps each source column to the prefix used for its generated indicator columns.
dummy_prefixes = {
    'CODE_GENDER': "gender",
    'FLAG_OWN_CAR': "own_car",
    'FLAG_OWN_REALTY': 'own_property',
    'NAME_INCOME_TYPE': 'income_type',
    'NAME_EDUCATION_TYPE': 'education',
    'NAME_FAMILY_STATUS': 'family_status',
    'NAME_HOUSING_TYPE': 'housing_type',
    'OCCUPATION_TYPE': 'occupation_type',
}
df_formatted = pd.get_dummies(df_consol, columns=list(dummy_prefixes), prefix=dummy_prefixes)
# Display (rows, columns) of the encoded table.
df_formatted.shape

In [None]:
# Remove columns that are not used as model features: the identifier, the raw STATUS
# (replaced by high_risk), the raw day counts (replaced by the year columns) and the
# "_N" indicator columns for car and property ownership.
unused_columns = [
    'ID',
    'STATUS',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'own_car_N',
    'own_property_N',
]
df_formatted = df_formatted.drop(columns=unused_columns)

# Save the model-ready table to the working directory.
df_formatted.to_csv('df_formatted.csv', index=False)

In [None]:
# Use numpy to convert the prepared table into the arrays used for model training.
# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices,
# along with a large collection of high-level mathematical functions to operate on these arrays.
import numpy as np

# Assign target variable to a separate array
target = np.array(df_formatted[target_column_name])

# Remove target column from features
features = df_formatted.drop(columns=[target_column_name])

# Saving feature names for later use
feature_list = list(features.columns)

# Convert the features dataframe to a float array.
# The dtype is requested explicitly: with boolean indicator columns next to integer and
# float columns, the default conversion can produce an object-dtype array, which is slower
# to work with and hides non-numeric columns until model fitting. Asking for float fails
# right here if a non-numeric column slipped through.
features = features.to_numpy(dtype=float)

In [None]:
#  Use scikit-learn to split the data into training and testing sets.
#  Scikit-learn (formerly scikits.learn and also known as sklearn) is a free software machine learning library for the Python programming language.
#  It features various classification, regression and clustering algorithms including support vector machines, random forests,
#  gradient boosting, k-means and DBSCAN, and is designed to interoperate with the Python numerical and scientific libraries NumPy and SciPy.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the remaining 75% train the model.
# The fixed random_state makes the split repeatable.
train_features, test_features, train_target, test_target = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

# Report the shapes so it is easy to confirm that features and targets have matching
# row counts within the training set and within the testing set.
for label, array in (
    ('Training Features Shape:', train_features),
    ('Training target Shape:', train_target),
    ('Testing Features Shape:', test_features),
    ('Testing target Shape:', test_target),
):
    print(label, array.shape)

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model. n_estimators is the number of decision trees to build.
# random_state fixes the seed so the forest is reproducible; n_jobs=-1 builds the trees
# in parallel on all available cores, which shortens training without changing the result.
rf = RandomForestRegressor(n_estimators=number_of_trees, random_state=42, n_jobs=-1)

# Train the model on the training data
rf.fit(train_features, train_target)

In [None]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
from IPython.display import Image
# pydot may need to be installed. 
try:
    import pydot
except ImportError as e:
    !pip install pydot
    import pydot

In [None]:
# Fit a small forest whose trees are limited to tree_depth levels so one can be drawn legibly.
# random_state is fixed (as for the main model) so the same tree is drawn on every run.
rf_small = RandomForestRegressor(n_estimators=10, max_depth=tree_depth, random_state=42)
rf_small.fit(train_features, train_target)
# Extract one of the ten fitted trees to visualize
tree_small = rf_small.estimators_[5]
# Export the tree to Graphviz dot format, then render it to a png image
export_graphviz(tree_small, out_file='small_tree.dot', feature_names=feature_list, rounded=True, precision=1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')
# Show the png file that was just written (avoids rendering the graph a second time)
Image(filename='small_tree.png')

In [None]:
# Get numerical feature importances from the fitted forest
importances = list(rf.feature_importances_)
# Rank features by their unrounded importance, most important first, so that features
# whose importances round to the same value still appear in their true order
ranked = sorted(zip(feature_list, importances), key=lambda pair: pair[1], reverse=True)
# List of (variable, importance) tuples with the importance rounded for display
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]
# Print out the features and importances.
# A plain loop is used: a list comprehension of print() calls would also emit
# a list of None values as the cell output.
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))

In [None]:
# Columns to include in the correlation matrix: three numeric features plus the target.
correlation_columns = ['AMT_INCOME_TOTAL', 'age_years', 'years_employed', 'high_risk']
dfcorr = df_formatted[correlation_columns]

In [None]:
# import packages
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
# Create a 15 x 12 inch figure for the heatmap
f = plt.figure(figsize=(15, 12))

# Pairwise correlations of the selected columns, drawn as an annotated heatmap
# on a fixed -1..1 colour scale centred on 0
corr_matrix = dfcorr.corr()
heatmap_options = dict(
    annot=True,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='Blues',
    linewidths=1,
    linecolor='black',
)
sn.heatmap(corr_matrix, **heatmap_options)

# Enlarge the tick labels on both axes so they are easier to read
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.show()