In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import Markdown, display
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

In [10]:
# Load the training split and the held-out test split into separate frames.
# The original read the test frame from "credit_train" (no extension, and the
# training file's name), which either fails or silently re-reads training data.
# NOTE(review): "credit_test.csv" is assumed to be the companion test file
# sitting next to "credit_train.csv" -- confirm the actual file name.
# (`sep=','` is the pandas default, so it is not spelled out.)
train_data = pd.read_csv("credit_train.csv")
test_data = pd.read_csv("credit_test.csv")

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6,1,228190,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35,0,229976,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18,1,297996,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9,0,256329,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15,0,253460,427174.0,0.0,0.0


In [11]:
# Drop columns that are not used as model features:
#   - 'Loan ID' and 'Customer ID' are per-row identifiers,
#   - 'Loan Status' is removed from the training frame,
#   - 'Months since last delinquent' is missing in several of the rows shown
#     above (presumably dropped so the later dropna() keeps those rows --
#     TODO confirm).
# 'Loan Status' is also removed from the test frame when it is present, so both
# frames always end up with the same feature columns; errors='ignore' keeps the
# original behaviour when the test file does not carry that column.
train_data = train_data.drop(
    columns=['Loan ID', 'Customer ID', 'Loan Status', 'Months since last delinquent']
)
test_data = (
    test_data
    .drop(columns=['Loan ID', 'Customer ID', 'Months since last delinquent'])
    .drop(columns=['Loan Status'], errors='ignore')
)

In [12]:
# Show the first five rows to confirm the columns above were removed.
train_data.head(n=5)

Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,445412,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,6,1,228190,416746.0,1.0,0.0
1,262328,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,35,0,229976,850784.0,0.0,0.0
2,99999999,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,18,1,297996,750090.0,0.0,0.0
3,347666,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,9,0,256329,386958.0,0.0,0.0
4,176220,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,15,0,253460,427174.0,0.0,0.0


In [13]:

# Convert the textual 'Years in current job' labels into whole numbers.
def getYears(yrs):
    """Convert a 'Years in current job' label into a whole number of years.

    Parameters
    ----------
    yrs : object
        Raw cell value such as "< 1 year", "8 years" or "10+ years".
        Non-string values (e.g. NaN for a missing entry, or None) are accepted.

    Returns
    -------
    int
        * 0  when the label starts with "<" (less than one year),
        * 10 when the label contains "10+",
        * otherwise the first whitespace-separated token made only of digits,
        * 0  when no such token exists or `yrs` is not a string.
    """
    if not isinstance(yrs, str):
        return 0

    # Strip first so an empty string or a label with stray leading blanks is
    # handled; the original indexed yrs[0] and raised IndexError on "".
    label = yrs.strip()
    if label.startswith('<'):
        return 0
    if "10+" in label:
        return 10

    for token in label.split():
        if token.isdigit():
            return int(token)
    return 0
# Replace the textual tenure labels with their numeric value, element by
# element, in both the training and the test frame.
train_data['Years in current job'] = train_data['Years in current job'].map(getYears)
test_data['Years in current job'] = test_data['Years in current job'].map(getYears)

In [14]:
# Keep only fully populated rows: any row with at least one missing value is
# discarded from both frames.
train_data = train_data.dropna(axis=0, how='any')
test_data = test_data.dropna(axis=0, how='any')

In [15]:
# Separate the prediction target ('Bankruptcies') from the remaining feature
# columns. The cleaned frames themselves are left untouched.
train_labels = train_data['Bankruptcies'].copy()
train_features = train_data.drop(columns=['Bankruptcies'])

test_labels = test_data['Bankruptcies'].copy()
test_features = test_data.drop(columns=['Bankruptcies'])

In [27]:
# Build a Keras preprocessing model from the training features and use it to
# turn both the training and the test features into dense numeric matrices.
#
# Changes from the original cell:
#   * the unused `input = tf.keras.Input(...)` and the loop variables named
#     `input` are gone, so the built-in `input` is no longer shadowed;
#   * the test features go through the SAME fitted model as the training
#     features (the original called an undefined `test_preprocessing`);
#   * the feature dicts are built from the model's input names, so both frames
#     are fed exactly the columns the model was built for.

# One symbolic input per feature column: object (string) columns become
# tf.string inputs, every other column is fed as float32.
inputs = {}
for name, column in train_features.items():
    dtype = tf.string if column.dtype == object else tf.float32
    inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

# Numeric branch: concatenate all float inputs and normalise them with
# statistics adapted on the training data only.
numeric_inputs = {
    name: feature_input
    for name, feature_input in inputs.items()
    if feature_input.dtype == tf.float32
}

x = layers.Concatenate()(list(numeric_inputs.values()))
norm = preprocessing.Normalization()
# A plain list of column names is used to select the numeric columns
# (the original indexed the frame with a dict_keys view).
norm.adapt(np.array(train_features[list(numeric_inputs)]))
all_numeric_inputs = norm(x)

preprocessed_inputs = [all_numeric_inputs]

# Categorical branch: each string input is mapped to an integer index using the
# vocabulary seen in training, then encoded by CategoryEncoding.
for name, feature_input in inputs.items():
    if feature_input.dtype == tf.float32:
        continue
    lookup = preprocessing.StringLookup(vocabulary=np.unique(train_features[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
    x = lookup(feature_input)
    x = one_hot(x)
    preprocessed_inputs.append(x)

preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
train_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

# Column-name -> numpy-array dicts, keyed by the model's input names.
train_features_dict = {name: np.array(train_features[name]) for name in inputs}
test_features_dict = {name: np.array(test_features[name]) for name in inputs}

# Kept under its original name in case later cells refer to it.
features_dict = dict(train_features_dict)

train_X = train_preprocessing(features_dict).numpy()
# Reuse the model fitted on the training data so the test set gets the same
# normalisation statistics and category vocabularies.
test_X = train_preprocessing(test_features_dict).numpy()

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 20-tree random forest on the preprocessed training matrix; the fixed
# random_state makes the fit repeatable. fit() returns the estimator itself, so
# the trailing expression still displays the fitted model.
regressor = RandomForestRegressor(random_state=0, n_estimators=20)
regressor.fit(X=train_X, y=train_labels)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [None]:
# Evaluate the fitted forest on the held-out test set.
y_pred = regressor.predict(test_X)

# The regressor returns continuous estimates, while the classification metrics
# below require discrete labels, so predictions are rounded to the nearest
# whole number. The labels are stored as floats (0.0, 1.0, ...), so they are
# cast to int as well.
y_pred_labels = np.rint(y_pred).astype(int)
y_true_labels = np.asarray(test_labels).astype(int)

# All three metrics compare the TEST labels with the test predictions; the
# original confusion matrix used `train_labels`, whose length does not match
# the test predictions.
print(confusion_matrix(y_true_labels, y_pred_labels))
print(classification_report(y_true_labels, y_pred_labels))
print(accuracy_score(y_true_labels, y_pred_labels))