In [1]:
import pandas as pd
import numpy as np
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
# Make numpy values easier to read: print floats with 3 digits of precision
# and suppress scientific notation for small values.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Read the training and test splits from CSV files given by relative path.
train_data, test_data = (
    pd.read_csv(csv_name, sep=',')
    for csv_name in ("credit_train.csv", "credit_test.csv")
)
# Preview the first rows of the test split.
test_data.head()

Unnamed: 0,Loan ID,Customer ID,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,f738779f-c726-40dc-92cf-689d73af533d,ded0b3c3-6bf4-4091-8726-47039f2c1b90,445,Short Term,628.0,12000.0,3 years,Rent,Debt Consolidation,0.0,3.0,9.0,1,0,445,1500,0.0,0.0
1,f738779f-c726-40dc-92cf-689d73af533d,ded0b3c3-6bf4-4091-8726-47039f2c1b90,611314,Short Term,747.0,2074116.0,10+ years,Home Mortgage,Debt Consolidation,42000.83,21.8,,9,0,621908,1058970,0.0,0.0
2,6dcc0947-164d-476c-a1de-3ae7283dde0a,1630e6e3-34e3-461a-8fda-09297d3140c8,266662,Short Term,734.0,1919190.0,10+ years,Home Mortgage,Debt Consolidation,36624.4,19.4,,11,0,679573,904442,0.0,0.0
3,f7744d01-894b-49c3-8777-fc6431a2cff1,2c60938b-ad2b-4702-804d-eeca43949c52,153494,Short Term,709.0,871112.0,2 years,Rent,Debt Consolidation,8391.73,12.5,10.0,10,0,38532,388036,0.0,0.0
4,83721ffb-b99a-4a0f-aea5-ef472a138b41,12116614-2f3c-4d16-ad34-d92883718806,176242,Short Term,727.0,780083.0,10+ years,Rent,Debt Consolidation,16771.87,16.5,27.0,16,1,156940,531322,1.0,0.0


In [3]:
# Drop the 'Loan ID' / 'Customer ID' identifier columns and
# 'Months since last delinquent' from both splits. 'Loan Status' is dropped
# from the training split only; the test split's drop list does not include it.
train_data = train_data.drop(
    columns=['Loan ID', 'Customer ID', 'Loan Status', 'Months since last delinquent'])
test_data = test_data.drop(
    columns=['Loan ID', 'Customer ID', 'Months since last delinquent'])

In [4]:

# Convert the 'Years in current job' text into a whole number of years.
def getYears(yrs):
    """Convert a job-tenure label into an integer number of years.

    Parameters
    ----------
    yrs : object
        A label such as ``"< 1 year"``, ``"3 years"`` or ``"10+ years"``.
        Values that are not ``str`` (for example a missing value) are
        accepted as well.

    Returns
    -------
    int
        ``0`` if the label starts with ``"<"``; ``10`` if it contains
        ``"10+"``; otherwise the first whitespace-separated token that
        consists only of digits. ``0`` is returned when no such token
        exists, when the string is empty, or when ``yrs`` is not a string.
    """
    if not isinstance(yrs, str):
        return 0
    # startswith() is safe on an empty string, whereas indexing yrs[0]
    # would raise IndexError.
    if yrs.startswith('<'):
        return 0
    if "10+" in yrs:
        return 10
    for token in yrs.split():
        if token.isdigit():
            return int(token)
    return 0
# --- training split ---------------------------------------------------------
# Turn the tenure text into integers first: getYears is applied to every
# entry of the column before rows with missing values are dropped.
train_data['Years in current job'] = train_data['Years in current job'].apply(getYears)
# Discard every row that still has a missing value in any column.
train_data = train_data.dropna()
# Cast the bankruptcy count to int and keep only rows whose value is below 2.
train_data['Bankruptcies'] = train_data['Bankruptcies'].apply(int)
train_data = train_data.loc[train_data['Bankruptcies'] < 2]

# --- test split: same steps, in the same order ------------------------------
test_data['Years in current job'] = test_data['Years in current job'].apply(getYears)
test_data = test_data.dropna()
test_data['Bankruptcies'] = test_data['Bankruptcies'].apply(int)
test_data = test_data.loc[test_data['Bankruptcies'] < 2]

# Per-class row counts of the filtered test split (not displayed here).
temp = test_data.groupby('Bankruptcies').count()
# NOTE(review): this selection runs after the `< 2` filter on an integer
# column, so it cannot contain any rows; it only confirms the filter worked.
temp1 = test_data.loc[test_data['Bankruptcies'] > 1]
temp1

Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens


In [5]:
# Separate the target column ('Bankruptcies') from the predictors for each
# split. Both results are new objects; train_data / test_data are not modified.
train_labels = train_data['Bankruptcies'].copy()
train_features = train_data.drop(columns='Bankruptcies')
test_labels = test_data['Bankruptcies'].copy()
test_features = test_data.drop(columns='Bankruptcies')

In [6]:
# Build a Keras model that normalises the numeric columns and one-hot encodes
# the string columns, then run it over both splits to obtain NumPy matrices.
#
# Loop variables are deliberately not called `input`, so the builtin input()
# stays usable in later cells.

# One symbolic input per feature column: object-dtype columns are fed as
# strings, every other column as float32.
inputs = {}
for name, column in train_features.items():
    dtype = tf.string if column.dtype == object else tf.float32
    inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

# Numeric branch: concatenate all float inputs and standardise them with
# statistics learned from the training features only.
numeric_inputs = {name: tensor for name, tensor in inputs.items()
                  if tensor.dtype == tf.float32}

x = layers.Concatenate()(list(numeric_inputs.values()))
norm = preprocessing.Normalization()
# Index with an explicit list of column names rather than a dict_keys view.
norm.adapt(np.array(train_features[list(numeric_inputs)]))
all_numeric_inputs = norm(x)

preprocessed_inputs = [all_numeric_inputs]

# Categorical branch: each string column gets its own lookup table built from
# the values seen in the training features, followed by a one-hot encoding.
# NOTE(review): `vocab_size()` and `max_tokens=` match the experimental
# preprocessing API imported by this notebook; later TensorFlow releases are
# believed to have renamed them - confirm before upgrading.
for name, tensor in inputs.items():
    if tensor.dtype == tf.float32:
        continue
    lookup = preprocessing.StringLookup(vocabulary=np.unique(train_features[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
    x = lookup(tensor)
    x = one_hot(x)
    preprocessed_inputs.append(x)

preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
train_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

# The model takes a dict of column name -> NumPy array.
train_features_dict = {name: np.array(value) for name, value in train_features.items()}
test_features_dict = {name: np.array(value) for name, value in test_features.items()}
# Kept as a separate dict under its original name; it shares the same arrays.
features_dict = dict(train_features_dict)

# The same fitted preprocessing model is applied to both splits.
train_X = train_preprocessing(features_dict).numpy()
test_X = train_preprocessing(test_features_dict).numpy()

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 20-tree random forest on the preprocessed training matrix; the fixed
# random_state pins the seed passed to the estimator.
# NOTE(review): the target is filtered to values below 2 earlier and the
# predictions are rounded in the evaluation cell, so this regressor is being
# used as a classifier - consider whether RandomForestClassifier is intended.
regressor = RandomForestRegressor(random_state=0, n_estimators=20)
regressor.fit(X=train_X, y=train_labels)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [8]:
# Predict on the preprocessed test matrix, round the continuous outputs to
# the nearest integer label once, then print each metric in turn.
y_pred = regressor.predict(test_X)
y_pred_labels = y_pred.round()
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(test_labels, y_pred_labels))


[[7070   64]
 [  12  800]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      7134
           1       0.93      0.99      0.95       812

    accuracy                           0.99      7946
   macro avg       0.96      0.99      0.97      7946
weighted avg       0.99      0.99      0.99      7946

0.9904354392146992


In [9]:
# Show the first 20 raw (unrounded) predictions returned by the regressor.
y_pred[:20]

array([0.  , 0.  , 0.  , 0.  , 0.95, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [10]:
# Show the first 20 true 'Bankruptcies' labels of the test split, for
# comparison with the predictions.
test_labels[:20]

0     0
1     0
2     0
3     0
4     1
5     0
6     0
7     0
9     0
12    0
13    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
Name: Bankruptcies, dtype: int64

In [11]:
# Display the preprocessed test feature matrix produced by train_preprocessing.
test_X

array([[-0.416, -0.304, -1.263, ...,  0.   ,  0.   ,  0.   ],
       [-0.398, -0.223,  0.642, ...,  0.   ,  0.   ,  0.   ],
       [-0.408, -0.232,  0.499, ...,  0.   ,  0.   ,  0.   ],
       ...,
       [-0.412, -0.243, -0.569, ...,  0.   ,  0.   ,  0.   ],
       [ 2.459, -0.241, -0.376, ...,  0.   ,  0.   ,  0.   ],
       [ 2.459, -0.223, -0.276, ...,  0.   ,  0.   ,  0.   ]],
      dtype=float32)