# E402 Final Project

This Machine Learning model takes borrower characteristics into account on a credit application and predicts whether or not the applicant will default on their loan. The model is trained/tested on a dataset of 307k+ instances, using features including but not limited to: Gender, Income, Realty Status, and more.

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.72 µs


In [58]:
df_test = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_test.csv')
df_test = df_test[['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']]
df_train = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_train.csv')


mapping = {'Y': 1, 'N': 0, 'M': 1, 'F': 0}
cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
df_test[cols] = df_test[cols].applymap(mapping.get)
df_test = df_test[['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']]

boolean = df_test.isna()
true_rows = boolean[boolean.any(axis=1)]
print(df_test)

       CODE_GENDER  FLAG_OWN_CAR  FLAG_OWN_REALTY  CNT_CHILDREN  \
0                0             0                1             0   
1                1             0                1             0   
2                1             1                1             0   
3                0             0                1             2   
4                1             1                0             1   
...            ...           ...              ...           ...   
48739            0             0                1             0   
48740            0             0                0             2   
48741            0             1                1             1   
48742            1             0                0             0   
48743            0             1                0             0   

       AMT_INCOME_TOTAL  AMT_CREDIT  
0              135000.0    568800.0  
1               99000.0    222768.0  
2              202500.0    663264.0  
3              315000.0   1575000.0  
4    

In [60]:
class CreditDefault():
    """
    This class takes 5 or 6 features from a loan applicant and uses a machine learning model trained on close to 
    307,000 entries. This model will make a prediction of whether or not the credit applicant is likely to repay
    their loans.
    """
    
    def __init__(self):
        
        # Load in training dataset from Home Credit Group
        self.df_train = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_train.csv')
        
        # Convert all binary values to 1 and 0 
        mapping = {'Y': 1, 'N': 0, 'M': 1, 'F': 0}
        cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
        self.df_train[cols] = self.df_train[cols].applymap(mapping.get)

        # Label for targeting
        self.label = np.where(self.df_train['TARGET'] == 1, 'PAYMENT TROUBLES', 'PAID OFF')
        
        # Feature Selection
        self.train_features = self.df_train[['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']]
        # -- Fill NA values in gender with median value [0] -- #
        self.train_features.loc[[35657, 38566, 83382, 189640], 'CODE_GENDER'] = 0
        
        # Scaling features to Mean = 0 and Std. Dev = 1
        scaler = StandardScaler()
        self.train_features = scaler.fit_transform(self.train_features)
        
        # Create a RandomForestClassifier to fit on the training data.
        self.model = RandomForestClassifier(n_estimators=10, random_state=40)
        self.model.fit(self.train_features, self.label)
        
    def predict(self, X_test):
        """
        Predicts the loan repayment status of a set of loan applicants.

        Args:
            X_test (pandas.DataFrame): A DataFrame containing the loan applicant information to be predicted.
        
        Returns:
            numpy.ndarray: An array of binary predictions for each loan applicant in X_test.
        """
        
        # Feature Selection
        self.test_features = X_test[['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']]

        # Scaling features to Mean = 0 and Std. Dev = 1
        scaler = StandardScaler()
        self.test_features = scaler.fit_transform(self.test_features)
        
        # Make predictions on test set
        self.predictions = self.model.predict(self.test_features)
        
        # Return predictions
        return self.predictions

%time

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 8.11 µs


In [61]:
obj = CreditDefault()
predictions = obj.predict(df_test)
print(predictions)
%time

['PAID OFF' 'PAID OFF' 'PAID OFF' ... 'PAID OFF' 'PAID OFF' 'PAID OFF']
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.96 µs


In [62]:
type(predictions)

numpy.ndarray

In [64]:
# 'predictions' is a numpy array: Change to pandas DataFrame
df_predictions = pd.DataFrame({'prediction': predictions})

# Keep the rows where prediction is 1:
df_predictions_payment_troubles = df_predictions[df_predictions['prediction'] == 'PAYMENT TROUBLES']

df_predictions_payment_troubles

Unnamed: 0,prediction
17,PAYMENT TROUBLES
111,PAYMENT TROUBLES
175,PAYMENT TROUBLES
185,PAYMENT TROUBLES
199,PAYMENT TROUBLES
...,...
48432,PAYMENT TROUBLES
48600,PAYMENT TROUBLES
48636,PAYMENT TROUBLES
48695,PAYMENT TROUBLES
