# E402 Final Project

This Machine Learning model takes borrower characteristics into account on a credit application and predicts whether or not the applicant will default on their loan. The model is trained/tested on a dataset of 307k+ instances, using features including but not limited to: Amount of Credit, Income, Realty Status, and more.

In [56]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [57]:
class CreditDefaultRF():
    """
    Upon initialization, this class trains a Random Forest Classifier on a dataset of 200,000+ loan applicants from 
    Home Credit Group. The model will train on 6 features: Gender, Car Ownership, Realty Status, Number of Children,
    Income and Amount of Credit. 
    
     NOTE: Before using this class, you must download the dataset from the Home Credit Default Risk Kaggle competition
    (https://www.kaggle.com/c/home-credit-default-risk/data) and place the 'application_train.csv' file in a local
    directory. The path to this directory must be specified in the 'pd.read_csv()' call in the code below.
    """
    
    def __init__(self):
        
        # Load in training dataset from Home Credit Group
        self.df_train = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_train.csv')
        
        # Convert all binary values to 1 and 0 
        mapping = {'Y': 1, 'N': 0, 'M': 1, 'F': 0}
        cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
        self.df_train[cols] = self.df_train[cols].applymap(mapping.get)

        # Label for targeting
        self.label = np.where(self.df_train['TARGET'] == 1, 1, 0)
        
        # Feature Selection
        self.train_features = self.df_train[
            ['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']
        ]
        # -- Fill NA values in gender with median value [0] -- #
        self.train_features.loc[[35657, 38566, 83382, 189640], 'CODE_GENDER'] = 0
        
        # Scaling features to Mean = 0 and Std. Dev = 1
        scaler = StandardScaler()
        self.train_features = scaler.fit_transform(self.train_features)
        
         # Create a RandomForestClassifier to fit on the training data.
        self.model = RandomForestClassifier(n_estimators=10, random_state=40)
        
        # Split the data into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            self.train_features, self.label, test_size=0.3, random_state=42)

        # Train your model on the training set
        self.model.fit(X_train, y_train)

        # Make predictions on the validation set
        self.y_pred = self.model.predict(X_val)

        # Evaluate the performance of your model on the validation set
        self.accuracy = accuracy_score(y_val, self.y_pred)
        self.precision = precision_score(y_val, self.y_pred)
        self.recall = recall_score(y_val, self.y_pred)
        self.f1 = f1_score(y_val, self.y_pred)
        self.conf_matrix = confusion_matrix(y_val, self.y_pred)

        
    def predict(self, df_test):
        """
        Predicts the loan repayment status of a set of loan applicants.

        Args:
            X_test (pandas.DataFrame): A DataFrame containing the loan applicant information to be predicted.
        
        Returns:
            pandas.DataFrame: A DataFrame of binary predictions for each loan applicant in X_test.
        """
        mapping = {'Y': 1, 'N': 0, 'M': 1, 'F': 0}
        cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
        df_test[cols] = df_test[cols].applymap(mapping.get)
        df_test = df_test[['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']]

        # Feature Selection
        test_features = df_test[['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']]

        # Scaling features to Mean = 0 and Std. Dev = 1
        scaler = StandardScaler()
        test_features = scaler.fit_transform(test_features)
        
        # Make predictions on test set
        predictions = self.model.predict(test_features)
        pred_df = pd.DataFrame(predictions, columns=['Prediction'])
        # Return predictions
        return pred_df

%time

CPU times: user 5 µs, sys: 5 µs, total: 10 µs
Wall time: 8.11 µs


In [58]:
# Instantiate
obj = CreditDefaultRF()
#print(obj.accuracy)
%time

obj.train_features

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


array([[ 1.38817559, -0.71791354,  0.66453139, -0.57753784,  0.14212925,
        -0.47809496],
       [-0.72036996, -0.71791354, -1.5048198 , -0.57753784,  0.42679193,
         1.7254498 ],
       [ 1.38817559,  1.39292539,  0.66453139, -0.57753784, -0.4271961 ,
        -1.15288792],
       ...,
       [-0.72036996, -0.71791354,  0.66453139, -0.57753784, -0.06662338,
         0.19537871],
       [-0.72036996, -0.71791354,  0.66453139, -0.57753784,  0.00928667,
        -0.56875681],
       [-0.72036996, -0.71791354, -1.5048198 , -0.57753784, -0.04764587,
         0.18875991]])

In [59]:
# Predict
df_test = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_test.csv')
df_test = df_test[['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']]
predictions = obj.predict(df_test)

# Filter out bad loans
predictions = predictions[predictions['Prediction'] == 1]
print(predictions)

       Prediction
89              1
111             1
119             1
124             1
129             1
...           ...
48655           1
48665           1
48695           1
48707           1
48734           1

[1243 rows x 1 columns]
