# E402 Final Project

This Machine Learning model takes borrower characteristics into account on a credit application and predicts whether or not the applicant will default on their loan. The model is trained/tested on a dataset of 307k+ instances, using features including but not limited to: Gender, Income, Realty Status, and more.

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
%matplotlib inline
%pwd

'/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project'

In [6]:
df_test = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_test.csv')

df_train = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_train.csv')

df_train

mapping = {'Y': 1, 'N': 0, 'M': 1, 'F': 0}
cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
df_train[cols] = df_train[cols].applymap(mapping.get)
df_train


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,1.0,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,0.0,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,1.0,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,0.0,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,1.0,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,1.0,0,0,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,0.0,0,1,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,0.0,0,1,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,0.0,0,1,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
class CreditDefault():
    """
    This class takes 5 or 6 features from a loan applicant and uses a machine learning model trained on close to 
    307,000 entries. This model will make a prediction of whether or not the credit applicant is likely to repay
    their loans.
    """
    
    def __init__(self):
        
        # Load in test & training datasets from Home Credit Group
        self.df_test = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_test.csv')
        self.df_train = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_train.csv')
        
        # Convert all binary values to 1 and 0 
        mapping = {'Y': 1, 'N': 0, 'M': 1, 'F': 0}
        cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
        self.df_test[cols] = self.df_test[cols].applymap(mapping.get)
        self.df_train[cols] = self.df_train[cols].applymap(mapping.get)
        
        # Label for targeting
        self.label = np.where(self.df_train['TARGET'] == 1, 'PAYMENT TROUBLES', 'PAID OFF')
        
        # Feature Selection
        self.train_features = self.df_train[['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']]
        self.train_features.dropna(inplace=True)
        # Convert back to numpy array
        self.train_features = self.train_features.to_numpy()
        
        
        self.test_features = self.df_test[['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']]
        
        # Scaling features to Mean = 0 and Std. Dev = 1
        scaler = StandardScaler()
        self.train_features = scaler.fit_transform(self.train_features)
        self.test_features = scaler.transform(self.test_features)
        
        # Create a RandomForestClassifier to fit on the training data.
        self.model = RandomForestClassifier(n_estimators=1000, random_state=40)
        self.model.fit(self.train_features, self.label)
        self.predictions = self.model.predict(self.test_features)
        
    def show(self):
        """
        Returns arrays for feature values of all instances in both the training and test sets. 
        """
        return self.train_features, self.test_features
    
    


In [36]:
obj = CreditDefault()
train, test = obj.show()
print(train)
print(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train_features.dropna(inplace=True)


ValueError: Found input variables with inconsistent numbers of samples: [307507, 307511]

In [41]:
print(np.isnan(obj.train_features)[True])    # output: 1
print(np.isinf(obj.train_features).sum()) 
print(np.isnan(obj.test_features).sum())    # output: 1
print(np.isinf(obj.test_features).sum())

[[[False False False False False False]
  [False False False False False False]
  [False False False False False False]
  ...
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]]]
0
0
0
