# E402 Final Project

This code implements a machine learning model that predicts loan default risk using historical data from a financial institution. The model is a random forest classifier, preceded by a number of data preprocessing and feature selection steps.

The code begins by loading the training dataset, which includes information on applicants' personal and financial characteristics. The data is then preprocessed and cleaned: binary categorical variables are encoded as 0/1, missing values are imputed, and the features are scaled to zero mean and unit standard deviation. The resulting dataset is then split into training and validation sets, and a random forest classifier is fit on the training set. Once the model is trained, it is used to make predictions on the validation set, and various performance metrics are calculated, including accuracy, precision, recall, and F1 score. Finally, the model is used to predict loan defaults (as binary 0/1 labels) on a separate test dataset, and the results are returned as a pandas DataFrame. Overall, this code aims to provide a robust and effective solution for predicting loan default risk using machine learning techniques.

In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [None]:
class CreditDefaultRF():
    """
    Random Forest classifier for loan default risk on the Home Credit Group data.

    Upon initialization, this class trains a Random Forest Classifier on a dataset of 200,000+ loan applicants from
    Home Credit Group. The model will train on 6 features: Gender, Car Ownership, Realty Status, Number of Children,
    Income and Amount of Credit.

    NOTE: Before using this class, you must download the dataset from the Home Credit Default Risk Kaggle competition
    (https://www.kaggle.com/c/home-credit-default-risk/data) and place the 'application_train.csv' file in a local
    directory. Pass the path to that file as ``train_path``; when omitted, ``DEFAULT_TRAIN_PATH`` is used.

    Attributes set by ``__init__``:
        df_train: the training data as loaded, with the binary columns encoded as 1/0.
        label: numpy array, 1 where TARGET == 1 and 0 otherwise.
        scaler: the StandardScaler fitted on the training features (reused by ``predict``).
        train_features: the scaled training feature matrix.
        model: the fitted RandomForestClassifier.
        y_pred, accuracy, precision, recall, f1, conf_matrix: validation-set predictions and scores.
    """

    # Location used when no ``train_path`` is given (the original hard-coded path).
    DEFAULT_TRAIN_PATH = '/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_train.csv'

    # The six model features, in the column order the scaler and model are fitted on.
    FEATURE_COLS = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT']

    # Columns holding Y/N or M/F flags, and the encoding applied to them.
    BINARY_COLS = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
    BINARY_MAPPING = {'Y': 1, 'N': 0, 'M': 1, 'F': 0}

    # Value substituted for a gender that is missing or not in BINARY_MAPPING.
    GENDER_FILL_VALUE = 0

    def __init__(self, train_path=None):
        """
        Load the training data, fit the scaler and the model, and score the model on a validation split.

        Args:
            train_path: path to 'application_train.csv'. Defaults to ``DEFAULT_TRAIN_PATH``.
        """
        if train_path is None:
            train_path = self.DEFAULT_TRAIN_PATH

        # -- LOAD IN TRAINING DATASET FROM HOME CREDIT GROUP -- #
        self.df_train = pd.read_csv(train_path)

        # Label for targeting
        self.label = np.where(self.df_train['TARGET'] == 1, 1, 0)

        # Feature selection, binary encoding and gender imputation.
        # Must run before df_train is encoded in place below: encoding twice would turn every 1/0 into NaN.
        features = self._prepare_features(self.df_train)

        # Keep df_train's binary columns encoded as 1/0, as they were after construction before.
        self._encode_binary(self.df_train)

        # Scaling features to Mean = 0 and Std. Dev = 1.
        # The fitted scaler is kept so predict() applies exactly the same transformation.
        self.scaler = StandardScaler()
        self.train_features = self.scaler.fit_transform(features)

        # Create a RandomForestClassifier to fit on the training data.
        self.model = RandomForestClassifier(n_estimators=10, random_state=40)

        # Split the data into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            self.train_features, self.label, test_size=0.3, random_state=42)

        # Train the model on the training set
        self.model.fit(X_train, y_train)

        # Make predictions on the validation set
        self.y_pred = self.model.predict(X_val)

        # Evaluate the performance of the model on the validation set
        self.accuracy = accuracy_score(y_val, self.y_pred)
        self.precision = precision_score(y_val, self.y_pred)
        self.recall = recall_score(y_val, self.y_pred)
        self.f1 = f1_score(y_val, self.y_pred)
        self.conf_matrix = confusion_matrix(y_val, self.y_pred)

    @classmethod
    def _encode_binary(cls, frame):
        """Encode the Y/N and M/F columns of `frame` as 1/0 in place and return `frame`."""
        for col in cls.BINARY_COLS:
            # Values absent from the mapping (e.g. an unknown gender code) become NaN.
            frame[col] = frame[col].map(cls.BINARY_MAPPING)
        return frame

    @classmethod
    def _prepare_features(cls, df):
        """Return a new frame holding the encoded model features; `df` itself is not modified."""
        # Copy first so the caller's frame is never written to.
        features = cls._encode_binary(df[cls.FEATURE_COLS].copy())
        # Impute missing/unrecognised gender by value rather than by hard-coded row labels.
        features['CODE_GENDER'] = features['CODE_GENDER'].fillna(cls.GENDER_FILL_VALUE)
        return features

    def metrics(self):
        """
        Returns a dataframe with the model evaluation metrics.

        Returns:
            pandas.DataFrame: indexed by 'Metric' (accuracy, precision, recall, f1) with a single 'Value' column.
        """
        metrics_dict = {
            'accuracy': self.accuracy,
            'precision': self.precision,
            'recall': self.recall,
            'f1': self.f1,
        }

        # One row per metric, metric name as the index
        metrics_df = pd.DataFrame.from_dict(metrics_dict, orient='index', columns=['Value'])
        metrics_df.index.name = 'Metric'

        return metrics_df

    def predict(self, df_test):
        """
        Predicts the loan repayment status of a set of loan applicants.

        Args:
            df_test (pandas.DataFrame): A DataFrame containing the loan applicant information to be predicted.
                It must contain the six feature columns with the raw Y/N and M/F codes; it is not modified.

        Returns:
            pandas.DataFrame: A DataFrame with one 'Prediction' column of binary predictions, one row per
            applicant in df_test, in the same order (the index is a fresh 0..n-1 range).
        """
        # Same selection, encoding and imputation as used for training
        test_features = self._prepare_features(df_test)

        # Apply the scaling learned from the training data; fitting a new scaler here would
        # put the test features on a different scale from the one the model was trained on.
        test_features = self.scaler.transform(test_features)

        # Make predictions on test set
        predictions = self.model.predict(test_features)
        return pd.DataFrame(predictions, columns=['Prediction'])

    def defaults(self, pred_df):
        """
        Select the loans that are predicted to default.

        Parameters:
        -----------
        pred_df : pandas.DataFrame
            A DataFrame containing loan predictions (obtained by calling 'predict' method), where each row
            corresponds to a loan and has a 'Prediction' column with values of 0 or 1, indicating whether
            the loan is predicted to default or not.

        Returns:
        --------
        pandas.DataFrame
            A new DataFrame containing only the rows from `pred_df` where 'Prediction' == 1, indicating that the
            loan is predicted to default. The original row index is preserved.

        Raises:
        -------
        ValueError
            If `pred_df` has no 'Prediction' column.
        """

        # Check if the input dataframe contains the Prediction column
        if 'Prediction' not in pred_df.columns:
            raise ValueError("Input dataframe does not contain the Prediction column")

        # Keep only the loans that are predicted to default
        return pred_df[pred_df['Prediction'] == 1]
    
           
    
# NOTE(review): this bare `%time` line magic has no statement attached, so it does not
# time the class definition above -- presumably `%%time` at the top of the cell (or
# `%time <statement>`) was intended; confirm.
%time

In [None]:
# Instantiate: the constructor reads the training CSV, fits the random forest and
# computes the validation metrics, so this call does all of the training work.
obj = CreditDefaultRF()
# Display the scaled training feature matrix stored on the instance.
obj.train_features

In [None]:
# Evaluate performance metrics: a DataFrame of the accuracy, precision, recall and F1
# scores that were computed on the validation split when `obj` was constructed.
metrics = obj.metrics()

In [None]:
# Predict
# The six columns the model was trained on, in the order the model expects them.
feature_cols = ['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT']
# `usecols` parses only the needed columns instead of loading the whole file and discarding the rest.
df_test = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_test.csv', usecols=feature_cols)
# `usecols` keeps the file's column order, so reorder explicitly; the `.copy()` makes df_test an
# independent frame, so code that assigns into it does not raise SettingWithCopyWarning.
df_test = df_test[feature_cols].copy()
predictions = obj.predict(df_test)

In [None]:
# Keep only the loans predicted to default (rows where Prediction == 1)
positives = obj.defaults(predictions)