In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Get multiple outputs in the same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# To supress future warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth',2000)
# pd.options.display.float_format='{:.2f}'.format

### Feauture Encoder

In [4]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import numpy as np
import pandas as pd


class FeatureEncoder(BaseEstimator, TransformerMixin):
    """
    Encodes categorical columns using LabelEncoding.
    LabelEncoding is used for binary categorical columns
    """

    def __init__(self, cols=None, label_encoder_cols=None):
        """
        Parameters
        ----------
        cols : list of str
            Columns to encode.  Default is to one-hot/target/label encode all categorical columns in the DataFrame.
        """
        self.label_encoder_maps = None

        if isinstance(cols, str):
            self.cols = [cols]
        else:
            self.cols = cols

        if isinstance(label_encoder_cols, str):
            self.label_encoder_cols = [label_encoder_cols]
        else:
            self.label_encoder_cols = label_encoder_cols

        # self.reduce_df = reduce_df

    def fit(self, X, y):
        """Fit label/one-hot/target encoder to X and y

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """
        # Encode all categorical cols by default
        if self.cols is None:
            self.cols = [c for c in X if str(X[c].dtype) == 'object']

        # Check columns are in X
        for col in self.cols:
            if col not in X:
                raise ValueError('Column \'' + col + '\' not in X')

        # Separating out lcols, ohecols and tcols
        if self.label_encoder_cols is None:
            self.label_encoder_cols = [c for c in self.cols if X[c].nunique() <= 5]

        # Create Label Encoding mapping
        self.label_encoder_maps = dict()
        for col in self.label_encoder_cols:
            self.label_encoder_maps[col] = dict(zip(X[col].values, X[col].astype('category').cat.codes.values))

        # Return the fit object
        return self

    def transform(self, X, y=None):
        """Perform label/one-hot/target encoding transformation.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to label encode

        Returns
        -------
        pandas DataFrame
            Input DataFrame with transformed columns
        """
        # Xo = pd.DataFrame(X)
        Xo = X.copy()
        # Perform label encoding transformation
        for col, lmap in self.label_encoder_maps.items():
            # Map the column
            # print(col)
            # print(lmap)
            # print(type(Xo))
            Xo[col] = Xo[col].map(lmap)
            Xo[col].fillna(-1, inplace=True)  # Filling new values with -1

        # Return encoded DataFrame
        return Xo

    def fit_transform(self, X, y=None):
        """Fit and transform the data via label/one-hot/target encoding.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values (required!).

        Returns
        -------
        pandas DataFrame
            Input DataFrame with transformed columns
        """

        return self.fit(X, y).transform(X, y)


### Feature Scaling

In [5]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import numpy as np


class FeatureScaling(BaseEstimator, TransformerMixin):
    """
    A custom standard scaler class with the ability to apply scaling on selected columns
    """

    def __init__(self, scaling_cols=None):
        """
        Parameters
        ----------
        scaling_cols : list of str
            Columns on which to perform scaling and normalization. Default is to scale all numerical columns

        """
        self.scaling_maps = None
        self.scaling_cols = scaling_cols

    def fit(self, X, y=None):
        """
        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to scale
        """

        # Scaling all non-categorical columns if user doesn't provide the list of columns to scale
        if self.scaling_cols is None:
            self.scaling_cols = [c for c in X if
                                 ((str(X[c].dtype).find('float') != -1) or (str(X[c].dtype).find('int') != -1))]

        # Create mapping corresponding to scaling and normalization
        self.scaling_maps = dict()
        for col in self.scaling_cols:
            self.scaling_maps[col] = dict()
            self.scaling_maps[col]['mean'] = np.mean(X[col].values).round(2)
            self.scaling_maps[col]['std_dev'] = np.std(X[col].values).round(2)

        # Return fit object
        return self

    def transform(self, X):
        """
        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to scale
        """
        Xo = X.copy()

        # Map transformation to respective columns
        for col in self.scaling_cols:
            Xo[col] = (Xo[col] - self.scaling_maps[col]['mean']) / self.scaling_maps[col]['std_dev']

        # Return scaled and normalized DataFrame
        return Xo

    def fit_transform(self, X, y=None):
        """
        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to scale
        """
        # Fit and return transformed dataframe
        return self.fit(X).transform(X)

### Outlier Treatment

In [6]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import numpy as np
import pandas as pd

class OutlierTreatment(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.lower_limits = {}
        self.upper_limits = {}

    def fit(self, X, columns=None):
        if columns is None:
            return self  # No fitting is applied
        
        # getting the lower nd upper limit range for 
        # outlier treatment
        for col in columns:
            data = pd.Series(sorted(X[col]))
            Q1 = data.min()
            Q3 = data.quantile(0.90)
            IQR = Q3 - Q1
            llimit = Q1 - (1.5 * IQR)
            hlimit = Q3 + (1.5 * IQR)
            self.lower_limits[col] = llimit
            self.upper_limits[col] = hlimit
        return self

    def transform(self, X):
               
        X_transformed = X.copy()  # Create a copy of the original data
        
        # Apply outlier treatment for each specified column
        for col in self.lower_limits.keys():
            
            ll = self.lower_limits[col]
            ul = self.upper_limits[col]
            
            # Apply transformation using lower limit and upper limit
            X_transformed[col] = np.where(X_transformed[col] > ul, ul, X_transformed[col])
            X_transformed[col] = np.where(X_transformed[col] < ll, ll, X_transformed[col])
        return X_transformed


    def fit_transform(self, X, columns=None):
        # Fit the preprocessor and transform the data
        return self.fit(X, columns).transform(X)

### Imputing Numerical Values

In [7]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import numpy as np
import pandas as pd

class ImputeNumericalValues(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.mv_median = None

    def fit(self, X, columns=None):
        Xo = X.copy()
        
        self.mv_median = {}
        
        if columns is None:
            return self  # No fitting is applied
        
        # getting the lower nd upper limit range for 
        # outlier treatment
        for col in columns:
            self.mv_median[col] = Xo[col].median()
            
        return self

    def transform(self, X):
               
        X_transformed = X.copy()  # Create a copy of the original data
        for idx, median in self.mv_median.items():
            # Apply transformation using lower limit and upper limit
            X_transformed[idx] = X_transformed[idx].fillna(median)
        return X_transformed


    def fit_transform(self, X, columns=None):
        # Fit the preprocessor and transform the data
        return self.fit(X, columns).transform(X)

### Imputing Categorical Values

In [8]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import numpy as np
import pandas as pd

class ImputeCategoricalValues(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.mv_mode = None

    def fit(self, X, columns=None):
        
        self.mv_mode = {}
    
        Xo = X.copy()
        if columns is None:
            return self  # No fitting is applied
        
        # getting the lower nd upper limit range for 
        # outlier treatment
        for col in columns:
            self.mv_mode[col] = Xo[col].mode()[0]
            
        return self

    def transform(self, X):
               
        X_transformed = X.copy()  # Create a copy of the original data
        for idx, mode in self.mv_mode.items():
            # Apply transformation using lower limit and upper limit
            X_transformed[idx] = X_transformed[idx].fillna(mode)
        return X_transformed


    def fit_transform(self, X, columns=None):
        # Fit the preprocessor and transform the data
        return self.fit(X, columns).transform(X)

### Data PreProcessing Methods

In [9]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import numpy as np
import pandas as pd
import re

class DataPreProcessing(BaseEstimator, TransformerMixin):
    """
    PreProcess the data. It includes processes such as:
    Missing Values - Updating missing values for training data
    Outlier Treatment - It will treate outliers in the manner as it does for training data
    Feature Update - It Updates feature values with unique values so that data encoding can be done easily

    """

    def __init__(self, drop_columns=None, clean_numerical_columns=None, clean_categorical_columns=None):
        """
        Parameters
        ----------
        cols : list of str
        drop_columns:                 Columns name that are not needed for our models such as customer id, loan id
        clean_numerical_columns:      Columns name that should have only numeric values
        clean_categorical_columns:    Columns name that should have only categorical values
        """

        if isinstance(drop_columns, str):
            self.drop_columns = [drop_columns]
        else:
            self.drop_columns = drop_columns

        if isinstance(clean_numerical_columns, str):
            self.clean_numerical_columns = [clean_numerical_columns]
        else:
            self.clean_numerical_columns = clean_numerical_columns

        if isinstance(clean_categorical_columns, str):
            self.clean_categorical_columns = [clean_categorical_columns]
        else:
            self.clean_categorical_columns = clean_categorical_columns


    def fit(self, X, columns=None):
        """Fit to X and y

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """
        
        # Return the fit object
        return self
    
    def transform(self, X, columns=None):
        """Perform Data Cleaning.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to label encode

        Returns
        -------
        pandas DataFrame
            Input DataFrame with transformed columns
        """
        
        # Xo = pd.DataFrame(X)
        Xo = X.copy()
        
        # Dropping the columns that are not needed for our models
        for col in self.drop_columns:
            Xo.drop(columns = col, inplace=True, errors='ignore')
        
        # Remove special characters from specified columns
        if self.clean_numerical_columns:
            for col in self.clean_numerical_columns:
               
                Xo[col] = Xo[col].astype(str)
                Xo[col] = Xo[col].apply(lambda x: re.sub(r'[^0-9\s]', '', x))
                Xo[col] = Xo[col].str.strip()
                Xo[col] = Xo[col].replace('#VALUE!', np.nan, regex=True)
        
        # Replace categories in specified columns
        if self.clean_categorical_columns:
            for col in self.clean_categorical_columns:
                if col == 'Purpose':
                    Xo[col] = Xo[col].str.replace('other', 'Other', regex=True)
                elif col == 'Home Ownership':
                    Xo[col] = Xo[col].str.replace('HaveMortgage', 'Home Mortgage', regex=True)

        # Return encoded DataFrame
        return Xo

    def fit_transform(self, X, columns=None):
        """Fit and transform the data via label/one-hot/target encoding.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values (required!).

        Returns
        -------
        pandas DataFrame
            Input DataFrame with transformed columns
        """

        return self.fit(X).transform(X)

### Loading the Data used for Training the Models

In [10]:
df = pd.read_csv('../input/LoansTrainingSetV2.csv')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111107 entries, 0 to 111106
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Loan ID                       111107 non-null  object 
 1   Customer ID                   111107 non-null  object 
 2   Loan Status                   111107 non-null  object 
 3   Current Loan Amount           111107 non-null  int64  
 4   Term                          111107 non-null  object 
 5   Credit Score                  89769 non-null   float64
 6   Years in current job          106414 non-null  object 
 7   Home Ownership                111107 non-null  object 
 8   Annual Income                 89769 non-null   float64
 9   Purpose                       111107 non-null  object 
 10  Monthly Debt                  111107 non-null  object 
 11  Years of Credit History       111107 non-null  float64
 12  Months since last delinquent  52104 non-null

### Setting columns for Automated Pipeline

In [12]:
columns_to_be_dropped = ['Loan ID', 'Customer ID']
numeric_columns_to_be_imputed = ['Credit Score', 'Years in current job', 'Annual Income', 'Monthly Debt', 'Months since last delinquent', 
                                 'Maximum Open Credit','Bankruptcies', 'Tax Liens']
columns_to_be_encoded = ['Term', 'Home Ownership', 'Purpose']
columns_for_outlier_treatment = ['Current Loan Amount', 'Credit Score', 'Annual Income', 'Monthly Debt', 'Current Credit Balance', 
                                 'Maximum Open Credit']
categorical_columns_to_be_imputed = ['Loan Status', 'Term', 'Home Ownership', 'Purpose']

### Setting up Pipeline Sequences

In [13]:
preprocessor = DataPreProcessing(drop_columns=columns_to_be_dropped, 
                                 clean_numerical_columns= numeric_columns_to_be_imputed,
                                 clean_categorical_columns=columns_to_be_encoded
                                )
preprocessed_df = preprocessor.fit_transform(df)

In [14]:
for col in numeric_columns_to_be_imputed:
    preprocessed_df[col] = pd.to_numeric(preprocessed_df[col], errors='coerce')

In [15]:
for col in columns_for_outlier_treatment:
    preprocessed_df[col] = pd.to_numeric(preprocessed_df[col], errors='coerce')

In [16]:
imputer = ImputeNumericalValues()

imputer_output_df = imputer.fit_transform(preprocessed_df, columns= numeric_columns_to_be_imputed)

In [17]:
imputer_output_df['Loan Status'] = imputer_output_df['Loan Status'].replace({'Loan Refused': 0, 'Loan Given':1})

In [18]:
X = imputer_output_df.iloc[:,1:]
Y = imputer_output_df.iloc[:,0:1]

In [19]:
outlierprocessor = OutlierTreatment()
outlier_treatment_df = outlierprocessor.fit_transform(X, columns=columns_for_outlier_treatment)

In [20]:
encoder = FeatureEncoder(label_encoder_cols=columns_to_be_encoded)

encoded_df = encoder.fit_transform(outlier_treatment_df)

### Saving the DataPreProcessing, ImputeNumericalValues, OutlierTreatment and FeatureEncoder

In [21]:
import pickle

# Open the file containing the saved model in binary read mode.
with open('../output/dataPreProcessing.pkl', 'wb') as f:
    # Use the pickle.load() function to load the model from the file.
    pickle.dump(preprocessor, f)

# Close the file.
f.close()

In [22]:
import pickle

# Open the file containing the saved model in binary read mode.
with open('../output/imputeNumericalValues.pkl', 'wb') as f:
    # Use the pickle.load() function to load the model from the file.
    pickle.dump(imputer, f)

# Close the file.
f.close()

In [23]:
import pickle

# Open the file containing the saved model in binary read mode.
with open('../output/outlierTreatment.pkl', 'wb') as f:
    # Use the pickle.load() function to load the model from the file.
    pickle.dump(outlierprocessor, f)

# Close the file.
f.close()

In [24]:
import pickle

# Open the file containing the saved model in binary read mode.
with open('../output/categoricalEncoding.pkl', 'wb') as f:
    # Use the pickle.load() function to load the model from the file.
    pickle.dump(encoder, f)

# Close the file.
f.close()

### Loading the Test Dataset for Inference Pipeline

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_df, Y, test_size=0.3, random_state=1234, 
                                                    stratify=Y['Loan Status'])

print('X_train.shape - ',X_train.shape)
print('y_train.shape - ',y_train.shape)
print('X_test.shape - ',X_test.shape)
print('y_test.shape - ',y_test.shape)

X_train.shape -  (77774, 16)
y_train.shape -  (77774, 1)
X_test.shape -  (33333, 16)
y_test.shape -  (33333, 1)


### Extreme Gradient Boosting Classifier using Threshold

In [26]:
from xgboost import XGBClassifier

xgb_threshold_model = XGBClassifier(random_state=123)
xgb_threshold_model.fit(X_train, y_train)

In [27]:
y_xgb_train_prob_predicted = xgb_threshold_model.predict_proba(X_train)
y_xgb_test_prob_predicted = xgb_threshold_model.predict_proba(X_test)

In [28]:
from sklearn.metrics import classification_report, confusion_matrix

y_xgb_train_predicted = [1 if i >= 0.20 else 0 for i in y_xgb_train_prob_predicted[:,1]]
y_xgb_test_predicted =  [1 if i >= 0.20 else 0 for i in y_xgb_test_prob_predicted[:,1]]

print('Classification Report for Training data -\n',classification_report(y_train, y_xgb_train_predicted))
print('\n\n\nClassification Report for Test data -\n',classification_report(y_test, y_xgb_test_predicted))
print('\nConfusion Matrix for Training Data- \n',confusion_matrix(y_train, y_xgb_train_predicted))
print('\nConfusion Matrix for Test Data - \n',confusion_matrix(y_test, y_xgb_test_predicted))

Classification Report for Training data -
               precision    recall  f1-score   support

           0       1.00      0.21      0.34     17621
           1       0.81      1.00      0.90     60153

    accuracy                           0.82     77774
   macro avg       0.91      0.60      0.62     77774
weighted avg       0.85      0.82      0.77     77774




Classification Report for Test data -
               precision    recall  f1-score   support

           0       1.00      0.20      0.34      7552
           1       0.81      1.00      0.90     25781

    accuracy                           0.82     33333
   macro avg       0.90      0.60      0.62     33333
weighted avg       0.85      0.82      0.77     33333


Confusion Matrix for Training Data- 
 [[ 3625 13996]
 [    0 60153]]

Confusion Matrix for Test Data - 
 [[ 1536  6016]
 [    3 25778]]


## Saving the Models

In [29]:
import pickle

with open('../output/xgb_threshold_model.pkl', 'wb') as f:
    pickle.dump(xgb_threshold_model, f)

### Inference on Test Data

In [30]:
test_df = pd.read_csv('../input/TestData.csv')

In [31]:
test_df

Unnamed: 0,Loan ID,Customer ID,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,6cf51492-02a2-423e-b93d-676f05b9ad53,7c202b37-2add-44e8-9aea-d5b119aea935,12232,Short Term,7280,< 1 year,Rent,46643,Debt Consolidation,777.39,18.0,10.0,12,0,6762,7946,0,0
1,552e7ade-4292-4354-9ff9-c48031697d72,e7217b0a-07ac-47dd-b379-577b5a35b7c6,25014,Long Term,7330,10+ years,Home Mortgage,81099,Debt Consolidation,892.09,26.7,,14,0,35706,77961,0,0
2,9b5e32b3-8d76-4801-afc8-d729d5a2e6b9,0a62fc41-16c8-40b5-92ff-9e4b763ce714,16117,Short Term,7240,9 years,Home Mortgage,60438,Home Improvements,1244.02,16.7,32.0,11,1,11275,14815,1,0
3,5419b7c7-ac11-4be2-a8a7-b131fb6d6dbe,30f36c59-5182-4482-8bbb-5b736849ae43,11716,Short Term,7400,3 years,Rent,34171,Debt Consolidation,990.94,10.0,,21,0,7009,43533,0,0
4,1450910f-9495-4fc9-afaf-9bdf4b9821df,70c26012-bba5-42c0-8dcb-75295ada31bb,9789,Long Term,6860,10+ years,Home Mortgage,47003,Home Improvements,503.71,16.7,25.0,13,1,16913,19553,1,0
5,7e9f6759-6a13-48ef-adfa-ef9fbcfcfcc8,05301ee5-29ec-48b6-b3e5-0813ea25d684,11911,Short Term,7420,2 years,Home Mortgage,70475,other,886.81,17.7,,13,0,28212,59897,0,0
6,c2b0aa3e-555b-441b-a4aa-e0e4a52c87d3,db3e74ec-2123-4608-be8d-a93e98cabfcf,28988,Short Term,7420,3 years,Home Mortgage,58074,Debt Consolidation,871.11,22.8,,9,0,14423,54018,0,0
7,233e0119-6dfe-47c0-a212-b4cc00371c8d,def66e46-951e-4bff-bf96-87ae6e092e19,17705,Long Term,6630,3 years,Own Home,49180,Debt Consolidation,274.59,30.2,,10,1,4252,25012,1,0
8,fc9d109b-15c2-4fe3-b7d8-27610b89f80b,f54c53e4-acd7-4e29-8f55-d2a247a4b62a,16812,Short Term,7360,7 years,Rent,50945,Debt Consolidation,590.12,14.6,,9,0,12903,15379,0,0
9,e81871e5-745d-4dcd-9cc9-31c2686eb6a1,cf3160eb-5de5-4c5e-b18d-3e2ae70d27ef,19678,Long Term,679,6 years,Home Mortgage,51163,Debt Consolidation,1006.21,9.0,,11,0,12089,15539,0,0


In [32]:
preprocessed_df = preprocessor.transform(test_df)

In [33]:
for col in numeric_columns_to_be_imputed:
    preprocessed_df[col] = pd.to_numeric(preprocessed_df[col], errors='coerce')

In [34]:
for col in columns_for_outlier_treatment:
    preprocessed_df[col] = pd.to_numeric(preprocessed_df[col], errors='coerce')

In [35]:
imputer_output_df = imputer.transform(preprocessed_df)

In [36]:
outlier_treatment_df = outlierprocessor.transform(imputer_output_df)

In [37]:
encoded_df = encoder.transform(outlier_treatment_df)

In [38]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Current Loan Amount           10 non-null     float64
 1   Term                          10 non-null     int8   
 2   Credit Score                  10 non-null     float64
 3   Years in current job          10 non-null     int64  
 4   Home Ownership                10 non-null     int8   
 5   Annual Income                 10 non-null     float64
 6   Purpose                       10 non-null     int8   
 7   Monthly Debt                  10 non-null     float64
 8   Years of Credit History       10 non-null     float64
 9   Months since last delinquent  10 non-null     float64
 10  Number of Open Accounts       10 non-null     int64  
 11  Number of Credit Problems     10 non-null     int64  
 12  Current Credit Balance        10 non-null     float64
 13  Maximum 

In [39]:
encoded_df

Unnamed: 0,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,12232.0,1,7280.0,1,2,46643.0,3,77739.0,18.0,100.0,12,0,6762.0,7946.0,0,0
1,25014.0,0,7330.0,10,0,81099.0,3,89209.0,26.7,320.0,14,0,35706.0,77961.0,0,0
2,16117.0,1,7240.0,9,0,60438.0,5,124402.0,16.7,320.0,11,1,11275.0,14815.0,1,0
3,11716.0,1,7400.0,3,2,34171.0,3,99094.0,10.0,320.0,21,0,7009.0,43533.0,0,0
4,9789.0,0,6860.0,10,0,47003.0,5,50371.0,16.7,250.0,13,1,16913.0,19553.0,1,0
5,11911.0,1,7420.0,2,0,70475.0,7,88681.0,17.7,320.0,13,0,28212.0,59897.0,0,0
6,28988.0,1,7420.0,3,0,58074.0,3,87111.0,22.8,320.0,9,0,14423.0,54018.0,0,0
7,17705.0,0,6630.0,3,1,49180.0,3,27459.0,30.2,320.0,10,1,4252.0,25012.0,1,0
8,16812.0,1,7360.0,7,2,50945.0,3,59012.0,14.6,320.0,9,0,12903.0,15379.0,0,0
9,19678.0,0,3420.0,6,0,51163.0,3,100621.0,9.0,320.0,11,0,12089.0,15539.0,0,0


### Loading the trained model

In [40]:
import pickle

# Open the file containing the saved model in binary read mode.
with open('../output/xgb_threshold_model.pkl', 'rb') as f:
    # Use the pickle.load() function to load the model from the file.
    loaded_model = pickle.load(f)

# Close the file.
f.close()

In [41]:
loaded_model

In [42]:
test_prediction = loaded_model.predict(encoded_df)

In [43]:
test_prediction

array([0, 0, 0, 1, 0, 0, 1, 1, 1, 0])

In [44]:
test_df['Loan Status'] = test_prediction

In [45]:
test_df['Loan Status'] = test_df['Loan Status'].replace({1: 'Loan Given', 0: 'Loan Rejected'})

In [46]:
test_df

Unnamed: 0,Loan ID,Customer ID,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Loan Status
0,6cf51492-02a2-423e-b93d-676f05b9ad53,7c202b37-2add-44e8-9aea-d5b119aea935,12232,Short Term,7280,< 1 year,Rent,46643,Debt Consolidation,777.39,18.0,10.0,12,0,6762,7946,0,0,Loan Rejected
1,552e7ade-4292-4354-9ff9-c48031697d72,e7217b0a-07ac-47dd-b379-577b5a35b7c6,25014,Long Term,7330,10+ years,Home Mortgage,81099,Debt Consolidation,892.09,26.7,,14,0,35706,77961,0,0,Loan Rejected
2,9b5e32b3-8d76-4801-afc8-d729d5a2e6b9,0a62fc41-16c8-40b5-92ff-9e4b763ce714,16117,Short Term,7240,9 years,Home Mortgage,60438,Home Improvements,1244.02,16.7,32.0,11,1,11275,14815,1,0,Loan Rejected
3,5419b7c7-ac11-4be2-a8a7-b131fb6d6dbe,30f36c59-5182-4482-8bbb-5b736849ae43,11716,Short Term,7400,3 years,Rent,34171,Debt Consolidation,990.94,10.0,,21,0,7009,43533,0,0,Loan Given
4,1450910f-9495-4fc9-afaf-9bdf4b9821df,70c26012-bba5-42c0-8dcb-75295ada31bb,9789,Long Term,6860,10+ years,Home Mortgage,47003,Home Improvements,503.71,16.7,25.0,13,1,16913,19553,1,0,Loan Rejected
5,7e9f6759-6a13-48ef-adfa-ef9fbcfcfcc8,05301ee5-29ec-48b6-b3e5-0813ea25d684,11911,Short Term,7420,2 years,Home Mortgage,70475,other,886.81,17.7,,13,0,28212,59897,0,0,Loan Rejected
6,c2b0aa3e-555b-441b-a4aa-e0e4a52c87d3,db3e74ec-2123-4608-be8d-a93e98cabfcf,28988,Short Term,7420,3 years,Home Mortgage,58074,Debt Consolidation,871.11,22.8,,9,0,14423,54018,0,0,Loan Given
7,233e0119-6dfe-47c0-a212-b4cc00371c8d,def66e46-951e-4bff-bf96-87ae6e092e19,17705,Long Term,6630,3 years,Own Home,49180,Debt Consolidation,274.59,30.2,,10,1,4252,25012,1,0,Loan Given
8,fc9d109b-15c2-4fe3-b7d8-27610b89f80b,f54c53e4-acd7-4e29-8f55-d2a247a4b62a,16812,Short Term,7360,7 years,Rent,50945,Debt Consolidation,590.12,14.6,,9,0,12903,15379,0,0,Loan Given
9,e81871e5-745d-4dcd-9cc9-31c2686eb6a1,cf3160eb-5de5-4c5e-b18d-3e2ae70d27ef,19678,Long Term,679,6 years,Home Mortgage,51163,Debt Consolidation,1006.21,9.0,,11,0,12089,15539,0,0,Loan Rejected
