In [248]:
import numpy as np
import pandas as pd

import logging

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer

from sklearn.metrics import r2_score, accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import re
from matplotlib import pyplot as plt

logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

In [249]:
train_data = pd.DataFrame()
predict_data = pd.DataFrame()

In [250]:
def LabelEncoderMultiColumns(data, columns_list):
    """Encoding categorical feature in the dataframe

    Parameters
    ----------
    data: input dataframe 
    columns_list: categorical features list

    Return
    ------
    data: new dataframe where categorical features are encoded
    """
    labelencoder = LabelEncoder()
    for col in columns_list:
        data[col] = labelencoder.fit_transform(data[col])
    return data


In [251]:
def score(y_true, y_pred, **kwargs):
    return max(0, 100*r2_score(y_true, y_pred))
r2 = make_scorer(score, greater_is_better = False)


In [252]:
def KFoldCV(n_splits = 10):
    # k-fold cross validation
    kfold = KFold(n_splits=n_splits, shuffle=True)
    return kfold
    

In [253]:
def LogisticRegressionModel(X, y):
    logging.info('Started Grid Search for Logistic Regression...')
    
    # Add column for classification labels, to tell if the person is eligible for loan sanction or not
    X['Loan Eligibility'] = np.where(y>0, 1, 0)
    y = X['Loan Eligibility']
    X.drop('Loan Eligibility', axis = 1, inplace = True)
    
    cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
    # enumerate splits
    outer_results = list()
    for train_ix, test_ix in cv_outer.split(X):
        # split data
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
        # configure the cross-validation procedure
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        # define the model
        model = LogisticRegression(random_state=0, max_iter = 500)
        # define search space
        param_grid = [
                {'penalty': ['l2'], 'solver': [ 'lbfgs', 'liblinear', 'sag', 'saga', 'newton-cg']},
                {'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
        ]
        # define search
        search = GridSearchCV(model, param_grid = param_grid, scoring='accuracy', cv=cv_inner, refit=True)
        # execute search
        result = search.fit(X_train, y_train)
        # get the best performing model fit on the whole training set
        best_model = result.best_estimator_
        # evaluate model on the hold out dataset
        yhat = best_model.predict(X_test)
        # evaluate the model
        acc = accuracy_score(y_test, yhat)
        # store the result
        outer_results.append(acc)
        # report progress
        logging.info('acc = {}, est = {}, cfg = {}'.format(acc, result.best_score_, result.best_params_))
    # summarize the estimated performance of the model
    logging.info('\nAccuracy: {} (Standard Deviation: {})'.format(np.mean(outer_results), np.std(outer_results)))
    logging.info('Stopped Grid Search for Logistic Regression')


In [254]:
def DecisionTreeModel(X, y):
    logging.info('Started Grid Search for Decision Tree...')
    
    # Add column for classification labels, to tell if the person is eligible for loan sanction or not
    X['Loan Eligibility'] = np.where(y>0, 1, 0)
    y = X['Loan Eligibility']
    X.drop('Loan Eligibility', axis = 1, inplace = True)
    
    cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
    # enumerate splits
    outer_results = list()
    for train_ix, test_ix in cv_outer.split(X):
        # split data
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
        # configure the cross-validation procedure
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        # define the model
        model = DecisionTreeClassifier(random_state=0)
        # define search space
        param_grid = [
                {'criterion': ['entropy', 'gini'], 'max_depth': np.arange(3, 15)}, 
                {'min_samples_leaf': np.arange(1,10)},
        ]
        # define search
        search = GridSearchCV(model, param_grid = param_grid, scoring='accuracy', cv=cv_inner, refit=True)
        # execute search
        result = search.fit(X_train, y_train)
        # get the best performing model fit on the whole training set
        best_model = result.best_estimator_
        # evaluate model on the hold out dataset
        yhat = best_model.predict(X_test)
        # evaluate the model
        acc = accuracy_score(y_test, yhat)
        # store the result
        outer_results.append(acc)
        # report progress
        logging.info('acc = {}, \n best score = {}, \n best params = {}, \n'.format(acc, result.best_score_, result.best_params_))
    # summarize the estimated performance of the model
    logging.info('\nAccuracy: {} (Standard Deviation: {})'.format(np.mean(outer_results), np.std(outer_results)))
    logging.info(best_model)
    logging.info('Stopped Grid Search for Decision Tree')

In [255]:
def LinearRegressionModel(X, y):
    logging.info('Started Grid Search for Linear Regression...')
        
    cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
    # enumerate splits
    outer_results = list()
    for train_ix, test_ix in cv_outer.split(X):
        # split data
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
        # configure the cross-validation procedure
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        # define the model
        model = LinearRegression()
        # define search space
        space = dict()
        space['fit_intercept'] = [True, False]
        space['normalize'] = [True, False]
       
        # define search
        search = GridSearchCV(model, space, scoring=r2, cv=cv_inner, refit=True)
        # execute search
        result = search.fit(X_train, y_train)
        # get the best performing model fit on the whole training set
        best_model = result.best_estimator_
        # evaluate model on the hold out dataset
        yhat = best_model.predict(X_test)
        # evaluate the model
        acc = score(y_test, yhat)
        # store the result
        outer_results.append(acc)
        # report progress
        logging.info('acc = {}, est = {}, cfg = {}'.format(acc, result.best_score_, result.best_params_))
    # summarize the estimated performance of the model
    logging.info('\nAccuracy: {} (Standard Deviation: {})'.format(np.mean(outer_results), np.std(outer_results)))
    logging.info('Stopped Grid Search for Linear Regression')

In [256]:
def LassoRegressionModel(X, y):
    logging.info('Started Grid Search for Lasso Regression...')
        
    cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
    # enumerate splits
    outer_results = list()
    for train_ix, test_ix in cv_outer.split(X):
        # split data
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
        # configure the cross-validation procedure
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        # define the model
        model = Lasso(max_iter = 2000)
        # define search space
        space = dict()
        space['alpha'] = np.logspace(1e-5, 1)
        space['fit_intercept'] = [True, False]
        space['normalize'] = [True, False]
       
        # define search
        search = GridSearchCV(model, space, scoring=r2, cv=cv_inner, refit=True)
        # execute search
        result = search.fit(X_train, y_train)
        # get the best performing model fit on the whole training set
        best_model = result.best_estimator_
        # evaluate model on the hold out dataset
        yhat = best_model.predict(X_test)
        # evaluate the model
        acc = score(y_test, yhat)
        # store the result
        outer_results.append(acc)
        # report progress
        logging.info('acc = {}, est = {}, cfg = {}'.format(acc, result.best_score_, result.best_params_))
    # summarize the estimated performance of the model
    logging.info('\nAccuracy: {} (Standard Deviation: {})'.format(np.mean(outer_results), np.std(outer_results)))
    logging.info('Stopped Grid Search for Lasso Regression')


In [257]:
def SVRModel(X, Y):
    logging.info('Started Grid Search for Support Vector Regression...')
        
    cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
    # enumerate splits
    outer_results = list()
    for train_ix, test_ix in cv_outer.split(X):
        # split data
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
        # configure the cross-validation procedure
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        # define the model
        model = SVR()
        # define search space
        space = dict()
        space['C']= [0.1, 1]
        space['gamma'] = [1,0.1,0.001]
        space['kernel'] = ['rbf', 'sigmoid']
        
        # define search
        search = GridSearchCV(model, space, scoring=r2, cv=cv_inner, refit=True)
        # execute search
        result = search.fit(X_train, y_train)
        # get the best performing model fit on the whole training set
        best_model = result.best_estimator_
        # evaluate model on the hold out dataset
        yhat = best_model.predict(X_test)
        # evaluate the model
        acc = score(y_test, yhat)
        # store the result
        outer_results.append(acc)
        # report progress
        logging.info('acc = {}, est = {}, cfg = {}'.format(acc, result.best_score_, result.best_params_))
    # summarize the estimated performance of the model
    logging.info('\nAccuracy: {} (Standard Deviation: {})'.format(np.mean(outer_results), np.std(outer_results)))
    logging.info('Stopped Grid Search for Support Vector Regression')

In [258]:
def RandomForestModel(X, Y):
    logging.info('Started Grid Search for Random Forest Regression...')
        
    cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
    # enumerate splits
    outer_results = list()
    for train_ix, test_ix in cv_outer.split(X):
        # split data
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
        # configure the cross-validation procedure
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        # define the model
        model = RandomForestRegressor(random_state = 0)
        # define search space
        space = dict()
        space['n_estimators'] = [100,1000,10000]
        space['max_features'] = ["auto", "sqrt", "log2"]
        space['min_samples_split'] = [2,4,8]
        space['bootstrap'] = [True, False]
        
        # define search
        search = GridSearchCV(model, space, scoring=r2, cv=cv_inner, refit=True)
        # execute search
        result = search.fit(X_train, y_train)
        # get the best performing model fit on the whole training set
        best_model = result.best_estimator_
        # evaluate model on the hold out dataset
        yhat = best_model.predict(X_test)
        # evaluate the model
        acc = score(y_test, yhat)
        # store the result
        outer_results.append(acc)
        # report progress
        logging.info('acc = {}, est = {}, cfg = {}'.format(acc, result.best_score_, result.best_params_))
    # summarize the estimated performance of the model
    logging.info('\nAccuracy: {} (Standard Deviation: {})'.format(np.mean(outer_results), np.std(outer_results)))
    logging.info('Stopped Grid Search for Random Forest Regression')

In [259]:
def read_data(train_location, predict_location):
    global train_data
    train_data = pd.read_csv(train_location)
    global predict_data 
    predict_data = pd.read_csv(predict_location)

In [260]:
def missing_values(data):
    data['Dependents'].fillna(data['Dependents'].value_counts().idxmax(), inplace=True)
    # For numeric columns
    data.fillna(data.mean(), inplace = True)
    
    # For string columns
    nan_columns = data.isna().any()
    nan_columns = nan_columns[nan_columns == True]
    # Replace missing value with the most common value of that column
    for column in nan_columns.iteritems():
        data[column[0]].fillna(data[column[0]].value_counts().idxmax(), inplace=True)
    return data

In [261]:
def categorical_values(data, test):
    # For columns having two categories
    
    data = LabelEncoderMultiColumns(data, ['Gender', 'Income Stability', 'Expense Type 1', 'Expense Type 2'])

    # For columns having multiple categories
    if test==True:
        data = onehotencoder.transform(data)
    else:
        onehotencoder.fit(data)
        data = onehotencoder.transform(data)
    column_names = onehotencoder.get_feature_names()
    # Remove 'onehotencoder' from column labels
    for index, name in enumerate(column_names):
        column_names[index] = re.sub(r'onehotencoder__x', '', name)
    data = pd.DataFrame(data, columns=column_names)
    return data

In [262]:
def normalization(data, test):   
    if test==True:
        data = pd.DataFrame(minmaxscaler.transform(data), columns = data.columns)
    else:
        minmaxscaler.fit(data)
        data = pd.DataFrame(minmaxscaler.transform(data), columns = data.columns)
    return data
    

In [271]:
def clean_train_data():
    global train_data
    # 1. Remove columns containing unimportant information
    train_data.drop(['Customer ID', 'Name', 'Property ID'], axis = 1, inplace = True)
    # 2. Replace any datapoints containing unwanted characters with NaN
    train_data.replace({r'[+=!~`:;?<>@#$%^&*]': None}, regex = True, inplace = True)
    # 3. Treat missing values
    train_data = missing_values(train_data)
    # 4. Seperate X_train and y_train for further pre-processing
    train_Y = train_data['Loan Sanction Amount (USD)']
    train_data = train_data.drop(['Loan Sanction Amount (USD)'], axis = 1)
    # 5. Deal with columns containing categorical values - Dummy variables
    global onehotencoder 
    onehotencoder = make_column_transformer((OneHotEncoder(), ['Location', 'Has Active Credit Card', 'Property Location', 'Profession', 'Type of Employment']), remainder='passthrough')

    train_data = categorical_values(train_data, False)
    # 6. Normalization
    global minmaxscaler
    minmaxscaler = MinMaxScaler()
    train_data = normalization(train_data, False)
    train_data['Loan Sanction Amount (USD)'] = train_Y
    del train_Y

In [275]:
def clean_predict_data():
    global predict_data
    global predict_index
    # 1. Remove columns containing unimportant information
    predict_index = predict_data['Customer ID']
    predict_data.drop(['Customer ID', 'Name', 'Property ID'], axis = 1, inplace = True)
    # 2. Replace any datapoints containing unwanted characters with NaN
    predict_data.replace({r'[+=!~`:;?<>@#$%^&*]': None}, regex = True, inplace = True)
    # 3. Treat missing values
    predict_data = missing_values(predict_data)
    # 4. Deal with columns containing categorical values - Dummy variables
    predict_data = categorical_values(predict_data, True)
    # 6. Normalization
    predict_data = normalization(predict_data, True)

In [276]:
read_data('train.csv', 'test.csv')
logging.info(train_data.head())
clean_train_data()
y = train_data['Loan Sanction Amount (USD)']
X = train_data.drop(['Loan Sanction Amount (USD)'], axis = 1)


2021-06-28 10:18:32,005 :   Customer ID               Name Gender  Age  Income (USD) Income Stability  \
0     C-36995   Frederica Shealy      F   56       1933.05              Low   
1     C-33999  America Calderone      M   32       4952.91              Low   
2      C-3770      Rosetta Verne      F   65        988.19             High   
3     C-26480         Zoe Chitty      F   65           NaN             High   
4     C-23459       Afton Venema      F   31       2614.77              Low   

  Profession     Type of Employment    Location  Loan Amount Request (USD)  \
0    Working            Sales staff  Semi-Urban                   72809.58   
1    Working                    NaN  Semi-Urban                   46837.47   
2  Pensioner                    NaN  Semi-Urban                   45593.04   
3  Pensioner                    NaN       Rural                   80057.92   
4    Working  High skill tech staff  Semi-Urban                  113858.89   

   ...  Credit Score No. of De

In [23]:
LogisticRegressionModel(X, y)
DecisionTreeModel(X, y)

2021-06-28 01:04:33,888 : Started Grid Search for Logistic Regression...
2021-06-28 01:04:57,039 : acc = 0.823, est = 0.8201111111111111, cfg = {'penalty': 'l1', 'solver': 'liblinear'}
2021-06-28 01:05:19,556 : acc = 0.821, est = 0.8194444444444443, cfg = {'penalty': 'l1', 'solver': 'liblinear'}
2021-06-28 01:05:41,833 : acc = 0.8126666666666666, est = 0.8207407407407407, cfg = {'penalty': 'l1', 'solver': 'liblinear'}
2021-06-28 01:06:03,512 : acc = 0.8156666666666667, est = 0.8205555555555556, cfg = {'penalty': 'l1', 'solver': 'saga'}
2021-06-28 01:06:24,881 : acc = 0.8243333333333334, est = 0.8196296296296296, cfg = {'penalty': 'l1', 'solver': 'liblinear'}
2021-06-28 01:06:44,794 : acc = 0.8193333333333334, est = 0.8191111111111112, cfg = {'penalty': 'l2', 'solver': 'sag'}
2021-06-28 01:07:03,465 : acc = 0.8233333333333334, est = 0.8200740740740741, cfg = {'penalty': 'l2', 'solver': 'liblinear'}
2021-06-28 01:07:20,945 : acc = 0.8146666666666667, est = 0.8221851851851852, cfg = {'pen

In [None]:
train_data_copy = train_data.copy()
train_data_copy.drop(train_data_copy[train_data_copy['Loan Sanction Amount (USD)'] == 0].index, inplace = True)
y_copy = train_data_copy['Loan Sanction Amount (USD)']
X_copy = train_data_copy.drop(['Loan Sanction Amount (USD)'], axis = 1)

In [None]:
LinearRegressionModel(X_copy, y_copy)
LassoRegressionModel(X_copy, y_copy)
SVRModel(X, y)

2021-06-28 01:10:53,999 : Started Grid Search for Linear Regression...
2021-06-28 01:10:54,794 : acc = 90.83977995026393, est = 0.0, cfg = {'fit_intercept': True, 'normalize': True}
2021-06-28 01:10:55,383 : acc = 93.09239330284854, est = -61.0973963357464, cfg = {'fit_intercept': True, 'normalize': True}
2021-06-28 01:10:55,925 : acc = 91.71756457204714, est = -30.780047434266805, cfg = {'fit_intercept': False, 'normalize': True}
2021-06-28 01:10:56,461 : acc = 91.57824465917909, est = -30.536068265330226, cfg = {'fit_intercept': True, 'normalize': False}
2021-06-28 01:10:57,057 : acc = 94.28404432867194, est = -30.329477419006214, cfg = {'fit_intercept': True, 'normalize': False}
2021-06-28 01:10:57,563 : acc = 0, est = -61.71757643657153, cfg = {'fit_intercept': True, 'normalize': True}
2021-06-28 01:10:58,330 : acc = 89.982385849823, est = -61.29881161018286, cfg = {'fit_intercept': True, 'normalize': False}
2021-06-28 01:10:58,896 : acc = 93.88084158517997, est = -30.9607574786315

In [245]:
def aggregate_model(X, y):
    # Model 1 - Classify whether Loan eligibility is True or False
    
    train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.20, random_state=1)
    logging.info('Started training the aggregate model...')
    # Add column for classification labels, to tell if the person is eligible for loan sanction or not
    X_copy = train_X.copy()
    X_copy['Loan Eligibility'] = np.where(train_Y>0, 1, 0)
    y_copy = X_copy['Loan Eligibility']
    X_copy.drop('Loan Eligibility', axis = 1, inplace = True)
    
    X_test_copy = test_X.copy()
    X_test_copy['Loan Eligibility'] = np.where(test_Y>0, 1, 0)
    y_test_copy = X_test_copy['Loan Eligibility']
    X_test_copy.drop('Loan Eligibility', axis = 1, inplace = True)
    
    global classification_model
    classification_model = DecisionTreeClassifier(random_state=0, criterion = 'entropy', max_depth = 5)
    classification_model.fit(X_copy, y_copy)
    yhat_classification = classification_model.predict(X_test_copy)
    yhat_classification = pd.DataFrame(yhat_classification, index = X_test_copy.index)
    yhat_classification.drop(yhat_classification[yhat_classification[0] == 1].index, inplace = True)
    #logging.info('Predict: {} Shape: {}'.format(prediction.head(), prediction.shape))
    
    # Model 2 - If Loan eligibility is True, predict the loan sanction amount
    
    # Keep only those data points where Loan Sanction Amount is greater that 0
    X_copy = train_X.copy()
    y_copy = train_Y.copy()
    X_copy.drop(X_copy[train_Y == 0].index, inplace = True)
    y_copy.drop(y_copy[train_Y == 0].index, inplace = True)
    
    X_test_copy = test_X.copy()
    y_test_copy = test_Y.copy()
    X_test_copy.drop(yhat_classification[yhat_classification[0] == 0].index, inplace = True)
    y_test_copy.drop(yhat_classification[yhat_classification[0] == 0].index, inplace = True)
    global regression_model
    regression_model = LinearRegression()
    regression_model.fit(X_copy, y_copy)
    yhat_regression = regression_model.predict(X_test_copy)
    yhat_regression = pd.DataFrame(yhat_regression, index = X_test_copy.index)
    prediction = pd.concat([yhat_classification, yhat_regression])
    prediction = prediction.sort_index(ascending=True)
    logging.info(prediction.head())
    test_Y = test_Y.sort_index(ascending=True)
    scores = score(test_Y, prediction[0])
    logging.info('Finished training the aggregate model.')
    logging.info('Score: {}'.format(scores))
    

In [246]:
aggregate_model(X, y)


2021-06-28 10:12:50,535 : Started training the aggregate model...
2021-06-28 10:12:50,722 :           0
2   36528.0
11      0.0
19  78096.0
22  11552.0
23  82256.0
2021-06-28 10:12:50,727 : Finished training the aggregate model.
2021-06-28 10:12:50,728 : Score: 75.88097445433519


In [277]:
clean_predict_data()

In [293]:
def predict_loan_sanction_amount():
    predict_dat = predict_data.copy()
    logging.info('Predicting based on the aggregate model...')
    
    yhat_classification = classification_model.predict(predict_dat)
    yhat_classification = pd.DataFrame(yhat_classification, index = predict_dat.index)
    yhat_classification.drop(yhat_classification[yhat_classification[0] == 1].index, inplace = True)
    #logging.info('Predict: {} Shape: {}'.format(prediction.head(), prediction.shape))
    
    # Model 2 - If Loan eligibility is True, predict the loan sanction amount
    
    # Keep only those data points where Loan Sanction Amount is greater that 0
    
    predict_dat.drop(yhat_classification[yhat_classification[0] == 0].index, inplace = True)
    yhat_regression = regression_model.predict(predict_dat)
    yhat_regression = pd.DataFrame(yhat_regression, index = predict_dat.index)
    prediction = pd.concat([yhat_classification, yhat_regression])
    prediction = prediction.sort_index(ascending=True)
    prediction['Customer ID'] = predict_index
    prediction = prediction.rename(columns = {0: 'Loan sanction Amount (USD)'})
    prediction = prediction[['Customer ID', 'Loan sanction Amount (USD)']]
    logging.info('Shape of prediction file: {}'.format(prediction.shape))
    logging.info('Saving to csv file...')
    prediction.to_csv('prediction.csv', index = False)
    logging.info('Saved')


In [294]:
predict_loan_sanction_amount()

2021-06-28 10:26:32,330 : Predicting based on the aggregate model...
2021-06-28 10:26:32,370 : Shape of prediction file: (20000, 2)
2021-06-28 10:26:32,371 : Saving to csv file...
2021-06-28 10:26:32,683 : Saved
