# Scoring Function

In [68]:
from copy import deepcopy
from sklearn.linear_model import LogisticRegression
import category_encoders as ce    
import pickle
import numpy as np

def project_2_scoring(data):
    
    X = data.copy()
    
    '''Load Artifacts'''
    
    artifacts_dict_file = open("artifacts_dictnew_file.pkl", "rb")
    artifacts_dict = pickle.load(file=artifacts_dict_file)
    artifacts_dict_file.close()

    col_remove_dollar = artifacts_dict["col_remove_dollar"]
    cat_encoders = artifacts_dict["cat_encoders"]
    model = artifacts_dict["model"]
    threshold = artifacts_dict["threshold"]  
    columns_to_score = artifacts_dict["columns_to_score"]
   
    values_to_fill = {}
    for col in X.columns:
        if X[col].isna().any() == True:
            if pd.api.types.is_numeric_dtype(X[col].dtype):
                values_to_fill[col] = 0
            else:
                values_to_fill[col] = "Missing"
    X.fillna(value=values_to_fill,inplace=True)
   
    
    # Convert specific columns to string type
    for col in col_remove_dollar:
        X[col] = X[col].astype(str)
    
    # Remove Dollars from numerical vars
    for col in col_remove_dollar:
        X[col] = X[col].str.replace('$','',regex=True).replace(',','',regex=True)
        X[col] = X[col].astype(float)
    
    # Adding New Features
   # wheter the bank and customer are in the same state
    X['state_and_bankstate'] = np.where(X['State'] == X['BankState'], 1, 0)
    # Total number of jobs created or retained 
    X['TotalJobs'] = X['CreateJob'] + X['RetainedJob']
    # Calculate loan utilization
    X['LoanUtilization'] = X['DisbursementGross'] / X['GrAppv']
    # Number of franchises
    X['NumFranchises'] = (X['FranchiseCode'] != 0).astype(int)
    # Ratio of SBA_Appv to GrAppv
    X['SBAtoGrRatio'] = X['SBA_Appv'] / X['GrAppv']
    # ApprovedFullAmount (1 if DisbursementGross equals GrAppv, 0 otherwise)
    X['ApprovedFullAmount'] = (X['DisbursementGross'] == X['GrAppv']).astype(int)
    # HasRevolvingLineOfCredit (1 if RevLineCr equals 'Y', 0 otherwise)
    X['HasRevolvingLineOfCredit'] = (X['RevLineCr'] == 'Y').astype(int)
    # LowDocumentation (1 if LowDoc equals 'Y', 0 otherwise)
    X['LowDocBool'] = X['LowDoc'].apply(lambda x: 1 if x == 'Y' else 0)
    # The difference between initial approved amount and final amount of loan
    X['diff_loan_amt1'] = X['GrAppv'] - X['SBA_Appv']
    # The difference between approved amount of loan and the requested amount of loan
    X['diff_loan_amt2'] = X['DisbursementGross'] - X['SBA_Appv']
     #cleaning the data    
    # -- Drop NewExist = 0
    index_exist0 = []
    for item in list(X[X['NewExist'] == 0].index):
        index_exist0.append(item)

    X.drop(index_exist0, inplace=True)

    # Change NewExist values to 1 or 0
    X['NewExist'] = X['NewExist'].apply(lambda x: 1 if x == 1 else 0)
    
    # -- change franchise
    X['FranchiseCode'] = X['FranchiseCode'].apply(lambda x: 0 if x in [0, 1] else 1)
    #dropping values other than Y or N for LowDoc
    index_drop_lowdoc = []
    for index in list(X[(X['LowDoc'] != 'N') & (X['LowDoc'] != 'Y')].index):
        index_drop_lowdoc.append(index)

    X.drop(index_drop_lowdoc,inplace=True)
    
    #dropping values other than Y, N ,0,T
    index_drop_RevLineCr = []
    for index in list(X[(X['RevLineCr'] != 'N') & (X['RevLineCr'] != 'Y') & (X['RevLineCr'] != '0') & (X['RevLineCr'] != 'T')].index):
        index_drop_RevLineCr.append(index)

    X.drop(index_drop_RevLineCr, inplace=True)


  
    from sklearn.preprocessing import OneHotEncoder
    from category_encoders import TargetEncoder
    from copy import deepcopy

    '''Save original columns that need to be droped or not used
      Save One-hot and Label encoders for future use
    '''
       # Categorical encoding
    # Columns to drop from ML models
    cols_to_drop = ["City", "State", "Bank", "BankState", "RevLineCr"]
    # Categorical encoding
    cols_to_drop = ["City", "State", "Bank", "BankState", "RevLineCr"]
    cat_enc_columns = []

    for col, (encoder, enc_type) in cat_encoders.items():
        if col not in cols_to_drop:
            if enc_type == "ohe":
                ohe = encoder
                result = ohe.transform(X[[col]])
                ohe_columns = [col + "_" + str(x) for x in ohe.categories_[0]]
                cat_enc_columns = cat_enc_columns + ohe_columns
                new_encoded = pd.DataFrame(result, columns=ohe_columns, index=X.index)
            elif enc_type == "trg":
                trg = encoder
                new_col_name = col + "_trg"
                result = trg.transform(X[[col]])
                new_encoded = pd.DataFrame(result, columns=[new_col_name], index=X.index)
            X=pd.concat([X, new_encoded.reindex(X.index)], axis=1, join="inner")

    X.drop(cols_to_drop, axis=1, inplace=True)
    
    
      
    # light gbm predict
    columns_to_score=artifacts_dict['columns_to_score']
    X_test=columns_to_score
    model=artifacts_dict['model']
    y_pred_proba = model.predict(X_test, pred_leaf=False, pred_contrib=False)
    y_pred_binary = (y_pred_proba >= threshold).astype(np.int16)

    # Calculate the probability for the 0 class
    probability_0 = 1 - y_pred_proba

    y_pred_proba_rounded = np.round(y_pred_proba, 4)  # Round to 4 decimal places
    probability_0_rounded = np.round(probability_0, 4)  # Round to 4 decimal places

    d = {
        "label": y_pred_binary,
        "probability_0": probability_0_rounded,
        "probability_1": y_pred_proba_rounded
        }

    return pd.DataFrame(d)
    


In [70]:
import pandas as pd

# Load the original dataset
test = pd.read_csv("SBA_loans_project_2.zip")
test = test.drop(columns=['MIS_Status'])

# Sample 100,000 records randomly from the dataset
test_data = test.sample(n=100000, random_state=473)
test_data = test_data.reset_index(drop=True)

In [71]:
project_2_scoring(test_data)

Unnamed: 0,label,probability_0,probability_1
0,0,0.9004,0.0996
1,0,0.9855,0.0145
2,0,0.8184,0.1816
3,0,0.8364,0.1636
4,0,0.5887,0.4113
...,...,...,...
557988,0,0.4987,0.5013
557989,0,0.4756,0.5244
557990,0,0.8208,0.1792
557991,0,0.6263,0.3737
