In [1]:
import pickle

def load_and_print_artifacts_dict(path):
    """
    Load a pickled artifacts dictionary and print a short summary of it.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the pickle file. The unpickled object must be a mapping
        with a "target_encoder" entry exposing an iterable ``mapping``
        attribute and a "columns_to_train" entry that is iterable.

    Returns
    -------
    None
        The summary is written to stdout only.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If one of the expected entries is missing from the artifacts.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this on
    # artifact files produced by a trusted source.
    # The context manager guarantees the handle is closed even when
    # unpickling fails (the previous bare open() never closed it).
    with open(path, "rb") as artifacts_file:
        artifacts_dict = pickle.load(artifacts_file)

    print("Target encoder mapping:")
    print(list(artifacts_dict["target_encoder"].mapping))

    print("Columns to train:")
    print(list(artifacts_dict["columns_to_train"]))

# Entry-point guard: print the artifact summary when this code is executed
# directly (as it is when the notebook cell runs) rather than imported.
if __name__ == "__main__":
    load_and_print_artifacts_dict("./Artifacts/artifacts_dict_file.pkl")

Target encoder mapping:
['City', 'State', 'Bank', 'BankState', 'RevLineCr', 'LowDoc', 'NewExist']
Columns to train:
['City_trg', 'State_trg', 'Bank_trg', 'BankState_trg', 'RevLineCr_trg', 'LowDoc_trg', 'NewExist_trg', 'NAICS', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv', 'Log_DisbursementGross', 'Log_GrAppv', 'Log_SBA_Appv', 'Log_BalanceGross', 'TotalJobs', 'IncomeToLoanRatio', 'EmployeesToLoanRatio', 'JobPerLoan', 'Gauren_SBA_Appv']


In [2]:
def scoring(data, artifacts_path="./Artifacts/artifacts_dict_file.pkl"):
    """
    Score an input dataset with the persisted model artifacts.

    Flow:
        - Load artifacts
        - Clean and transform a copy of the dataset
        - Score the transformed dataset
        - Return labels and probabilities

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records to score. Must contain 'RevLineCr', 'LowDoc', 'NewExist',
        'DisbursementGross', 'GrAppv', 'SBA_Appv', 'BalanceGross',
        'CreateJob', 'RetainedJob' and 'NoEmp', plus every column listed in
        the artifacts. An "index" column, if present, is ignored.
        The caller's frame is never modified.
    artifacts_path : str or os.PathLike, optional
        Pickle file holding the artifacts dictionary (keys "model",
        "target_encoder", "te_columns", "columns_to_train", "threshold",
        "category_cols", "numerical_columns", "scaler").

    Returns
    -------
    pandas.DataFrame
        One row per input record, in input order, with the columns
        "index", "label", "probability_0", "probability_1" and "threshold".
        "label" is 1 when probability_0 is below the threshold, else 0.
    """
    # Imported locally so the function does not depend on which notebook
    # cell happened to import these first.
    import numpy as np
    import pandas as pd

    # Work on a private copy: the previous in-place edits silently changed
    # the frame owned by the caller.
    data = data.drop(columns="index") if "index" in data.columns else data.copy()

    # Load artifacts.
    # NOTE(security): pickle.load can execute arbitrary code; only use
    # artifact files from a trusted source.
    with open(artifacts_path, "rb") as artifacts_dict_file:
        artifacts_dict = pickle.load(file=artifacts_dict_file)

    clf = artifacts_dict["model"]
    te = artifacts_dict["target_encoder"]
    te_columns = artifacts_dict["te_columns"]
    columns_to_score = artifacts_dict["columns_to_train"]
    threshold = artifacts_dict["threshold"]
    category_cols = artifacts_dict["category_cols"]
    numerical_columns = artifacts_dict["numerical_columns"]
    scaler = artifacts_dict["scaler"]

    # Anything other than 'Y'/'N' (including missing values) becomes 'N'.
    # Vectorised replacement for the former per-row Series.replace loop.
    for flag_column in ("RevLineCr", "LowDoc"):
        is_valid_flag = data[flag_column].isin(["Y", "N"])
        data[flag_column] = data[flag_column].where(is_valid_flag, "N")

    # Anything other than 1/2 becomes missing so it is imputed below.
    # (replace(value, None) forward-fills on older pandas versions instead
    # of producing a missing value.)
    data["NewExist"] = data["NewExist"].where(data["NewExist"].isin([1, 2]))

    # Impute missing categorical values with the most frequent value found
    # in the frame being scored.
    for column in category_cols:
        most_frequent = data[column].mode()
        if not most_frequent.empty:  # an all-missing column has no mode
            data[column] = data[column].fillna(most_frequent.iloc[0])

    # Engineered features (names must match those used at training time).
    data["Log_DisbursementGross"] = np.log1p(data["DisbursementGross"])
    data["Log_GrAppv"] = np.log1p(data["GrAppv"])
    data["Log_SBA_Appv"] = np.log1p(data["SBA_Appv"])
    data["Log_BalanceGross"] = np.log1p(data["BalanceGross"])
    data["TotalJobs"] = data["CreateJob"] + data["RetainedJob"]
    # The ratios below divide by 'SBA_Appv'; a zero there yields inf/NaN.
    # Left unguarded on purpose — presumably this mirrors the training
    # pipeline; TODO confirm before changing.
    # Disbursed amount relative to the SBA-approved amount.
    data["IncomeToLoanRatio"] = data["DisbursementGross"] / data["SBA_Appv"]
    # Number of employees relative to the SBA-approved amount.
    data["EmployeesToLoanRatio"] = data["NoEmp"] / data["SBA_Appv"]
    # Total jobs (created + retained) relative to the SBA-approved amount.
    data["JobPerLoan"] = data["TotalJobs"] / data["SBA_Appv"]
    # Gross approved amount relative to the SBA-guaranteed amount.
    data["Gauren_SBA_Appv"] = data["GrAppv"] / data["SBA_Appv"]

    # Scale the numerical columns with the fitted scaler.
    data[numerical_columns] = scaler.transform(data[numerical_columns])

    # Target-encode the categorical columns into '<column>_trg' features.
    data_encoded = te.transform(data[te_columns]).add_suffix("_trg")
    for column in te_columns:
        encoded_name = column + "_trg"
        # Plain assignment instead of a chained inplace fillna, which does
        # not write back under pandas Copy-on-Write.
        data_encoded[encoded_name] = data_encoded[encoded_name].fillna(
            data_encoded[encoded_name].mean()
        )
    data_encoded = pd.concat([data_encoded, data], axis=1)

    # Predict class probabilities and derive the label from the threshold.
    y_prob = clf.predict_proba(data_encoded[columns_to_score])
    y_pred = (y_prob[:, 0] < threshold).astype(int)
    return pd.DataFrame(
        {
            "index": data.index,
            "label": y_pred,
            "probability_0": y_prob[:, 0],
            "probability_1": y_prob[:, 1],
            "threshold": threshold,
        }
    )

    
    

In [3]:
import pandas as pd
# Read the hold-out validation CSV (comma separated, header row inferred).
ef = pd.read_csv(
    "D:/Work/Gre/UTD/Courses/Fall/MIS6341/Softwares/Python/ml-fall-2023/Project1/SBA_loans_project_1_holdout_students_valid.csv",
    sep=",",
    header='infer',
)
# Work on a duplicate so the frame as loaded stays available in `ef`.
ef1 = ef.copy()

In [4]:
# Score the working copy of the hold-out data and print the resulting frame
# (pandas truncates the middle rows of long frames in the printed text).
print(scoring(ef1))

       index  label  probability_0  probability_1  threshold
0          0      0       0.852914       0.147086   0.505051
1          1      0       0.736220       0.263780   0.505051
2          2      0       0.785097       0.214903   0.505051
3          3      0       0.736333       0.263667   0.505051
4          4      0       0.878069       0.121931   0.505051
...      ...    ...            ...            ...        ...
89912  89912      1       0.500004       0.499996   0.505051
89913  89913      0       0.850401       0.149599   0.505051
89914  89914      0       0.641032       0.358968   0.505051
89915  89915      0       0.567922       0.432078   0.505051
89916  89916      0       0.789032       0.210968   0.505051

[89917 rows x 5 columns]
