In [10]:
def project_2_scoring(data):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    %matplotlib inline
    import category_encoders as ce
    import h2o
    from h2o.estimators import H2OGradientBoostingEstimator
    import os
    from h2o.grid.grid_search import H2OGridSearch
    import pickle
    from copy import deepcopy
    
    print("In scoring function")
    '''Load Artifacts'''
    artifacts_dict_file = open("artifacts/artifacts_dict_file.pkl", "rb")
    artifacts_dict = pickle.load(file=artifacts_dict_file)
    artifacts_dict_file.close()

    #LOad encoders
    target_encoder = artifacts_dict["target_encoder"]
    #Load h2o model
    #Start H2O
    h2o.init(max_mem_size = "4G")             #specify max number of bytes. uses all cores by default.
    h2o.remove_all()                          #clean slate, in case cluster was already running
    model = h2o.load_model(path='artifacts/best_model')
    
    '''Transform dataset'''
    #Clean dataset
    
    #Convert the strings styled as '$XXXX.XX' to float values. Columns = ['DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv'] #This helps us to convert them into float values as they are numerical values
    Columns = ['DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv']
    for col in Columns:
        data[col] = data[col].apply(lambda x: x.replace(',', '').replace('$',''))
        data[col] = data[col].astype(float)
    #Convert columns to categorical
    cat_columns = ["City","State","Bank","BankState", "NewExist", "RevLineCr","LowDoc","Zip"]
    #Ensure the above columns are categorical in the data
    data[cat_columns] = data[cat_columns].astype(object)
    
    #Fill missing values
    for col in data.columns:
        if data[col].isna().any() == True:
            if data[col].dtype == 'object':
                data[col].fillna(data[col].mode()[0],inplace=True)
            elif data[col].dtype == 'float64' or data[col].dtype == 'int64':
                data[col].fillna(data[col].mean(),inplace=True)  
    
    #Drop index column
    data.drop("index",axis=1,inplace=True)
    
    #Feature Engineering
    #NAICS Code - This helps us to extract the first 2 digits from the NAICS code. The first two digits of the NAICS code specify the industry. Hence it makes sense to have the NAICS code as an emgineered feature
    data['NAICS_Code'] = data['NAICS'].apply(lambda x : str(x)[0:2])
    data['NAICS_Code'] = data['NAICS_Code'].astype('object')
    data.drop("NAICS", axis = 1,inplace=True)
    
    #Similarly apply binning on NoEmp column
    groups = ['Low', 'Med', 'High', 'Very High']
    bins = [-1, 1000, 5000, 7500,9999]
    data['NoEmp_cut'] = pd.cut(data['NoEmp'], labels=groups,bins=bins)
    data.drop("NoEmp", axis = 1,inplace=True)
    
    #Similarly apply binning on CreateJob column
    groups = ['Low', 'Med', 'High', 'Very High']
    bins = [-1, 100, 5000, 7500,9999]
    data['CreateJob_cut'] = pd.cut(data['CreateJob'], labels=groups,bins=bins)
    data.drop("CreateJob", axis = 1,inplace=True)
    
    #Similarly apply binning on RetainedJob column
    groups = ['Low', 'Med', 'High', 'Very High']
    bins = [-1, 100, 5000, 7500,9999]
    data['RetainedJob_cut'] = pd.cut(data['RetainedJob'], labels=groups,bins=bins)
    data.drop("RetainedJob", axis = 1,inplace=True)
    
    #DisbursementGross column
    data['log_DisbursementGross'] = np.log2(data['DisbursementGross']+1)
    data.drop("DisbursementGross", axis = 1,inplace=True)
    
    #GrAppv column
    data['log_GrAppv'] = np.log2(data['GrAppv'])
    data.drop("GrAppv", axis = 1,inplace=True)
    
    #SBA_Appv column
    data['log_SBA_Appv'] = np.log2(data['SBA_Appv'])
    data.drop("SBA_Appv", axis = 1,inplace=True)
    
    #BalanceGross column
    data['log_BalanceGross'] = np.log2(data['BalanceGross']+1)
    data.drop("BalanceGross", axis = 1,inplace=True)
    
    #Gross of DisbursementGross per franchise
    data['Franchise_DisbursementGross_Mean'] = data.groupby(['FranchiseCode'])['log_DisbursementGross']\
                                     .transform(lambda x: x.mean())
    
    #Convert New EXist column to categorical
    data["NewExist"] = data["NewExist"].astype(object)
    
    cat_columns_new = cat_columns + ["NoEmp_cut","CreateJob_cut","RetainedJob_cut"]
    
    ##Encoding categorical variables
    #Columns to drop from ML models
    cols_to_drop = []
    #Categorical encoders disctionary
    cat_encoders = {}
    #New categorical (encoded) columns
    cat_enc_columns = []
    
    for col in cat_columns_new:
        if data[col].dtype == 'object' or data[col].dtype == 'category':
            if data[col].nunique() < 10:
                print("encoded ", col)
                '''Encode Testing'''
                cat_enc_columns = cat_enc_columns + ohe_columns
                result_test = pd.DataFrame(result, columns=ohe_columns)
                result_test.index = data.index
                data = pd.concat([data, result_test], axis=1)
                cat_encoders[col] = [deepcopy(enc),"ohe"]
                data[col+"_Unknown"]=0
        cols_to_drop.append(col)
    
    #Adding columns for target encoding, remove the original columns
    trg_columns = ["City","State","Bank","BankState", "RevLineCr","Zip"]
    
    for col in trg_columns:
        data[col+"_trg"] = data[col]
    
    #print(data.columns)
    
    tar_enc_cols = encoder.get_feature_names()
        
    #Handle categories in test dataset that are not there in train dataset
    for col in data.columns:
        if(col not in tar_enc_cols):
            print("Removing column: ",col)
            data[("_".join((col).split("_")[:-1]))+"_Unknown"]= data[col]
            data.drop(col,axis=1,inplace=True)
    
    
    data = target_encoder.transform(data)
    
    #Drop original columns
    data.drop(cat_columns_new,axis=1,inplace=True)
    
    data.to_csv("transformed_data.csv")

    #Convert pandas dataframe to h2o frame
    h2o_data = h2o.H2OFrame(data)
    h2o_data = h2o_data.asnumeric()
    
    print(h2o_data.head(5))
    
    '''Score dataset'''
    #Score the dataset using the model
    prediction = (model.predict(h2o_data)).as_data_frame()
    prediction.reset_index()
    prediction.rename(columns={'predict':'label','p0':'probability_0','p1':'probability_1'})
    print(prediction)
    
    '''Return pandas DF'''
    return prediction

In [9]:

# Load new data and remove the target column (if it's present)
import pandas as pd
import pickle
new_data = pd.read_csv('C:\Shiva Files\Shiva SSD\Shiva\MSBA Cohort\Semester 2\Applied Machine Learning\Project 1\SBA_loans_project_2_holdout_students_valid.csv')

# Get predictions using the scoring function
results_df = project_2_scoring(new_data)
results_df.head()

Unnamed: 0,predict,probability_0,probability_1
0,0,0.951011,0.048989
1,0,0.936405,0.063595
2,1,0.103346,0.896654
3,0,0.757923,0.242077
4,0,0.961737,0.038263
