In [117]:
import h2o
import pandas as pd
import pickle
import joblib

# Initialize the H2O cluster. This must run before project_1_scoring is called,
# because that function builds an H2OFrame and loads a saved H2O model.
h2o.init()

def project_1_scoring(
    data,
    artifacts_path="./artifacts_dict_file.pkl",
    model_path='/Users/akilsurya.s/ml-fall-2023/Project-1/grid_finale_model_11',
    threshold=0.29723170136047936,
    fill_missing=False,
):
    """Score loan records with the saved H2O model and return labels and probabilities.

    The function rebuilds the engineered features (guarantee ratio, industry
    code, encoded categoricals, scaled numericals, squared and interaction
    terms), sends the result to H2O and thresholds the predicted probability
    of class 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. Must contain the columns 'index', 'SBA_Appv', 'GrAppv'
        and 'NAICS', plus every column the stored encoders and
        ``cols_numerical_orig`` refer to. The frame is copied; the caller's
        object is not modified.
    artifacts_path : str, optional
        Path of the pickled artifacts dictionary. The keys read from it are
        'cat_encoders', 'cols_numerical_orig' and 'top10_features'.
    model_path : str, optional
        Path handed to ``h2o.load_model``. The default is the location that
        was previously hard-coded in this function.
    threshold : float, optional
        Rows whose 'p1' probability is strictly greater than this value are
        labelled 1. The default is the value that was previously hard-coded.
        NOTE(review): the artifacts dictionary also stores a 'threshold'
        entry, which the original code loaded and then overwrote with this
        constant; confirm which of the two is authoritative.
    fill_missing : bool, optional
        When True, missing values are replaced with 0 (numeric columns) or
        "Unknown" (all other columns) before feature engineering. The
        original code prepared these fill values but never applied them, so
        the default (False) keeps the previous behaviour.
        NOTE(review): confirm whether the training pipeline imputed.

    Returns
    -------
    pandas.DataFrame
        Columns 'index', 'label', 'probability_0' and 'probability_1'.

    Raises
    ------
    KeyError
        If a required column or artifact key is missing.
    """
    import warnings
    from sklearn.preprocessing import StandardScaler

    # Kept from the original implementation: both calls change process-wide
    # state (warning filters and pandas display options).
    warnings.filterwarnings('ignore')
    pd.set_option('display.max_columns', 1500)

    index_col = 'index'

    def get_code(industry_code):
        """Return the first two characters of the code, converted to int."""
        return int(str(industry_code)[:2])

    def add_sqr_feature(df, col):
        """Add '<col>_squared' holding the element-wise square of ``col``."""
        df[col + "_squared"] = df[col] ** 2
        return df

    def add_interaction_features(df, interaction_cols):
        """Add the product of every unordered pair of ``interaction_cols``."""
        for i, left in enumerate(interaction_cols):
            for right in interaction_cols[i + 1:]:
                # The column name has no separator before "interaction"; it
                # must stay exactly like this to match the trained features.
                df[left + "_" + right + "interaction"] = df[left] * df[right]
        return df

    # --- Load artifacts -----------------------------------------------------
    # SECURITY: unpickling executes arbitrary code. Only pass trusted files.
    with open(artifacts_path, "rb") as artifacts_file:
        artifacts_dict = pickle.load(artifacts_file)
    cat_encoders = artifacts_dict["cat_encoders"]
    cols_numerical_orig = artifacts_dict["cols_numerical_orig"]
    top10_features = list(artifacts_dict["top10_features"])

    # Work on a copy so the caller's DataFrame is left untouched.
    data = data.copy()

    data['SBA_Appv'] = pd.to_numeric(data['SBA_Appv'])
    data['GrAppv'] = pd.to_numeric(data['GrAppv'])

    if fill_missing:
        values_to_fill = {
            col: 0 if pd.api.types.is_numeric_dtype(data[col].dtype) else "Unknown"
            for col in data.columns
            if data[col].isna().any()
        }
        if values_to_fill:
            data = data.fillna(value=values_to_fill)

    # --- Feature engineering ------------------------------------------------
    # Share of the approved amount that is covered by the SBA.
    data['SBA_Bank_Gurantee_Ratio'] = data['SBA_Appv'] / data['GrAppv']
    data['industry_code'] = data.NAICS.apply(get_code)

    # --- Categorical encoding -----------------------------------------------
    # Snapshot the object columns first: the loop adds new columns to `data`.
    object_cols = [
        col for col in data.drop(columns=[index_col]).columns
        if data[col].dtype == 'object'
    ]
    for col in object_cols:
        if col not in cat_encoders:
            # No encoder was stored for this column, so it cannot be encoded.
            continue
        enc = cat_encoders[col][0]
        # Choose the branch from the fitted encoder rather than from the
        # cardinality of the batch being scored: a small batch can contain
        # fewer distinct values than the training data did.
        if hasattr(enc, 'categories_'):
            encoded = enc.transform(data[[col]])
            if hasattr(encoded, "toarray"):
                # Sparse encoder output has to be densified for pandas.
                encoded = encoded.toarray()
            ohe_columns = [col + "_" + str(x) for x in enc.categories_[0]]
            # Reuse data.index so the axis=1 concat aligns row for row.
            ohe_frame = pd.DataFrame(encoded, columns=ohe_columns, index=data.index)
            data = pd.concat([data, ohe_frame], axis=1)
        else:
            data[col + "_woe"] = enc.transform(data[[col]])

    # --- Scale only the original numerical columns --------------------------
    # NOTE(review): the scaler is fitted on the batch being scored, so the
    # scaled values depend on the batch. No fitted scaler is read from the
    # artifacts here; confirm this matches how the model was trained.
    for col in cols_numerical_orig:
        if pd.api.types.is_numeric_dtype(data[col].dtype):
            data[col + "_sc"] = StandardScaler().fit_transform(data[[col]])

    for squared_source in ("Bank_woe", "City_woe", "BankState_woe", "UrbanRural_sc"):
        data = add_sqr_feature(data, squared_source)

    interaction_cols = ["DisbursementGross_sc", "SBA_Bank_Gurantee_Ratio_sc"]
    data = add_interaction_features(data, interaction_cols)

    # --- Predict with H2O ---------------------------------------------------
    data_hf = h2o.H2OFrame(data)
    trained_model = h2o.load_model(model_path)
    y_pred_probability = trained_model.predict(data_hf[top10_features]).as_data_frame()
    y_pred = (y_pred_probability['p1'] > threshold).astype(int)

    # Read the identifier column back from the H2OFrame as a pandas frame.
    index_df = data_hf[index_col].as_data_frame()

    return pd.DataFrame({
        "index": index_df.iloc[:, 0],
        "label": y_pred,
        "probability_0": y_pred_probability["p0"].values.flatten(),
        "probability_1": y_pred_probability["p1"].values.flatten(),
    })

    


   


Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,4 mins 02 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.2
H2O_cluster_version_age:,3 months and 11 days
H2O_cluster_name:,H2O_from_python_akilsurya_s_x9gcl6
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,5.429 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [118]:
# Read the holdout CSV from the current working directory.
result = pd.read_csv('SBA_loans_project_1_holdout_students_valid.csv')
# Run the scoring function on the loaded frame.
scored_data = project_1_scoring(result)
# Show the first rows of the scored output as the cell result.
scored_data.head()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,index,label,probability_0,probability_1
0,0,1,0.641304,0.358696
1,1,0,0.950392,0.049608
2,2,0,0.918449,0.081551
3,3,0,0.991462,0.008538
4,4,0,0.91136,0.08864
