# Section 1: Data & H20 Python module Preparation

## Sec 1.1: Load data, split dataset, adjust variable names

In [1]:
import numpy as np
import datetime
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv("XYZloan_default_selected_vars.csv")

#for binary classification, response should be factor
data = data.astype(object)

In [2]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.40, random_state=42)

In [3]:
var = pd.DataFrame(train.dtypes).reset_index()
var.columns = ['varname','dtype'] 
var['source'] = var['varname'].str[:2]
var.head()

Unnamed: 0,varname,dtype,source
0,Unnamed: 0.1,object,Un
1,Unnamed: 0,object,Un
2,id,object,id
3,loan_default,object,lo
4,AP001,object,AP


In [4]:
## Sec 1.2: Remove bad feature

In [5]:
# "AP004" is a bad data field and should be removed.
MB_list = list(var[var['source']=='MB']['varname'])
AP_list = list(var[(var['source']=='AP') & (var['varname']!='AP004')]['varname'])
TD_list = list(var[var['source']=='TD']['varname'])
CR_list = list(var[var['source']=='CR']['varname'])
PA_list = list(var[var['source']=='PA']['varname'])
CD_list = list(var[var['source']=='CD']['varname'])

In [6]:
train['loan_default'].value_counts(dropna=False)
#train['loan_default'] = train['loan_default'].astype('category')

0    38736
1     9264
Name: loan_default, dtype: int64

## Sec 1.3: Prepare H20 Python Module for modeling

In [8]:
#pip install h2o
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "20.0.1" 2023-04-18; Java(TM) SE Runtime Environment (build 20.0.1+9-29); Java HotSpot(TM) 64-Bit Server VM (build 20.0.1+9-29, mixed mode, sharing)
  Starting server from /Users/ruxizhou/opt/anaconda3/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/4_/810dy4d13s9c2627491y7jdh0000gn/T/tmppcjs7a6e
  JVM stdout: /var/folders/4_/810dy4d13s9c2627491y7jdh0000gn/T/tmppcjs7a6e/h2o_ruxizhou_started_from_python.out
  JVM stderr: /var/folders/4_/810dy4d13s9c2627491y7jdh0000gn/T/tmppcjs7a6e/h2o_ruxizhou_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,29 days
H2O_cluster_name:,H2O_from_python_ruxizhou_gj4hj0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.983 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


# Section 2: Modeling

## Sec 2.1: GLM

### What's GLM?
The GLM stands for Generalized linear models. The GLM generalizes linear regression by allowing the linear model to be related to the response variable via a link function and by allowing the magnitude of the variance of each measurement to be a function of its predicted value.
GLM with regularization is incorporating regularization techniques like L1(LASSO) and L2(Ridge) regularization to prevent overfitting and improve model generalization.

### Why GLM?
GLMs allow for flexibility in choosing the probability distribution that best fits the characteristics of the data. This is particularly useful when dealing with non-Gaussian or non-normally distributed data, which is common in anomaly detection scenarios; GLMs are computationally efficient and scalable to large datasets, making them suitable for applications with a substantial amount of data.

## Sec 2.1.1: Build model with 10% data

In [12]:
target='loan_default'
predictors = CR_list + TD_list + AP_list + MB_list + CR_list + PA_list

In [13]:
train_smpl = train.sample(frac=0.1, random_state=1)
test_smpl = test.sample(frac=0.1, random_state=1)
train_hex = h2o.H2OFrame(train_smpl)
test_hex = h2o.H2OFrame(test_smpl)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [14]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

GLM_WITH = H2OGeneralizedLinearEstimator(family= "binomial",
                                          lambda_search = True)
GLM_WITH.train(predictors, target, training_frame= train_hex)

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,binomial,logit,"Elastic Net (alpha = 0.5, lambda = 1.22E-5 )","nlambda = 100, lambda.max = 0.1112, lambda.min = 1.22E-5, lambda.1se = -1.0",76,75,107,Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex

Unnamed: 0,0,1,Error,Rate
0,2412.0,1486.0,0.3812,(1486.0/3898.0)
1,310.0,592.0,0.3437,(310.0/902.0)
Total,2722.0,2078.0,0.3742,(1796.0/4800.0)

metric,threshold,value,idx
max f1,0.1825513,0.3973154,232.0
max f2,0.1252577,0.5637565,304.0
max f0point5,0.2669119,0.3440911,147.0
max accuracy,0.5333327,0.8147917,22.0
max precision,0.8782721,1.0,0.0
max recall,0.0485368,1.0,382.0
max specificity,0.8782721,1.0,0.0
max absolute_mcc,0.1825513,0.2168927,232.0
max min_per_class_accuracy,0.1854996,0.6331452,228.0
max mean_per_class_accuracy,0.1758381,0.6378471,241.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5014388,3.1042129,3.1042129,0.5833333,0.5824233,0.5833333,0.5824233,0.0310421,0.0310421,210.421286,210.421286,0.0259113
2,0.02,0.4463466,2.3281596,2.7161863,0.4375,0.4720524,0.5104167,0.5272378,0.0232816,0.0543237,132.8159645,171.6186253,0.0422663
3,0.03,0.4160742,2.2172949,2.5498891,0.4166667,0.4277528,0.4791667,0.4940762,0.0221729,0.0764967,121.72949,154.9889135,0.057256
4,0.04,0.3953913,1.4412417,2.2727273,0.2708333,0.405839,0.4270833,0.4720169,0.0144124,0.0909091,44.1241685,127.2727273,0.0626895
5,0.05,0.3778098,1.2195122,2.0620843,0.2291667,0.3866734,0.3875,0.4549482,0.0121951,0.1031042,21.9512195,106.2084257,0.0653926
6,0.1,0.3180314,1.8403548,1.9512195,0.3458333,0.3444416,0.3666667,0.3996949,0.0920177,0.195122,84.0354767,95.1219512,0.1171332
7,0.15,0.278468,1.6851441,1.8625277,0.3166667,0.2953434,0.35,0.3649111,0.0842572,0.2793792,68.5144124,86.2527716,0.1593176
8,0.2,0.2515316,1.3968958,1.7461197,0.2625,0.2650387,0.328125,0.339943,0.0698448,0.3492239,39.6895787,74.6119734,0.1837545
9,0.3,0.2159993,1.3414634,1.6112343,0.2520833,0.2320565,0.3027778,0.3039808,0.1341463,0.4833703,34.1463415,61.1234294,0.2258023
10,0.4,0.1886679,1.2749446,1.5271619,0.2395833,0.2020389,0.2869792,0.2784953,0.1274945,0.6108647,27.4944568,52.7161863,0.259659

Unnamed: 0,timestamp,duration,iteration,lambda,predictors,deviance_train,alpha,iterations,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
,2023-12-07 12:52:43,0.000 sec,1,.11E0,1,0.9663759,0.5,,,,,,,,
,2023-12-07 12:52:43,0.038 sec,3,.1E0,3,0.9637551,0.5,,,,,,,,
,2023-12-07 12:52:43,0.060 sec,5,.92E-1,4,0.9610649,0.5,,,,,,,,
,2023-12-07 12:52:43,0.093 sec,7,.84E-1,4,0.9586035,0.5,,,,,,,,
,2023-12-07 12:52:43,0.116 sec,8,.77E-1,4,0.9565215,0.5,,,,,,,,
,2023-12-07 12:52:43,0.131 sec,9,.7E-1,4,0.9547877,0.5,,,,,,,,
,2023-12-07 12:52:43,0.148 sec,10,.64E-1,4,0.9533288,0.5,,,,,,,,
,2023-12-07 12:52:43,0.168 sec,11,.58E-1,4,0.9521014,0.5,,,,,,,,
,2023-12-07 12:52:43,0.197 sec,12,.53E-1,6,0.9502490,0.5,,,,,,,,
,2023-12-07 12:52:43,0.220 sec,13,.48E-1,7,0.9480656,0.5,,,,,,,,

variable,relative_importance,scaled_importance,percentage
MB007.CHANGHONG,4.2765799,1.0,0.0770138
MB007.LINGWIN,3.5858400,0.8384831,0.0645748
MB007.MEDIATEK,3.1465077,0.7357533,0.0566632
MB007.MEITU,2.5922594,0.6061525,0.0466821
MB007.COOLPAD,2.5509791,0.5964998,0.0459387
MB007.ZTE,2.2937958,0.5363622,0.0413073
MB007.LENOVO,2.1371155,0.4997254,0.0384858
MB007.IPHONE4,2.1294012,0.4979215,0.0383468
MB007.LETV,2.0535243,0.4801791,0.0369804
MB007.QIKU,2.0506310,0.4795026,0.0369283


In [15]:
y_pred = GLM_WITH.predict(test_hex).as_data_frame()
y_actual = test_hex[target].as_data_frame()
y_pred.head()

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,predict,p0,p1
0,1,0.757716,0.242284
1,0,0.86494,0.13506
2,1,0.794428,0.205572
3,0,0.817654,0.182346
4,0,0.864791,0.135209


In [16]:
GLM_WITH_actual_predict = pd.concat([y_actual,y_pred['predict']],axis=1)
GLM_WITH_actual_predict.columns = ['actual','pred']
GLM_WITH_actual_predict.head()

Unnamed: 0,actual,pred
0,0,1
1,0,0
2,0,1
3,0,0
4,0,0


In [17]:
from sklearn.metrics import roc_auc_score
GLM_WITH_roc_auc_value = roc_auc_score(GLM_WITH_actual_predict['actual'],GLM_WITH_actual_predict['pred'])
GLM_WITH_roc_auc_value

0.5981410256410257

In [18]:
def createGains(model):
    predictions = model.predict(test_hex)
    test_scores = test_hex['loan_default'].cbind(predictions).as_data_frame()

    #sort on prediction (descending), add id, and decile for groups containing 1/10 of datapoints
    test_scores = test_scores.sort_values(by='predict',ascending=False)
    test_scores['row_id'] = range(0,0+len(test_scores))
    test_scores['decile'] = ( test_scores['row_id'] / (len(test_scores)/10) ).astype(int)
    #see count by decile
    test_scores.loc[test_scores['decile'] == 10]=9
    test_scores['decile'].value_counts()

    #create gains table
    gains = test_scores.groupby('decile')['loan_default'].agg(['count','sum'])
    gains.columns = ['count','actual']
    gains

    #add features to gains table
    gains['non_actual'] = gains['count'] - gains['actual']
    gains['cum_count'] = gains['count'].cumsum()
    gains['cum_actual'] = gains['actual'].cumsum()
    gains['cum_non_actual'] = gains['non_actual'].cumsum()
    gains['percent_cum_actual'] = (gains['cum_actual'] / np.max(gains['cum_actual'])).round(2)
    gains['percent_cum_non_actual'] = (gains['cum_non_actual'] / np.max(gains['cum_non_actual'])).round(2)
    gains['if_random'] = np.max(gains['cum_actual']) /10 
    gains['if_random'] = gains['if_random'].cumsum()
    gains['lift'] = (gains['cum_actual'] / gains['if_random']).round(2)
    gains['K_S'] = np.abs( gains['percent_cum_actual'] -  gains['percent_cum_non_actual'] ) * 100
    gains['gain']=(gains['cum_actual']/gains['cum_count']*100).round(2)
    gains = pd.DataFrame(gains)
    return(gains)

createGains(GLM_WITH)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0_level_0,count,actual,non_actual,cum_count,cum_actual,cum_non_actual,percent_cum_actual,percent_cum_non_actual,if_random,lift,K_S,gain
decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,320,89,231,320,89,231,0.15,0.09,60.0,1.48,6.0,27.81
1,320,78,242,640,167,473,0.28,0.18,120.0,1.39,10.0,26.09
2,320,88,232,960,255,705,0.42,0.27,180.0,1.42,15.0,26.56
3,320,63,257,1280,318,962,0.53,0.37,240.0,1.32,16.0,24.84
4,320,78,242,1600,396,1204,0.66,0.46,300.0,1.32,20.0,24.75
5,320,34,286,1920,430,1490,0.72,0.57,360.0,1.19,15.0,22.4
6,320,33,287,2240,463,1777,0.77,0.68,420.0,1.1,9.0,20.67
7,320,46,274,2560,509,2051,0.85,0.79,480.0,1.06,6.0,19.88
8,320,42,278,2880,551,2329,0.92,0.9,540.0,1.02,2.0,19.13
9,320,49,271,3200,600,2600,1.0,1.0,600.0,1.0,0.0,18.75


## Sec 2.1.2: Hyperparameter using Grid Search

In [19]:
from h2o.grid.grid_search import H2OGridSearch

glm_params1 = {
    'alpha': [0, 0.2, 0.4, 0.6, 0.8, 1],  
    'lambda': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]  
}


glm_model = H2OGeneralizedLinearEstimator()

# Perform grid search
grid = H2OGridSearch(glm_model, glm_params1)
grid.train(x=predictors, y=target, training_frame=train_hex)

glm Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,alpha,lambda,model_ids,residual_deviance
,0.0,1e-05,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_1,688.4066102
,0.2,1e-05,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_2,688.5428771
,0.4,1e-05,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_3,688.5770260
,0.6,1e-05,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_4,688.6021770
,0.8,1e-05,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_5,688.6402659
,1.0,1e-05,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_6,688.6597562
,0.0,0.0001,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_7,688.7894516
,0.2,0.0001,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_8,689.3381400
,0.4,0.0001,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_9,689.7824882
,0.6,0.0001,Grid_GLM_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex_model_python_1701971517102_3_model_10,690.5186726


In [20]:
# Get the best GLM model from the grid search
best_glm_model = grid.get_grid()[0]
best_glm_model

Unnamed: 0,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,gaussian,identity,Ridge ( lambda = 1.0E-5 ),76,76,1,Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex

Unnamed: 0,timestamp,duration,iterations,negative_log_likelihood,objective,training_rmse,training_deviance,training_mae,training_r2
,2023-12-07 12:52:48,0.000 sec,0,732.4991667,0.152604,,,,
,2023-12-07 12:52:48,0.050 sec,1,,,0.3787057,0.143418,0.2878647,0.0601947

variable,relative_importance,scaled_importance,percentage
MB007.CHANGHONG,0.8703395,1.0,0.0887145
MB007.LINGWIN,0.7563132,0.8689865,0.0770917
MB007.MEDIATEK,0.6663361,0.7656048,0.0679203
MB007.IPHONE4,0.5048226,0.5800295,0.0514571
MB007.IPAD5,0.4239426,0.4871003,0.0432129
MB007.ZTE,0.3721032,0.4275380,0.0379288
MB007.QIKU,0.3509078,0.4031850,0.0357684
MB007.LETV,0.3381387,0.3885136,0.0344668
MB007.IPHONE8,0.3195485,0.3671539,0.0325719
MB007.IPHONE7,0.3056778,0.3512167,0.0311580


In [21]:
# call the createGains function with the best_glm_model
createGains(best_glm_model)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0_level_0,count,actual,non_actual,cum_count,cum_actual,cum_non_actual,percent_cum_actual,percent_cum_non_actual,if_random,lift,K_S,gain
decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,320,110,210,320,110,210,0.18,0.08,60.0,1.83,10.0,34.38
1,320,75,245,640,185,455,0.31,0.18,120.0,1.54,13.0,28.91
2,320,82,238,960,267,693,0.44,0.27,180.0,1.48,17.0,27.81
3,320,63,257,1280,330,950,0.55,0.37,240.0,1.38,18.0,25.78
4,320,63,257,1600,393,1207,0.66,0.46,300.0,1.31,20.0,24.56
5,320,63,257,1920,456,1464,0.76,0.56,360.0,1.27,20.0,23.75
6,320,53,267,2240,509,1731,0.85,0.67,420.0,1.21,18.0,22.72
7,320,35,285,2560,544,2016,0.91,0.78,480.0,1.13,13.0,21.25
8,320,27,293,2880,571,2309,0.95,0.89,540.0,1.06,6.0,19.83
9,320,29,291,3200,600,2600,1.0,1.0,600.0,1.0,0.0,18.75


In [22]:
# Get the parameters of the best GLM model
best_glm_model_params = best_glm_model.params

# Extract specific parameter values
alpha_value = best_glm_model_params['alpha']
lambda_value = best_glm_model_params['lambda']

# Print the parameter values
print("alpha =", alpha_value)
print("lambda =", lambda_value)

alpha = {'default': None, 'actual': [0.0], 'input': [0.0]}
lambda = {'default': None, 'actual': [1e-05], 'input': [1e-05]}


## Sec 2.1.3: Using best paramaters for whole dataset: GLM with regularization

In [24]:
train_hex2 = h2o.H2OFrame(train)
test_hex2 = h2o.H2OFrame(test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [25]:
# Create a new GLM model with specific hyperparameters
GLM_WITH2 = H2OGeneralizedLinearEstimator(family="binomial",
                                           lambda_search=False,  
                                           alpha=[0.0],          
                                           lambda_=[1e-05])   

# Train the GLM_WITH2 model
GLM_WITH2.train(predictors, target, training_frame=train_hex)

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,binomial,logit,Ridge ( lambda = 1.0E-5 ),76,76,4,Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex

Unnamed: 0,0,1,Error,Rate
0,2340.0,1558.0,0.3997,(1558.0/3898.0)
1,292.0,610.0,0.3237,(292.0/902.0)
Total,2632.0,2168.0,0.3854,(1850.0/4800.0)

metric,threshold,value,idx
max f1,0.1786492,0.3973941,240.0
max f2,0.1257924,0.5640845,307.0
max f0point5,0.2671583,0.3447456,149.0
max accuracy,0.5329636,0.8147917,21.0
max precision,0.8580315,1.0,0.0
max recall,0.0471418,1.0,384.0
max specificity,0.8580315,1.0,0.0
max absolute_mcc,0.1786492,0.2171085,240.0
max min_per_class_accuracy,0.1863738,0.6330377,231.0
max mean_per_class_accuracy,0.1786492,0.6382914,240.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.502106,3.1042129,3.1042129,0.5833333,0.5807068,0.5833333,0.5807068,0.0310421,0.0310421,210.421286,210.421286,0.0259113
2,0.02,0.4464584,2.3281596,2.7161863,0.4375,0.4721133,0.5104167,0.52641,0.0232816,0.0543237,132.8159645,171.6186253,0.0422663
3,0.03,0.4158372,2.2172949,2.5498891,0.4166667,0.4278112,0.4791667,0.4935437,0.0221729,0.0764967,121.72949,154.9889135,0.057256
4,0.04,0.3952795,1.4412417,2.2727273,0.2708333,0.4058273,0.4270833,0.4716146,0.0144124,0.0909091,44.1241685,127.2727273,0.0626895
5,0.05,0.3783618,1.3303769,2.0842572,0.25,0.3871036,0.3916667,0.4547124,0.0133038,0.1042129,33.037694,108.4257206,0.0667578
6,0.1,0.317717,1.8181818,1.9512195,0.3416667,0.3446541,0.3666667,0.3996832,0.0909091,0.195122,81.8181818,95.1219512,0.1171332
7,0.15,0.2783629,1.6851441,1.8625277,0.3166667,0.2954404,0.35,0.3649356,0.0842572,0.2793792,68.5144124,86.2527716,0.1593176
8,0.2,0.2516521,1.3968958,1.7461197,0.2625,0.2651024,0.328125,0.3399773,0.0698448,0.3492239,39.6895787,74.6119734,0.1837545
9,0.3,0.2158841,1.3414634,1.6112343,0.2520833,0.2321678,0.3027778,0.3040408,0.1341463,0.4833703,34.1463415,61.1234294,0.2258023
10,0.4,0.1886961,1.2638581,1.5243902,0.2375,0.2020883,0.2864583,0.2785527,0.1263858,0.6097561,26.3858093,52.4390244,0.2582938

Unnamed: 0,timestamp,duration,iterations,negative_log_likelihood,objective,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
,2023-12-07 12:53:23,0.000 sec,0,2319.3022535,0.483188,,,,,,,
,2023-12-07 12:53:23,0.032 sec,1,2183.7481741,0.4553885,,,,,,,
,2023-12-07 12:53:23,0.046 sec,2,2177.1065557,0.4539714,,,,,,,
,2023-12-07 12:53:23,0.057 sec,3,2176.4592919,0.453925,,,,,,,
,2023-12-07 12:53:23,0.065 sec,4,2176.3668983,0.4539226,0.3787529,0.4534098,0.0599608,0.6746406,0.3229427,3.1042129,0.3854167

variable,relative_importance,scaled_importance,percentage
MB007.CHANGHONG,4.0035114,1.0,0.0677270
MB007.LINGWIN,3.3633845,0.8401086,0.0568980
MB007.MEDIATEK,2.9575913,0.7387493,0.0500333
MB007.MEITU,2.4726684,0.6176249,0.0418299
MB007.COOLPAD,2.4082487,0.6015341,0.0407401
MB007.IPHONE4,2.2917428,0.5724332,0.0387692
MB007.ZTE,2.2763736,0.5685943,0.0385092
MB007.LENOVO,2.0673900,0.5163942,0.0349738
MB007.LETV,2.0414672,0.5099192,0.0345353
MB007.QIKU,2.0236623,0.5054719,0.0342341


In [26]:
y_pred2 = GLM_WITH2.predict(test_hex).as_data_frame()
y_actual2 = test_hex[target].as_data_frame()
y_pred.head()

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,predict,p0,p1
0,1,0.757716,0.242284
1,0,0.86494,0.13506
2,1,0.794428,0.205572
3,0,0.817654,0.182346
4,0,0.864791,0.135209


In [27]:
GLM_WITH2_actual_predict = pd.concat([y_actual2,y_pred2['predict']],axis=1)
GLM_WITH2_actual_predict.columns = ['actual','pred']
GLM_WITH2_actual_predict.head()

Unnamed: 0,actual,pred
0,0,1
1,0,0
2,0,1
3,0,1
4,0,0


In [28]:
GLM_WITH2_roc_auc_value = roc_auc_score(GLM_WITH2_actual_predict['actual'],GLM_WITH2_actual_predict['pred'])
GLM_WITH2_roc_auc_value

0.6029487179487178

In [29]:
createGains(GLM_WITH2)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0_level_0,count,actual,non_actual,cum_count,cum_actual,cum_non_actual,percent_cum_actual,percent_cum_non_actual,if_random,lift,K_S,gain
decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,320,91,229,320,91,229,0.15,0.09,60.0,1.52,6.0,28.44
1,320,87,233,640,178,462,0.3,0.18,120.0,1.48,12.0,27.81
2,320,76,244,960,254,706,0.42,0.27,180.0,1.41,15.0,26.46
3,320,69,251,1280,323,957,0.54,0.37,240.0,1.35,17.0,25.23
4,320,76,244,1600,399,1201,0.66,0.46,300.0,1.33,20.0,24.94
5,320,39,281,1920,438,1482,0.73,0.57,360.0,1.22,16.0,22.81
6,320,29,291,2240,467,1773,0.78,0.68,420.0,1.11,10.0,20.85
7,320,43,277,2560,510,2050,0.85,0.79,480.0,1.06,6.0,19.92
8,320,45,275,2880,555,2325,0.92,0.89,540.0,1.03,3.0,19.27
9,320,45,275,3200,600,2600,1.0,1.0,600.0,1.0,0.0,18.75


### GLM Analysis:

1. Hyperparameter process: I used Grid search to specify a set of values for each hyperparameter that I want to search over, and H2O will train a model for every combination of the hyperparameter values. This means that I can get the best combination of model parameters, find the best RMSE for the model;
2. Best GLM model: After grid search, it gave me the best parameter combination, with alpha=0.0,lambda_=1e-05(0.00001);
3. With these parameters, the ROC reached 0.603 and lift reached 1.52, showing that the decile 0 have 1.52 times more likely to contain actual positive cases than random chance.

## Sec 2.2: AutoML

### What's AutoML?
AutoML stands for Automatic Machine Learning. As the complexity of ML tasks is often beyond non-ML-experts, the rapid growth of machine learning applications has created a demand for off-the-shelf machine learning methods that can be used easily and without expert knowledge. We call the resulting research area that targets progressive automation of machine learning AutoML.


### Why AutoML?
Anomaly detection often involves dealing with complex and diverse data. AutoML frameworks can automatically handle the selection and optimization of models, handling various types of features, and choosing appropriate algorithms to address the challenges posed by diverse datasets. AutoML tools can explore and evaluate a variety of anomaly detection algorithms and configurations, helping to identify the most effective models for a given dataset. This can save time and effort compared to manual exploration and selection. Anomaly detection models often have hyperparameters that need to be tuned for optimal performance. AutoML frameworks can automatically search the hyperparameter space to find the best combination for a given model and dataset, improving the model's effectiveness.

In [30]:
from h2o.automl import H2OAutoML
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=predictors, y=target, training_frame=train_hex)

AutoML progress: |
12:53:42.873: _train param, Dropping bad and constant columns: [TD054, TD051, TD062, TD061, CR012, TD048, AP005, TD055]
12:53:42.873: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

██
12:53:53.407: _train param, Dropping bad and constant columns: [TD054, TD051, TD062, TD061, CR012, TD029, TD028, TD027, TD026, TD048, TD025, AP005, TD055]
12:53:53.407: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
12:53:54.283: _train param, Dropping bad and constant columns: [TD054, TD051, TD062, TD061, CR012, TD048, AP005, TD055]
12:53:54.284: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish 

Unnamed: 0,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,gaussian,identity,Ridge ( lambda = 0.123 ),"nlambda = 30, lambda.max = 5.5601, lambda.min = 0.123, lambda.1se = 5.5601",76,76,9,AutoML_1_20231207_125342_training_Key_Frame__upload_928b0a2e75a26c43e8d4b4ae549e4ecd.hex

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.293544,0.0044563,0.2992928,0.2902914,0.2955267,0.2946093,0.2879997
mean_residual_deviance,0.1469191,0.0082715,0.1571141,0.1495056,0.1454152,0.1482512,0.1343095
mse,0.1469191,0.0082715,0.1571141,0.1495056,0.1454152,0.1482512,0.1343095
null_deviance,146.60123,9.183244,159.60631,148.75443,144.38445,146.25026,134.01068
r2,0.0357508,0.0089954,0.0508828,0.0350153,0.0330202,0.0268639,0.032972
residual_deviance,141.04236,7.9405966,150.82956,143.52539,139.59865,142.32115,128.93707
rmse,0.3831772,0.0108616,0.3963762,0.3866596,0.3813335,0.385034,0.3664825
rmsle,0.2688488,0.0049575,0.2741535,0.2699195,0.2688968,0.2705562,0.2607183

Unnamed: 0,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_xval,deviance_se,alpha,iterations,training_rmse,training_deviance,training_mae,training_r2
,2023-12-07 12:53:54,0.000 sec,1,5.6,77,0.1487652,0.149478,0.0041031,0.0,,,,,
,2023-12-07 12:53:54,0.002 sec,2,3.5,77,0.1480608,0.1488541,0.0040491,0.0,,,,,
,2023-12-07 12:53:54,0.004 sec,3,2.1,77,0.1474213,0.1483111,0.0039893,0.0,,,,,
,2023-12-07 12:53:54,0.006 sec,4,1.3,77,0.1468541,0.1478593,0.0039248,0.0,,,,,
,2023-12-07 12:53:54,0.007 sec,5,0.83,77,0.1463703,0.1475032,0.003857,0.0,,,,,
,2023-12-07 12:53:54,0.009 sec,6,0.51,77,0.1459768,0.1472454,0.0037911,0.0,,,,,
,2023-12-07 12:53:54,0.010 sec,7,0.32,77,0.1456689,0.1470873,0.0037308,0.0,,,,,
,2023-12-07 12:53:54,0.012 sec,8,0.2,77,0.1454274,0.1470125,0.0036809,0.0,,,,,
,2023-12-07 12:53:54,0.014 sec,9,0.12,77,0.1452307,0.1470007,0.003645,0.0,,,,,
,2023-12-07 12:53:54,0.015 sec,10,0.076,77,0.1450624,0.1470347,0.003623,0.0,10.0,0.3810915,0.1452307,0.2918232,0.0483163

variable,relative_importance,scaled_importance,percentage
AP003,0.0356631,1.0,0.0886784
TD001,0.0225199,0.6314624,0.0559970
TD013,0.0210463,0.5901427,0.0523329
MB005,0.0177231,0.4969583,0.0440694
MB007.OPPO,0.0173013,0.4851325,0.0430208
TD023,0.0167070,0.4684681,0.0415430
PA029,0.0145900,0.4091067,0.0362789
AP006.android,0.0124955,0.3503753,0.0310707
CR015,0.0121131,0.3396521,0.0301198
MB007.VIVO,0.0113582,0.3184857,0.0282428


In [31]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
GLM_1_AutoML_1_20231207_125342,0.383407,0.147001,0.29361,0.26904,0.147001
StackedEnsemble_AllModels_1_AutoML_1_20231207_125342,0.383844,0.147336,0.294929,0.269212,0.147336
StackedEnsemble_BestOfFamily_1_AutoML_1_20231207_125342,0.383924,0.147398,0.295146,0.269262,0.147398
GBM_1_AutoML_1_20231207_125342,0.385855,0.148884,0.2949,0.271125,0.148884
GBM_grid_1_AutoML_1_20231207_125342_model_2,0.386242,0.149183,0.295347,0.271121,0.149183
GBM_5_AutoML_1_20231207_125342,0.388186,0.150688,0.296209,0.273239,0.150688
GBM_2_AutoML_1_20231207_125342,0.388204,0.150702,0.293971,0.273079,0.150702
DeepLearning_1_AutoML_1_20231207_125342,0.388668,0.151062,0.291147,0.273121,0.151062
DeepLearning_grid_3_AutoML_1_20231207_125342_model_1,0.389829,0.151967,0.280813,0.270947,0.151967
GBM_3_AutoML_1_20231207_125342,0.390339,0.152364,0.29485,0.275127,0.152364


### AutoML Analysis:

1. The AutoML process is to train multiple ML models such as deep learning, XGBoost, DRF, GBM, and compare the rmse, mse,mae, rmsle,mean_residual_deviance to find the best model among all models;
2. Best model: After AutoML, we can see that the GLM_1_AutoML_1_20231207_125342 has the best model performance for this dataset, with achieved rmse=0.383407, mean_residual_deviance=0.147001.