# Import relevant libraries

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch

In [2]:
# import encoders to deal with hccv's
from category_encoders import *

In [3]:
h2o.remove_all # clear h2o data

<function h2o.h2o.remove_all(retained=None)>

In [4]:
# Set directories
print(os.getcwd())
dirRawData = "../input/"
dirPData   = "../PData/"
dirPOutput = "../POutput/"

/home/jovyan/Projects/Fraud/PCode


# load data pickle from reading data notebook

In [5]:
f_name = dirPData + '01_df_250k.pickle'

with (open(f_name, "rb")) as f:
    dict_ = pickle.load(f)

df_train = dict_['df_train']
df_test  = dict_['df_test']

del f_name, dict_

f_name = dirPData + '01_vars.pickle'

with open(f_name, "rb") as f:
    dict_ = pickle.load(f)

vars_ind_numeric     = dict_['vars_ind_numeric']
vars_ind_hccv        = dict_['vars_ind_hccv']
vars_ind_categorical = dict_['vars_ind_categorical']
vars_notToUse        = dict_['vars_notToUse']
var_dep              = dict_['var_dep']

del f_name, dict_

In [6]:
# remove variables which remains constant
vars_ind_categorical.remove('c02')
vars_ind_numeric.remove('e16')

In [8]:
## constant variables added to notTouse 
vars_notToUse.append('c02')
vars_notToUse.append('e16')

In [9]:
# combine categorical and numeric variables 
vars_ind = vars_ind_categorical + vars_ind_numeric

In [68]:
var_dep

['target']

In [10]:
# we tried with and without standardising before splines but doing so before seems to worsen auc
# standardisation for train set

for var in vars_ind_numeric:
    x = df_train[var]
    x -= np.mean(x, axis=0)
    x /= np.sqrt(np.mean(x ** 2, axis=0))
    df_train[var] = x

In [11]:
# standardisation for train set

for var in vars_ind_numeric:
    x = df_test[var]
    x -= np.mean(x, axis=0)
    x /= np.sqrt(np.mean(x ** 2, axis=0))
    df_test[var] = x

In [12]:
# spline numeric variables which have more than 8 unique values, 
# we exerimented with different cutoff values apart from 8 
#but this didn't make much of a difference to the list of variables selected to spline

vars_ind_tospline = df_train[vars_ind_numeric].columns[(df_train[vars_ind_numeric].nunique() > 8)].tolist()

In [13]:
# adding splines to the numeric variables we randomly chose
# some numeric variables may have non-linear effects so we spline them
# the lecturer kept original variables in dataset so we thought to do the same
def fn_tosplines(varname,x):
    ptiles = np.percentile(x, [10, 20, 40, 60, 80, 90])
    ptiles = np.unique(ptiles)
    df_ptiles = pd.DataFrame({var: x})
    for idx, ptile in enumerate(ptiles):
        df_ptiles[var + '_' + str(idx)] = np.maximum(0, x - ptiles[idx])
    return(df_ptiles)

In [14]:
# create splines for train dataset
for var in vars_ind_tospline:
    df_ptiles = fn_tosplines(var,df_train[var])
    df_train.drop(columns=[var], inplace=True)
    vars_ind_numeric.remove(var)
    df_train = pd.concat([df_train, df_ptiles], axis=1, sort=False)
    vars_ind_numeric.extend(df_ptiles.columns.tolist())

In [15]:
# create splines for test dataset
for var in vars_ind_tospline:
    df1_ptiles = fn_tosplines(var,df_test[var])
    df_test.drop(columns=[var], inplace=True)
    vars_ind_numeric.remove(var)
    df_test = pd.concat([df_test, df1_ptiles], axis=1, sort=False)
    vars_ind_numeric.extend(df1_ptiles.columns.tolist())

In [16]:
len(vars_ind_numeric)

169

In [17]:
# update variable list
vars_ind = vars_ind_categorical + vars_ind_numeric

In [18]:
#set y variable for train set
y_train = df_train[var_dep].values.ravel()

In [19]:
#target encoder
# we tried target encoder but since we dont have y values for the test set we didn't us this

#enc = TargetEncoder(cols=['e17','e18','e19','f10'])

#df_encoded = enc.fit_transform(df_train, y_train)

In [20]:
#binary encoder
# we tried out binary encoder and it seemed to work well while not taking up much space in our dataframes
enc = BinaryEncoder(cols=['e17','e18','e19','f10'])

enc.fit(df_train[vars_ind_hccv])
df_encoded = enc.transform(df_train[vars_ind_hccv])
df_encoded[vars_ind_hccv] = df_train[vars_ind_hccv]
df_encoded.head()

#df_encoded = enc.fit_transform(df_train, y_train)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,e17_0,e17_1,e17_2,e17_3,e17_4,e17_5,e17_6,e17_7,e18_0,e18_1,...,f10_6,f10_7,f10_8,f10_9,f10_10,f10_11,e17,e18,e19,f10
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,38CF0,D556D,D2AFA,CHO
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,38CF0,D556D,D2AFA,AIR
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,38CF0,D556D,D2AFA,CHE
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,B6A15,E7317,CE2BB,CPH
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,B6A15,E7317,CE2BB,CUO


In [21]:
# drop encoded columns from df to avoid duplicates
df_encoded=df_encoded.drop(['e17', 'e18','e19','f10'], axis=1)

In [22]:
# extend categorical feature names with those resulting from encoding
vars_ind_categorical.extend(df_encoded.columns.tolist())

In [23]:
# concatenate original training df with encoded one
df_train = pd.concat([df_train, df_encoded], axis=1, sort=False)

In [24]:
# drop original variables from dataset as we have replaced them with encoded ones
df_train=df_train.drop(['e17', 'e18','e19','f10'], axis=1)

In [25]:
# binary encoder for test data
# same process as above for train data
enc = BinaryEncoder(cols=['e17','e18','e19','f10'])

enc.fit(df_test[vars_ind_hccv])
df_encoded = enc.transform(df_test[vars_ind_hccv])
df_encoded[vars_ind_hccv] = df_test[vars_ind_hccv]
df_encoded.head()


  elif pd.api.types.is_categorical(cols):


Unnamed: 0,e17_0,e17_1,e17_2,e17_3,e17_4,e17_5,e17_6,e17_7,e18_0,e18_1,...,f10_6,f10_7,f10_8,f10_9,f10_10,f10_11,e17,e18,e19,f10
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,920BD,11250,ECD18,CKG
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,F33BC,5614E,D9556,AIR
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,1,F33BC,5614E,D9556,AEL
3,0,0,0,0,0,0,1,1,0,0,...,0,0,0,1,0,0,861C8,9076B,463D0,CUO
4,0,0,0,0,0,0,1,1,0,0,...,0,0,0,1,0,1,861C8,9076B,463D0,AWV


In [26]:
# drop encoded columns from df to avoid duplicates
df_encoded=df_encoded.drop(['e17', 'e18','e19','f10'], axis=1)

In [27]:
# extend categorical feature names with those resulting from encoding
df_test = pd.concat([df_test, df_encoded], axis=1, sort=False)

In [28]:
# concatenate original training df with encoded one
df_test=df_test.drop(['e17', 'e18','e19','f10'], axis=1)

In [29]:
# we remove original variables that have been encoded from the list 
vars_ind_categorical.remove('e17')
vars_ind_categorical.remove('e18')
vars_ind_categorical.remove('e19')
vars_ind_categorical.remove('f10')


In [30]:
# these spline does not appear in test set when standardising earlier so we remove them from the list
vars_ind_numeric.remove('e09_4') 
vars_ind_numeric.remove('f01_4')

In [31]:
# update variable list
vars_ind = vars_ind_categorical + vars_ind_numeric

In [32]:
# add features engineered through division of numerics for train set
# we randomly chose some features to divide which did not have zeros to avoid values of infinity
# introduction of these features didn't seem to improve our auc
df_train['x0'] = df_train['e04']/df_train['e06']
df_train['x1'] = df_train['f13']/df_train['f02']
df_train['x2'] = df_train['e09']/df_train['f11']

# manually add interactions as the interaction_pairs argument didn't work in h2o
# we randomly chose some interactions between some variables for training set
# introduction of these features didn't seem to improve our auc
df_train['x3'] = df_train['e04']*df_train['e06']
df_train['x4'] = df_train['f13']*df_train['f02']
df_train['x5'] = df_train['e09']*df_train['f11']

In [33]:
# same as above for test set
df_test['x0'] = df_test['e04']/df_test['e06']
df_test['x1'] = df_test['f13']/df_test['f02']
df_test['x2'] = df_test['e09']/df_test['f11']

# same as above for test set
df_test['x3'] = df_test['e04']*df_test['e06']
df_test['x4'] = df_test['f13']*df_test['f02']
df_test['x5'] = df_test['e09']*df_test['f11']

In [34]:
# add new vars to vars_ind index
vars_ind.append('x0')
vars_ind.append('x1')
vars_ind.append('x2')
vars_ind.append('x3')
vars_ind.append('x4')
vars_ind.append('x5')

In [35]:
#check length of variables list
len(vars_ind)

261

In [36]:
# make sure no variables are repeated so we use set and then pass to list again
myset=set(vars_ind)

print(len(myset))

vars_ind=list(myset)


193


In [37]:
var_dep

['target']

In [38]:
df_train

Unnamed: 0,target,a01,a02,a03,a10,a12,a13,a16,a17,a18,...,f10_8,f10_9,f10_10,f10_11,x0,x1,x2,x3,x4,x5
0,1,D,C,H,Z,C,A,Z,Z,F,...,0,0,0,1,0.192403,-1.938869,-1.074925,0.216317,-3.662846,-1.238058
1,1,D,C,H,Z,C,A,Z,Z,F,...,0,0,1,0,0.192403,-0.491062,-0.875122,0.216317,-0.277127,-1.520726
2,0,D,C,H,Z,C,A,Z,Z,F,...,0,0,1,1,0.192403,-3.378403,-1.392959,0.216317,-0.380800,-0.955391
3,1,B,H,H,N,A,A,Y,N,F,...,0,1,0,0,-0.494890,-31.171417,-2.785252,-0.556401,-0.018121,-0.024136
4,0,B,H,H,N,A,A,Y,N,F,...,0,1,0,1,-0.494890,0.615723,-0.313070,-0.556401,0.221019,-0.214726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,1,B,A,H,N,A,A,Y,N,F,...,0,1,1,0,1.239706,0.942550,-0.875122,1.393792,0.144381,-1.520726
249996,0,B,A,B,N,A,A,N,N,F,...,0,1,0,1,-0.986678,0.240482,-1.392959,-2.682268,0.565889,-0.955391
249997,1,B,A,B,N,A,A,N,N,F,...,0,1,0,1,-0.986678,0.278152,-1.392959,-2.682268,0.489251,-0.955391
249998,0,B,A,B,N,A,A,N,N,F,...,0,1,0,1,-0.986678,-0.777890,-1.392959,-2.682268,-0.174943,-0.955391


In [39]:
df_all=df_train.copy()

In [48]:
# for clarity later on set df_all equal to complete training set
df_all=df_train.copy()

In [49]:
# sample from df_all and df_test for faster model computations
df_all=df_all.sample(10000, random_state=2021)

df_test_50=df_test.sample(10000, random_state=2021)

In [50]:
# add folds to randomly distinguish between train and validation sets
rng = np.random.RandomState(2018)
fold = rng.randint(0, 10, df_all.shape[0]) # takes values 0-9 inclusive
df_all['fold'] = fold

In [51]:
# set index according to fold value
idx_train  = df_all['fold'].isin(range(9))
idx_val    = df_all['fold'].isin([9, 10])
idx_design = df_all['fold'].isin(range(10))

print("number of train examples",    np.sum(idx_train == 1))
print("number of val examples",      np.sum(idx_val == 1))
print("number of design examples",   np.sum(idx_design == 1))

number of train examples 9046
number of val examples 954
number of design examples 10000


In [52]:
#initialise h2o connection
h2o.init(ip='localhost', 
         port=54321, 
         https=False,enable_assertions = False)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 day 2 hours 58 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.2
H2O_cluster_version_age:,2 months and 16 days
H2O_cluster_name:,H2O_from_python_jovyan_w8onbs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.554 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [53]:
#pass df_all to h2o

h2o_df_all = h2o.H2OFrame(df_all[vars_ind + var_dep + ['fold']],
                          destination_frame = 'df_all')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [54]:
#pass index to h2o

idx_h2o_train  = h2o.H2OFrame(idx_train.astype('int').values,  
                              destination_frame = 'idx_h2o_train')
idx_h2o_val    = h2o.H2OFrame(idx_val.astype('int').values  ,  
                              destination_frame = 'idx_h2o_val')
idx_h2o_design = h2o.H2OFrame(idx_design.astype('int').values, 
                              destination_frame = 'idx_h2o_design')


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [55]:
#set manually

h2o_df_design = h2o_df_all[idx_h2o_design, :]
h2o_df_train  = h2o_df_all[idx_h2o_train, :]
h2o_df_val    = h2o_df_all[idx_h2o_val, :]

In [61]:
#pass test df to h2o
h2o_df_test=h2o.H2OFrame(df_test[vars_ind],
                          destination_frame = 'df_test')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [124]:
# Set up the hyper parameters to search over
# We will  search over 6 values for alpha  

alpha_opts = [0.1,0.2,0.4,0.6,0.75,0.99]

#alpha_opts = np.arange(0, 1, 0.).tolist()

hyper_parameters = {"alpha":alpha_opts}

In [125]:
criteria = {# The default strategy, "Cartesian", covers the entire space of h-p combinations. 
            "strategy": "RandomDiscrete", 
            "max_models": 6,
            "stopping_metric": "AUTO",
            "seed": 2021}

grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family="binomial",
                                                   nfolds = 10,
                                                   lambda_search=True),
                     hyper_params=hyper_parameters,
                     grid_id='g5',
                     search_criteria=criteria
                    )

grid.train(y = var_dep,
           x = vars_ind, 
           training_frame=h2o_df_all[idx_h2o_train, :],
           validation_frame=h2o_df_all[idx_h2o_val,:] 
           )

glm Grid Build progress: |████████████████████████████████████████████████| 100%


In [126]:
grid = grid.get_grid(sort_by='AUC', decreasing=True)
grid

       alpha    model_ids                 auc
0      [0.4]   g5_model_3  0.7271663385975469
1     [0.99]   g5_model_2  0.7271206364410034
2      [0.6]   g5_model_4  0.7268291929144829
3     [0.75]   g5_model_6  0.7266378990963563
4      [0.2]   g5_model_5  0.7262986067356209
5      [0.1]   g5_model_1  0.7259440096866927
6     [0.99]   g5_model_8  0.7155528109982147
7     [0.75]  g5_model_12  0.7141706814495349
8      [0.6]  g5_model_10  0.7128647189891542
9      [0.2]  g5_model_11  0.7103943484998237
10     [0.4]   g5_model_9  0.7101319924796978
11     [0.1]   g5_model_7  0.7074168411103205




In [113]:
df_all

Unnamed: 0,target,a01,a02,a03,a10,a12,a13,a16,a17,a18,...,f10_9,f10_10,f10_11,x0,x1,x2,x3,x4,x5,fold
211451,0,B,C,H,N,A,A,N,N,F,...,1,1,0,0.941815,0.729448,-1.434972,0.872459,0.215457,-0.591435,6
96383,1,B,A,D,N,A,A,Y,N,F,...,0,0,0,-0.018496,0.008338,-3.411868,-0.050282,0.022742,-0.390056,2
55772,0,B,C,H,N,A,A,Y,N,F,...,1,0,1,-0.331249,0.246036,-1.392959,-0.372421,0.553116,-0.955391,9
205550,0,B,A,E,N,A,A,Y,N,F,...,1,0,1,0.846968,0.301789,-1.392959,0.952239,0.450933,-0.955391,5
157239,1,B,A,H,N,A,A,Y,N,F,...,1,1,0,0.545204,0.012548,-0.305297,0.505054,0.015113,-0.578949,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188056,0,B,C,H,N,A,A,N,N,F,...,1,0,1,-95.187234,-0.074988,-3.221876,-0.015606,-0.002529,-0.507713,3
123186,1,B,B,H,N,A,A,N,N,F,...,1,0,1,-1.149455,0.225231,2.019488,-1.292323,0.604208,1.385110,4
157104,1,B,A,B,N,A,A,Y,N,F,...,0,0,1,0.364926,-0.027805,-0.444618,0.338052,-0.006820,-0.151196,7
40243,0,B,C,H,N,A,A,N,N,F,...,0,1,0,0.761537,0.010838,0.158408,0.705457,0.017497,0.182449,4


In [70]:
# again with more narrower alpha range

alpha_opts = [0.1,0.2,0.4,0.99]

#alpha_opts = np.arange(0, 1, 0.).tolist()

hyper_parameters = {"alpha":alpha_opts}

criteria = {# The default strategy, "Cartesian", covers the entire space of h-p combinations. 
            "strategy": "RandomDiscrete", 
            "max_models": 2,
            "stopping_metric": "AUTO",
            "seed": 2021}

grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family="binomial",
                                                   nfolds = 10,
                                                   lambda_search=True),
                     hyper_params=hyper_parameters,
                     grid_id='g6',
                     search_criteria=criteria
                    )

grid.train(y = var_dep,
           x = vars_ind, 
           training_frame=h2o_df_all[idx_h2o_train, :],
           validation_frame=h2o_df_all[idx_h2o_val,:] 
           )

glm Grid Build progress: |████████████████████████████████████████████████| 100%


In [71]:
grid1 = grid.get_grid(sort_by='AUC', decreasing=True)
grid1

     alpha   model_ids                 auc
0    [0.4]  g6_model_1  0.7296772366190788
1    [0.2]  g6_model_2  0.7296077038833862




In [56]:
# train penalised logistic regression model on training set and pass validation frame as well. 
# we experimented with grid search first and got different results for optimal value
# of alpha depending on max models, runtime and nfolds, eventually we chose 0.4 and ran on 250k train set
# we set nlambdas to 20 as this was default
# we choose 10 nfolds for computation time
#interaction_pairs = [('a01', 'a02')] - did not work



model=H2OGeneralizedLinearEstimator(alpha=0.4, 
                                    lambda_search=True,
                                    lambda_min_ratio=1e-8,
                                    nlambdas=20,
                                    nfolds=10,
                                    early_stopping=True,
                                    family='binomial',
                                    #interaction_pairs=interaction_pairs, - did not work
                                    # we already standardised above
                                    standardize=False,
                                    seed=2020)

model.train(x=vars_ind, 
            y='target',
            training_frame=h2o_df_all[idx_h2o_train, :],
           validation_frame=h2o_df_all[idx_h2o_val,:])

# Predict the model on train and val
model_pred_train = model.predict(h2o_df_all[idx_h2o_train, :])
model_pred_val   = model.predict(h2o_df_all[idx_h2o_val, :])

model_pred_train = model_pred_train.as_data_frame().values.ravel()
model_pred_val   = model_pred_val.as_data_frame().values.ravel()



glm Model Build progress: |███████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%


In [60]:
model

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  GLM_model_python_1626346102456_11562


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.4, lambda = 6.193E-4 )","nlambda = 20, lambda.max = 0.2081, lambda.min = 6.193E-4, lambda.1...",368,213,16,py_1_sid_8c62




ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.20372932263675242
RMSE: 0.45136384728592566
LogLoss: 0.5925776267276079
Null degrees of freedom: 9045
Residual degrees of freedom: 8832
Null deviance: 12537.298558247585
Residual deviance: 10720.914422755885
AIC: 11148.914422755885
AUC: 0.7480757535534752
AUCPR: 0.7416168129386597
Gini: 0.49615150710695044

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3733317967604742: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,2091.0,2516.0,0.5461,(2516.0/4607.0)
1,1,634.0,3805.0,0.1428,(634.0/4439.0)
2,Total,2725.0,6321.0,0.3482,(3150.0/9046.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.373332,0.707249,264.0
1,max f2,0.215188,0.833014,343.0
2,max f0point5,0.543278,0.682899,172.0
3,max accuracy,0.511443,0.683175,189.0
4,max precision,0.980507,1.0,0.0
5,max recall,0.050076,1.0,395.0
6,max specificity,0.980507,1.0,0.0
7,max absolute_mcc,0.511443,0.366359,189.0
8,max min_per_class_accuracy,0.485867,0.67875,202.0
9,max mean_per_class_accuracy,0.511443,0.682326,189.0



Gains/Lift Table: Avg response rate: 49.07 %, avg score: 49.07 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.01006,0.932025,1.970665,1.970665,0.967033,0.945884,0.967033,0.945884,0.019824,0.019824,97.066461,97.066461,0.019173
1,2,0.020009,0.913172,1.856704,1.913999,0.911111,0.922624,0.939227,0.934318,0.018473,0.038297,85.670446,91.399935,0.035909
2,3,0.030069,0.891737,1.858695,1.895497,0.912088,0.902237,0.930147,0.923585,0.018698,0.056995,85.869503,89.54968,0.052871
3,4,0.040018,0.869284,1.834062,1.880223,0.9,0.881301,0.922652,0.913073,0.018247,0.075242,83.406173,88.022289,0.069164
4,5,0.050077,0.847561,1.701938,1.844408,0.835165,0.859107,0.905077,0.902232,0.017121,0.092363,70.193762,84.440841,0.08303
5,6,0.100044,0.75732,1.722251,1.783397,0.845133,0.795777,0.875138,0.849063,0.086055,0.178419,72.225069,78.339704,0.153891
6,7,0.150011,0.705012,1.54191,1.702961,0.756637,0.729486,0.835667,0.809234,0.077044,0.255463,54.19103,70.296078,0.207058
7,8,0.200088,0.66448,1.525011,1.658424,0.748344,0.684267,0.813812,0.777958,0.076369,0.331831,52.501085,65.842414,0.258682
8,9,0.300022,0.597128,1.291688,1.536269,0.63385,0.629866,0.753869,0.72863,0.129083,0.460915,29.168801,53.626885,0.315918
9,10,0.400066,0.539274,1.20019,1.452226,0.58895,0.568036,0.712628,0.68847,0.120072,0.580987,20.019018,45.222596,0.355243




ModelMetricsBinomialGLM: glm
** Reported on validation data. **

MSE: 0.21941782911135557
RMSE: 0.46842056862541337
LogLoss: 0.6288541412060552
Null degrees of freedom: 953
Residual degrees of freedom: 740
Null deviance: 1323.5225807934007
Residual deviance: 1199.8537014211533
AIC: 1627.8537014211533
AUC: 0.7017933769476979
AUCPR: 0.6936828729796906
Gini: 0.4035867538953959

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3974218144281553: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,199.0,269.0,0.5748,(269.0/468.0)
1,1,77.0,409.0,0.1584,(77.0/486.0)
2,Total,276.0,678.0,0.3627,(346.0/954.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.397422,0.702749,262.0
1,max f2,0.168844,0.844382,372.0
2,max f0point5,0.476621,0.661593,210.0
3,max accuracy,0.476621,0.657233,210.0
4,max precision,0.964324,1.0,0.0
5,max recall,0.108156,1.0,390.0
6,max specificity,0.964324,1.0,0.0
7,max absolute_mcc,0.476621,0.31408,210.0
8,max min_per_class_accuracy,0.498318,0.646091,196.0
9,max mean_per_class_accuracy,0.476621,0.656458,210.0



Gains/Lift Table: Avg response rate: 50.94 %, avg score: 50.10 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010482,0.926318,1.374074,1.374074,0.7,0.93934,0.7,0.93934,0.014403,0.014403,37.407407,37.407407,0.007993
1,2,0.020964,0.916581,1.962963,1.668519,1.0,0.922712,0.85,0.931026,0.020576,0.034979,96.296296,66.851852,0.028569
2,3,0.030398,0.888327,1.526749,1.624521,0.777778,0.898374,0.827586,0.920892,0.014403,0.049383,52.674897,62.452107,0.038699
3,4,0.040881,0.854179,1.962963,1.711301,1.0,0.876158,0.871795,0.909422,0.020576,0.069959,96.296296,71.130104,0.059275
4,5,0.050314,0.831377,1.526749,1.676698,0.777778,0.84245,0.854167,0.896865,0.014403,0.084362,52.674897,67.669753,0.069405
5,6,0.100629,0.746294,1.594907,1.635802,0.8125,0.78413,0.833333,0.840497,0.080247,0.164609,59.490741,63.580247,0.130421
6,7,0.149895,0.706334,1.21119,1.496244,0.617021,0.726634,0.762238,0.803074,0.059671,0.22428,21.118991,49.62445,0.15163
7,8,0.20021,0.667145,1.308642,1.449098,0.666667,0.687789,0.73822,0.774102,0.065844,0.290123,30.864198,44.909831,0.183286
8,9,0.29979,0.589926,1.281092,1.393292,0.652632,0.629183,0.70979,0.725964,0.127572,0.417695,28.109162,39.329189,0.240345
9,10,0.400419,0.542389,1.267747,1.361741,0.645833,0.565288,0.693717,0.685585,0.127572,0.545267,26.774691,36.174132,0.295267




ModelMetricsBinomialGLM: glm
** Reported on cross-validation data. **

MSE: 0.21296689236233238
RMSE: 0.46148336087266717
LogLoss: 0.613891682059749
Null degrees of freedom: 9045
Residual degrees of freedom: 8833
Null deviance: 12539.460913903944
Residual deviance: 11106.528311824979
AIC: 11532.528311824979
AUC: 0.7198259913108122
AUCPR: 0.714937758174933
Gini: 0.4396519826216243

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.37800359036505526: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,2041.0,2566.0,0.557,(2566.0/4607.0)
1,1,758.0,3681.0,0.1708,(758.0/4439.0)
2,Total,2799.0,6247.0,0.3675,(3324.0/9046.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.378004,0.688939,262.0
1,max f2,0.118674,0.829988,380.0
2,max f0point5,0.54131,0.660583,171.0
3,max accuracy,0.54131,0.661618,171.0
4,max precision,0.974124,1.0,0.0
5,max recall,0.044815,1.0,396.0
6,max specificity,0.974124,1.0,0.0
7,max absolute_mcc,0.54131,0.326348,171.0
8,max min_per_class_accuracy,0.487013,0.656827,202.0
9,max mean_per_class_accuracy,0.514682,0.660619,186.0



Gains/Lift Table: Avg response rate: 49.07 %, avg score: 49.09 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.01006,0.931089,1.903483,1.903483,0.934066,0.945225,0.934066,0.945225,0.019148,0.019148,90.348287,90.348287,0.017846
1,2,0.020009,0.912014,1.924633,1.913999,0.944444,0.921723,0.939227,0.933539,0.019148,0.038297,92.463268,91.399935,0.035909
2,3,0.030069,0.888471,1.746725,1.858036,0.857143,0.900887,0.911765,0.922615,0.017572,0.055868,74.672545,85.803639,0.050659
3,4,0.040018,0.866063,1.811419,1.846446,0.888889,0.879775,0.906077,0.911964,0.018022,0.073891,81.141899,84.644643,0.06651
4,5,0.050077,0.845312,1.746725,1.826414,0.857143,0.856184,0.896247,0.900759,0.017572,0.091462,74.672545,82.641418,0.08126
5,6,0.100044,0.754576,1.609538,1.718096,0.789823,0.794954,0.843094,0.847915,0.080424,0.171886,60.953795,71.809588,0.141063
6,7,0.150011,0.704254,1.505842,1.647397,0.738938,0.729426,0.808401,0.808448,0.075242,0.247128,50.584222,64.73968,0.190692
7,8,0.200088,0.665027,1.448535,1.597627,0.710817,0.683917,0.783978,0.777281,0.072539,0.319667,44.853538,59.762651,0.234796
8,9,0.300022,0.596187,1.219552,1.471694,0.598451,0.629853,0.722181,0.728174,0.121874,0.441541,21.955186,47.16945,0.277877
9,10,0.400066,0.539023,1.213701,1.407178,0.59558,0.567512,0.690522,0.687998,0.121424,0.562965,21.370076,40.717824,0.319856




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
0,accuracy,0.6345562,0.032508243,0.6373874,0.6318083,0.63811564,0.63015187,0.63755983,0.66487646,0.5503212,0.66413045,0.65826946,0.6329412
1,auc,0.72159827,0.027750472,0.7361416,0.71856874,0.7056311,0.71317786,0.7303428,0.75151026,0.6542936,0.725113,0.7492192,0.7319846
2,aucpr,0.7177322,0.029451834,0.7189274,0.74393827,0.6997672,0.7223053,0.7446244,0.7342133,0.6549249,0.7229511,0.74790174,0.6877681
3,err,0.36544383,0.032508243,0.3626126,0.36819172,0.36188436,0.36984816,0.3624402,0.3351235,0.4496788,0.33586955,0.34173056,0.3670588
4,err_count,330.7,34.237892,322.0,338.0,338.0,341.0,303.0,312.0,420.0,309.0,312.0,312.0
5,f0point5,0.62838215,0.023562545,0.6188811,0.6423558,0.6283512,0.6341386,0.6334842,0.6429331,0.57061535,0.65682656,0.6402439,0.6159918
6,f1,0.6949046,0.012768884,0.68737864,0.7135593,0.6893382,0.7006146,0.68923074,0.69942194,0.6671949,0.6973555,0.7078652,0.6970874
7,f2,0.7784641,0.021385288,0.77292573,0.8025162,0.7634365,0.7826599,0.7557355,0.76679343,0.8031286,0.743215,0.7914573,0.8027728
8,lift_top_group,1.9077967,0.119562596,1.8793651,1.9085239,1.8434211,1.9617021,2.0096154,1.8914222,1.8556291,2.0087335,2.075,1.644555
9,logloss,0.6129891,0.019763103,0.6047422,0.61349285,0.6274024,0.62422884,0.60086447,0.59187007,0.65501857,0.6149001,0.58563656,0.6117351



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test,deviance_xval,deviance_se,...,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_r2,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
0,,2021-07-16 13:48:03,0.000 sec,1,0.21,1,1.385949,1.38734,1.386286,0.000444,...,,,,,,,,,,
1,,2021-07-16 13:48:03,0.337 sec,3,0.079,9,1.370315,1.372231,1.374259,0.00113,...,,,,,,,,,,
2,,2021-07-16 13:48:04,0.533 sec,5,0.03,34,1.321388,1.324715,1.332127,0.004095,...,,,,,,,,,,
3,,2021-07-16 13:48:04,0.728 sec,7,0.011,57,1.271872,1.287063,1.285836,0.008041,...,,,,,,,,,,
4,,2021-07-16 13:48:04,0.927 sec,9,0.0043,105,1.233255,1.273289,1.254388,0.01023,...,,,,,,,,,,
5,,2021-07-16 13:48:04,1.234 sec,12,0.0016,159,1.203979,1.263687,1.235311,0.011709,...,,,,,,,,,,
6,,2021-07-16 13:48:05,1.655 sec,16,0.00062,214,1.185155,1.257708,1.227318,0.012595,...,,,,,,,,,,
7,,2021-07-16 13:48:05,1.981 sec,19,0.00023,264,1.17476,1.259766,1.228215,0.012893,...,,,,,,,,,,
8,,2021-07-16 13:48:05,2.205 sec,21,8.9e-05,309,1.169096,1.264696,1.231877,0.012998,...,,,,,,,,,,
9,,2021-07-16 13:48:06,2.432 sec,23,3.4e-05,321,1.166208,1.270625,1.23607,0.012886,...,0.741617,1.970665,0.34822,0.468421,0.628854,0.122016,0.701793,0.693683,1.374074,0.362683



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,f13_0,1.708895,1.0,0.062259
1,f09.F,1.577223,0.922949,0.057462
2,f13,1.000692,0.585579,0.036458
3,f13_1,0.872666,0.510661,0.031793
4,f11,0.838468,0.490649,0.030547
5,f03.F,0.683011,0.39968,0.024884
6,f11_1,0.62839,0.367717,0.022894
7,f11_0,0.619616,0.362583,0.022574
8,e20.54CD2,0.581102,0.340046,0.021171
9,e11.A,0.500708,0.293001,0.018242



See the whole table with table.as_data_frame()




In [62]:
model_pred_test=model.predict(h2o_df_test).as_data_frame()

glm prediction progress: |████████████████████████████████████████████████| 100%




In [66]:
model_pred_test

Unnamed: 0,predict,p0,p1
0,1,0.436217,0.563783
1,1,0.088731,0.911269
2,1,0.496274,0.503726
3,1,0.297664,0.702336
4,1,0.371687,0.628313
...,...,...,...
296685,1,0.453223,0.546777
296686,0,0.649669,0.350331
296687,1,0.351903,0.648097
296688,0,0.663141,0.336859


In [63]:
# add test predictions and uniqie id to dataframe

df_sub_14 = pd.DataFrame({
        "unique_id": df_test["unique_id"],
        "Predicted": model_pred_test['p1']
    })

In [67]:
df_sub_14.to_csv('../POutput/df_sub_part3.csv', index=False)

In [65]:
df_sub_14

Unnamed: 0,unique_id,Predicted
0,6,0.563783
1,16,0.911269
2,17,0.503726
3,18,0.702336
4,19,0.628313
...,...,...
296685,2265630,0.546777
296686,2265631,0.350331
296687,2265632,0.648097
296688,2265637,0.336859


In [None]:
# get whole 250k train set back and set up train,val,design and test frames and index - we didn't run this section becuase of computation time

In [43]:
# for clarity later on set df_all equal to complete training set
df_all=df_train.copy()


# add folds to randomly distinguish between train and validation sets
rng = np.random.RandomState(2018)
fold = rng.randint(0, 10, df_all.shape[0]) # takes values 0-9 inclusive
df_all['fold'] = fold

# set index according to fold value
idx_train  = df_all['fold'].isin(range(9))
idx_val    = df_all['fold'].isin([9, 10])
idx_design = df_all['fold'].isin(range(10))

print("number of train examples",    np.sum(idx_train == 1))
print("number of val examples",      np.sum(idx_val == 1))
print("number of design examples",   np.sum(idx_design == 1))

number of train examples 223921
number of val examples 24984
number of design examples 248905


In [68]:
#pass df_all to h2o

h2o_df_all = h2o.H2OFrame(df_all[vars_ind + var_dep + ['fold']],
                          destination_frame = 'df_all')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [None]:
#pass index to h2o

idx_h2o_train  = h2o.H2OFrame(idx_train.astype('int').values,  
                              destination_frame = 'idx_h2o_train')
idx_h2o_val    = h2o.H2OFrame(idx_val.astype('int').values  ,  
                              destination_frame = 'idx_h2o_val')
idx_h2o_design = h2o.H2OFrame(idx_design.astype('int').values, 
                              destination_frame = 'idx_h2o_design')

In [None]:
#set manually

h2o_df_design = h2o_df_all[idx_h2o_design, :]
h2o_df_train  = h2o_df_all[idx_h2o_train, :]
h2o_df_val    = h2o_df_all[idx_h2o_val, :]

#pass test df to h2o
h2o_df_test=h2o.H2OFrame(df_test_50[vars_ind],
                          destination_frame = 'df_test')

In [201]:
model_pred_train = model.predict(h2o_df_all[idx_h2o_train, :]).as_data_frame()

glm prediction progress: |████████████████████████████████████████████████| 100%


In [169]:
#df_test.loc[:,'Predicted']=pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [None]:
# run model again for large dataset

##model=H2OGeneralizedLinearEstimator(alpha=0.4, 
                                    lambda_search=True,
                                    lambda_min_ratio=1e-8,
                                    nlambdas=20,
                                    nfolds=10,
                                    early_stopping=True,
                                    family='binomial',
                                    #interaction_pairs=interaction_pairs, - did not work
                                    # we already standardised above
                                    standardize=False,
                                    seed=2020)

model.train(x=vars_ind, 
            y='target',
            training_frame=h2o_df_all[idx_h2o_train, :],
           validation_frame=h2o_df_all[idx_h2o_val,:])

# Predict the model on train and val
model_pred_train = model.predict(h2o_df_all[idx_h2o_train, :])
model_pred_val   = model.predict(h2o_df_all[idx_h2o_val, :])

model_pred_train = model_pred_train.as_data_frame().values.ravel()
model_pred_val   = model_pred_val.as_data_frame().values.ravel()