In [1]:
import h2o
from h2o.automl import H2OAutoML
import random, os, sys
from datetime import datetime
import pandas as pd
import logging
import csv
import optparse
import time
import json
from distutils.util import strtobool

In [2]:
data_path=None
all_variables=None
test_path=None
target=None
nthreads=1 
min_mem_size=6 
run_time=333
classification=False
scale=False
max_models=9    
model_path=None
balance_y=False 
balance_threshold=0.2
name=None 
server_path=None  
analysis=0

# Functions

In [3]:
def alphabet(n):
  alpha='0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'    
  str=''
  r=len(alpha)-1   
  while len(str)<n:
    i=random.randint(0,r)
    str+=alpha[i]   
  return str

In [4]:
def set_meta_data(run_id,analysis,target,run_time,classification,scale,model,balance,balance_threshold,name,nthreads,min_mem_size):
  m_data={}
  m_data['run_id'] =run_id
  m_data['start_time'] = time.time()
  m_data['target']=target
  m_data['max_models']=model
  m_data['run_time']=run_time
  m_data['scale']=scale
  m_data['classification']=classification
  m_data['scale']=False
  m_data['balance']=balance
  m_data['balance_threshold']=balance_threshold
  m_data['project'] =name
  m_data['end_time'] = time.time()
  m_data['execution_time'] = 0.0
  m_data['nthreads'] = nthreads
  m_data['min_mem_size'] = min_mem_size
  m_data['analysis'] = analysis
  return m_data

In [5]:
def dict_to_json(dct,n):
  j = json.dumps(dct, indent=4)
  f = open(n, 'w')
  print(j, file=f)
  f.close()

In [6]:
def stackedensemble(mod):
    coef_norm=None
    try:
      metalearner = h2o.get_model(mod.metalearner()['name'])
      coef_norm=metalearner.coef_norm()
    except:
      pass        
    return coef_norm


In [7]:
def stackedensemble_df(df):
    bm_algo={ 'GBM': None,'GLM': None,'DRF': None,'XRT': None,'Dee': None}
    for index, row in df.iterrows():
      if len(row['model_id'])>3:
        key=row['model_id'][0:3]
        if key in bm_algo:
          if bm_algo[key] is None:
                bm_algo[key]=row['model_id']
    bm=list(bm_algo.values()) 
    bm=list(filter(None.__ne__, bm))             
    return bm

In [8]:
def se_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['auc']=modl.auc()   
    d['roc']=modl.roc()
    d['mse']=modl.mse()   
    d['null_degrees_of_freedom']=modl.null_degrees_of_freedom()
    d['null_deviance']=modl.null_deviance()
    d['residual_degrees_of_freedom']=modl.residual_degrees_of_freedom()   
    d['residual_deviance']=modl.residual_deviance()
    d['rmse']=modl.rmse()
    return d

In [9]:
def get_model_by_algo(algo,models_dict):
    mod=None
    mod_id=None    
    for m in list(models_dict.keys()):
        if m[0:3]==algo:
            mod_id=m
            mod=h2o.get_model(m)      
    return mod,mod_id     

In [10]:
def gbm_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    return d
    
    
def dl_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    return d
    
    
def drf_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    d['roc']=modl.roc()      
    return d
    
def xrt_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    d['roc']=modl.roc()      
    return d
    
    
def glm_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['coef']=modl.coef()  
    d['coef_norm']=modl.coef_norm()      
    return d

In [11]:
def model_performance_stats(perf):
    d={}
    try:    
      d['mse']=perf.mse()
    except:
      pass      
    try:    
      d['rmse']=perf.rmse() 
    except:
      pass      
    try:    
      d['null_degrees_of_freedom']=perf.null_degrees_of_freedom()
    except:
      pass      
    try:    
      d['residual_degrees_of_freedom']=perf.residual_degrees_of_freedom()
    except:
      pass      
    try:    
      d['residual_deviance']=perf.residual_deviance() 
    except:
      pass      
    try:    
      d['null_deviance']=perf.null_deviance() 
    except:
      pass      
    try:    
      d['aic']=perf.aic() 
    except:
      pass      
    try:
      d['logloss']=perf.logloss() 
    except:
      pass    
    try:
      d['auc']=perf.auc()
    except:
      pass  
    try:
      d['gini']=perf.gini()
    except:
      pass    
    return d

In [12]:
def impute_missing_values(df, x, scal=False):
    # determine column types
    ints, reals, enums = [], [], []
    for key, val in df.types.items():
        if key in x:
            if val == 'enum':
                enums.append(key)
            elif val == 'int':
                ints.append(key)            
            else: 
                reals.append(key)    
    _ = df[reals].impute(method='mean')
    _ = df[ints].impute(method='median')
    if scal:
        df[reals] = df[reals].scale()
        df[ints] = df[ints].scale()    
    return


In [13]:
def get_independent_variables(df, targ):
    C = [name for name in df.columns if name != targ]
    # determine column types
    ints, reals, enums = [], [], []
    for key, val in df.types.items():
        if key in C:
            if val == 'enum':
                enums.append(key)
            elif val == 'int':
                ints.append(key)            
            else: 
                reals.append(key)    
    x=ints+enums+reals
    return x

In [14]:
def get_all_variables_csv(i):
    ivd={}
    try:
      iv = pd.read_csv(i,header=None)
    except:
      sys.exit(1)    
    col=iv.values.tolist()[0]
    dt=iv.values.tolist()[1]
    i=0
    for c in col:
      ivd[c.strip()]=dt[i].strip()
      i+=1        
    return ivd
    
    

def check_all_variables(df,dct,y=None):     
    targ=list(dct.keys())     
    for key, val in df.types.items():
        if key in targ:
          if dct[key] not in ['real','int','enum']:                      
            targ.remove(key)  
    for key, val in df.types.items():
        if key in targ:            
          if dct[key] != val:
            print('convert ',key,' ',dct[key],' ',val)
            if dct[key]=='enum':
                try:
                  df[key] = df[key].asfactor() 
                except:
                  targ.remove(key)                 
            if dct[key]=='int': 
                try:                
                  df[key] = df[key].asnumeric() 
                except:
                  targ.remove(key)                  
            if dct[key]=='real':
                try:                
                  df[key] = df[key].asnumeric()  
                except:
                  targ.remove(key)                  
    if y is None:
      y=df.columns[-1] 
    if y in targ:
      targ.remove(y)
    else:
      y=targ.pop()            
    return targ  

In [15]:
def predictions(mod,data,run_id):
    test = h2o.import_file(data)
    mod_perf=mod_best.model_performance(test)
              
    stats_test={}
    stats_test=model_performance_stats(mod_perf)

    n=run_id+'_test_stats.json'
    dict_to_json(stats_test,n) 

    try:    
      cf=mod_perf.confusion_matrix(metrics=["f1","f2","f0point5","accuracy","precision","recall","specificity","absolute_mcc","min_per_class_accuracy","mean_per_class_accuracy"])
      cf_df=cf[0].table.as_data_frame()
      cf_df.to_csv(run_id+'_test_confusion_matrix.csv')
    except:
      pass

    predictions = mod_best.predict(test)
    predictions_df=test.cbind(predictions).as_data_frame() 
    predictions_df.to_csv(run_id+'_predictions.csv')
    return

In [16]:
def predictions_test(mod,test,run_id):
    mod_perf=mod_best.model_performance(test)          
    stats_test={}
    stats_test=model_performance_stats(mod_perf)
    n=run_id+'_test_stats.json'
    dict_to_json(stats_test,n) 
    try:
      cf=mod_perf.confusion_matrix(metrics=["f1","f2","f0point5","accuracy","precision","recall","specificity","absolute_mcc","min_per_class_accuracy","mean_per_class_accuracy"])
      cf_df=cf[0].table.as_data_frame()
      cf_df.to_csv(run_id+'_test_confusion_matrix.csv')
    except:
      pass
    predictions = mod_best.predict(test)    
    predictions_df=test.cbind(predictions).as_data_frame() 
    predictions_df.to_csv(run_id+'_predictions.csv')
    return predictions

In [17]:
def check_X(x,df):
    for name in x:
        if name not in df.columns:
          x.remove(name)  
    return x    
    
    
def get_stacked_ensemble(lst):
    se=None
    for model in model_set:
      if 'BestOfFamily' in model:
        se=model
    if se is None:     
      for model in model_set:
        if 'AllModels'in model:
          se=model           
    return se       
    
def get_variables_types(df):
    d={}
    for key, val in df.types.items():
        d[key]=val           
    return d    
    

# End Functions

In [18]:
datapath="crime_new.csv"

In [19]:
#data_path = os.path.join(os.path.abspath(os.curdir),data_path)

In [20]:
port_no=random.randint(5555,55555)
h2o.init(strict_version_check=False,min_mem_size_GB=min_mem_size,port=port_no)

Checking whether there is an H2O instance running at http://localhost:49271..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.144-b01, mixed mode)
  Starting server from C:\Users\ptari\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\ptari\AppData\Local\Temp\tmp8iqimd1j
  JVM stdout: C:\Users\ptari\AppData\Local\Temp\tmp8iqimd1j\h2o_ptari_started_from_python.out
  JVM stderr: C:\Users\ptari\AppData\Local\Temp\tmp8iqimd1j\h2o_ptari_started_from_python.err
  Server is running at http://127.0.0.1:49271
Connecting to H2O server at http://127.0.0.1:49271... successful.


0,1
H2O cluster uptime:,04 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.8
H2O cluster version age:,23 days
H2O cluster name:,H2O_from_python_ptari_sdql7d
H2O cluster total nodes:,1
H2O cluster free memory:,5.750 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [21]:
run_id=alphabet(9)
# run_id to std out
print (run_id)
# meta data
meta_data = set_meta_data(run_id,analysis,target,run_time,classification,scale,max_models,balance_y,balance_threshold,name,nthreads,min_mem_size)
print(meta_data)

DHl8dQlmO
{'run_id': 'DHl8dQlmO', 'start_time': 1539619988.1557174, 'target': None, 'max_models': 9, 'run_time': 333, 'scale': False, 'classification': False, 'balance': False, 'balance_threshold': 0.2, 'project': None, 'end_time': 1539619988.1557174, 'execution_time': 0.0, 'nthreads': 1, 'min_mem_size': 6, 'analysis': 0}


In [22]:
df = h2o.import_file('crime_new.csv')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [23]:
df=df[['DISTRICT','REPORTING_AREA','MONTH','DAY_OF_WEEK','HOUR','Lat','Long','Day','Night','OFFENSE_CODE_GROUP']]
df.head()

DISTRICT,REPORTING_AREA,MONTH,DAY_OF_WEEK,HOUR,Lat,Long,Day,Night,OFFENSE_CODE_GROUP
0,0,10,3,20,42.3207,-71.0568,0,1,Motor Vehicle Accident Response
6,750,10,3,19,42.3443,-71.1578,1,0,Motor Vehicle Accident Response
0,0,10,3,19,42.316,-71.0904,1,0,Motor Vehicle Accident Response
5,234,10,2,20,42.33,-71.0385,0,1,Motor Vehicle Accident Response
8,255,10,3,15,42.3109,-71.0577,1,0,Motor Vehicle Accident Response
6,773,10,1,19,42.3426,-71.1558,1,0,Motor Vehicle Accident Response
3,613,10,3,15,42.3397,-71.1087,1,0,Motor Vehicle Accident Response
2,822,10,3,16,42.2547,-71.1436,1,0,Motor Vehicle Accident Response
3,904,10,3,15,42.3305,-71.0803,1,0,Motor Vehicle Accident Response
2,520,10,3,14,42.256,-71.1219,1,0,Motor Vehicle Accident Response




In [24]:
df.describe()

Rows:223601
Cols:10




Unnamed: 0,DISTRICT,REPORTING_AREA,MONTH,DAY_OF_WEEK,HOUR,Lat,Long,Day,Night,OFFENSE_CODE_GROUP
type,int,int,int,int,int,real,real,int,int,enum
mins,0.0,0.0,1.0,1.0,0.0,-1.0,-71.17867378,0.0,0.0,
mean,6.170410686893172,356.8560516276761,6.671137427829037,3.9565565449170617,13.271389662836928,39.416615842285495,-66.20819779427623,0.7398267449608902,0.2601732550391098,
maxs,12.0,962.0,12.0,7.0,23.0,42.39504158,0.0,1.0,1.0,
sigma,3.359611389797776,253.92184084869484,3.2579484608576266,1.9710244404143626,6.187729598960209,10.7067091332241,17.96108091494706,0.4387299775925634,0.4387299775925634,
zeros,1251,15392,0,0,9106,14859,14859,58175,165426,
missing,0,0,0,0,0,0,0,0,0,0
0,0.0,0.0,10.0,3.0,20.0,42.32073413,-71.05676415,0.0,1.0,Motor Vehicle Accident Response
1,6.0,750.0,10.0,3.0,19.0,42.34432328,-71.15778368,1.0,0.0,Motor Vehicle Accident Response
2,0.0,0.0,10.0,3.0,19.0,42.31596119,-71.09042564,1.0,0.0,Motor Vehicle Accident Response


In [25]:
# dependent variable
# assign target and inputs for classification or regression
if target==None:
  target=df.columns[-1]   
y = target
print(y)

OFFENSE_CODE_GROUP


In [26]:
if all_variables is not None:
  ivd=get_all_variables_csv(all_variables)
  print(ivd)    
  X=check_all_variables(df,ivd,y)
  print(X)

In [27]:
df.describe()

Rows:223601
Cols:10




Unnamed: 0,DISTRICT,REPORTING_AREA,MONTH,DAY_OF_WEEK,HOUR,Lat,Long,Day,Night,OFFENSE_CODE_GROUP
type,int,int,int,int,int,real,real,int,int,enum
mins,0.0,0.0,1.0,1.0,0.0,-1.0,-71.17867378,0.0,0.0,
mean,6.170410686893172,356.8560516276761,6.671137427829037,3.9565565449170617,13.271389662836928,39.416615842285495,-66.20819779427623,0.7398267449608902,0.2601732550391098,
maxs,12.0,962.0,12.0,7.0,23.0,42.39504158,0.0,1.0,1.0,
sigma,3.359611389797776,253.92184084869484,3.2579484608576266,1.9710244404143626,6.187729598960209,10.7067091332241,17.96108091494706,0.4387299775925634,0.4387299775925634,
zeros,1251,15392,0,0,9106,14859,14859,58175,165426,
missing,0,0,0,0,0,0,0,0,0,0
0,0.0,0.0,10.0,3.0,20.0,42.32073413,-71.05676415,0.0,1.0,Motor Vehicle Accident Response
1,6.0,750.0,10.0,3.0,19.0,42.34432328,-71.15778368,1.0,0.0,Motor Vehicle Accident Response
2,0.0,0.0,10.0,3.0,19.0,42.31596119,-71.09042564,1.0,0.0,Motor Vehicle Accident Response


In [28]:
# independent variables

X = []  
if all_variables is None:
  X=get_independent_variables(df, target)  
else: 
  ivd=get_all_variables_csv(all_variables)    
  X=check_all_variables(df, ivd)


X=check_X(X,df)


# Add independent variables

meta_data['X']=X  


# impute missing values

_=impute_missing_values(df,X, scale)

In [29]:
if analysis == 3:
  classification=False
elif analysis == 2:
  classification=True
elif analysis == 1:
  classification=True

In [30]:
print(classification)

False


In [31]:
# Force target to be factors
# Only 'int' or 'string' are allowed for asfactor(), got Target (Total orders):real 

if classification:
    df[y] = df[y].asfactor()

In [32]:
def check_y(y,df):
  ok=False
  C = [name for name in df.columns if name == y]
  for key, val in df.types.items():
    if key in C:
      if val in ['real','int','enum']:        
        ok=True         
  return ok, val

In [33]:
ok,val=check_y(y,df)
print(ok)
print(val)

True
enum


In [34]:
if val=='enum':
    print(df[y].levels())

[['Drug Violation', 'Investigate Person', 'Investigate Property', 'Larceny', 'Larceny From Motor Vehicle', 'Medical Assistance', 'Motor Vehicle Accident Response', 'Other', 'Simple Assault', 'Towed', 'Vandalism', 'Verbal Disputes']]


In [35]:
allV=get_variables_types(df)
allV

{'DAY_OF_WEEK': 'int',
 'DISTRICT': 'int',
 'Day': 'int',
 'HOUR': 'int',
 'Lat': 'real',
 'Long': 'real',
 'MONTH': 'int',
 'Night': 'int',
 'OFFENSE_CODE_GROUP': 'enum',
 'REPORTING_AREA': 'int'}

In [36]:
meta_data['variables']=allV

In [37]:
meta_data

{'X': ['DISTRICT',
  'REPORTING_AREA',
  'MONTH',
  'DAY_OF_WEEK',
  'HOUR',
  'Day',
  'Night',
  'Lat',
  'Long'],
 'analysis': 0,
 'balance': False,
 'balance_threshold': 0.2,
 'classification': False,
 'end_time': 1539619988.1557174,
 'execution_time': 0.0,
 'max_models': 9,
 'min_mem_size': 6,
 'nthreads': 1,
 'project': None,
 'run_id': 'DHl8dQlmO',
 'run_time': 333,
 'scale': False,
 'start_time': 1539619988.1557174,
 'target': None,
 'variables': {'DAY_OF_WEEK': 'int',
  'DISTRICT': 'int',
  'Day': 'int',
  'HOUR': 'int',
  'Lat': 'real',
  'Long': 'real',
  'MONTH': 'int',
  'Night': 'int',
  'OFFENSE_CODE_GROUP': 'enum',
  'REPORTING_AREA': 'int'}}

In [38]:
# split into training and test for showing how to predict
train, test = df.split_frame([0.9])

In [39]:
# Set up AutoML

aml = H2OAutoML(max_runtime_secs=run_time,project_name = name)

In [40]:
model_start_time = time.time()

In [41]:
aml.train(x=X,y=y,training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [42]:
meta_data['model_execution_time'] = time.time() - model_start_time

In [43]:
# get leaderboard
aml_leaderboard_df=aml.leaderboard.as_data_frame()

In [44]:
aml_leaderboard_df

Unnamed: 0,model_id,mean_per_class_error,logloss,rmse,mse
0,DRF_0_AutoML_20181015_121320,0.769072,2.533622,0.846672,0.716854


In [45]:
meta_data

{'X': ['DISTRICT',
  'REPORTING_AREA',
  'MONTH',
  'DAY_OF_WEEK',
  'HOUR',
  'Day',
  'Night',
  'Lat',
  'Long'],
 'analysis': 0,
 'balance': False,
 'balance_threshold': 0.2,
 'classification': False,
 'end_time': 1539619988.1557174,
 'execution_time': 0.0,
 'max_models': 9,
 'min_mem_size': 6,
 'model_execution_time': 1333.7958040237427,
 'nthreads': 1,
 'project': None,
 'run_id': 'DHl8dQlmO',
 'run_time': 333,
 'scale': False,
 'start_time': 1539619988.1557174,
 'target': None,
 'variables': {'DAY_OF_WEEK': 'int',
  'DISTRICT': 'int',
  'Day': 'int',
  'HOUR': 'int',
  'Lat': 'real',
  'Long': 'real',
  'MONTH': 'int',
  'Night': 'int',
  'OFFENSE_CODE_GROUP': 'enum',
  'REPORTING_AREA': 'int'}}

In [48]:

# STart best model as first model

model_set=aml_leaderboard_df['model_id']
mod_best=h2o.get_model(model_set[0])

In [49]:
mod_best._id

'DRF_0_AutoML_20181015_121320'

In [50]:
# Get stacked ensemble  
se=get_stacked_ensemble(model_set)

In [52]:
print(se)

None


In [53]:
if se is not None:
  mod_best=h2o.get_model(se)

In [55]:
mod_best._id

'DRF_0_AutoML_20181015_121320'

In [56]:
mod_best._get_metrics


<function h2o.model.model_base.ModelBase._get_metrics>

In [57]:
type(mod_best)

h2o.estimators.random_forest.H2ORandomForestEstimator

In [58]:
mods=mod_best.coef_norm
print(mods)

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_0_AutoML_20181015_121320


ModelMetricsMultinomial: drf
** Reported on train data. **

MSE: 0.7180776305204055
RMSE: 0.8473946132236182
LogLoss: 3.309472598064804
Mean Per-Class Error: 0.7726609595699673
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11,12,13
Drug Violation,Investigate Person,Investigate Property,Larceny,Larceny From Motor Vehicle,Medical Assistance,Motor Vehicle Accident Response,Other,Simple Assault,Towed,Vandalism,Verbal Disputes,Error,Rate
6550.0,504.0,230.0,702.0,174.0,708.0,1620.0,705.0,332.0,162.0,250.0,351.0,0.4669596,"5,738 / 12,288"
969.0,1510.0,399.0,1823.0,402.0,1781.0,2703.0,979.0,700.0,571.0,666.0,1200.0,0.8898052,"12,193 / 13,703"
496.0,587.0,1222.0,849.0,288.0,861.0,1618.0,481.0,362.0,446.0,421.0,615.0,0.8518069,"7,024 / 8,246"
918.0,903.0,341.0,8536.0,579.0,1403.0,2660.0,979.0,880.0,728.0,641.0,623.0,0.5552082,"10,655 / 19,191"
355.0,417.0,226.0,1131.0,786.0,726.0,1689.0,416.0,334.0,570.0,898.0,433.0,0.9015161,"7,195 / 7,981"
1100.0,1370.0,470.0,1994.0,538.0,3175.0,3649.0,1110.0,981.0,814.0,854.0,1391.0,0.8180099,"14,271 / 17,446"
1556.0,1574.0,655.0,2904.0,861.0,2563.0,10568.0,1500.0,1125.0,1397.0,1175.0,1535.0,0.6144895,"16,845 / 27,413"
1169.0,970.0,313.0,1852.0,367.0,1373.0,2549.0,1684.0,1028.0,564.0,683.0,936.0,0.8751483,"11,804 / 13,488"
719.0,834.0,274.0,1837.0,305.0,1260.0,2138.0,1254.0,1148.0,326.0,702.0,933.0,0.9021313,"10,582 / 11,730"


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.2529509
2,0.3794450
3,0.4802250
4,0.566647
5,0.6437269
6,0.7136464
7,0.7764366
8,0.8332412
9,0.8821519



ModelMetricsMultinomial: drf
** Reported on validation data. **

MSE: 0.7183328097884892
RMSE: 0.8475451668132438
LogLoss: 2.484416576567506
Mean Per-Class Error: 0.7683351577463483
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11,12,13
Drug Violation,Investigate Person,Investigate Property,Larceny,Larceny From Motor Vehicle,Medical Assistance,Motor Vehicle Accident Response,Other,Simple Assault,Towed,Vandalism,Verbal Disputes,Error,Rate
1686.0,113.0,51.0,198.0,33.0,172.0,417.0,181.0,68.0,43.0,43.0,71.0,0.4518856,"1,390 / 3,076"
233.0,370.0,119.0,487.0,83.0,449.0,760.0,237.0,170.0,140.0,167.0,260.0,0.8935252,"3,105 / 3,475"
127.0,141.0,343.0,223.0,58.0,215.0,438.0,110.0,71.0,107.0,75.0,166.0,0.8346191,"1,731 / 2,074"
195.0,197.0,74.0,2150.0,127.0,346.0,697.0,248.0,229.0,211.0,164.0,178.0,0.5535714,"2,666 / 4,816"
92.0,85.0,48.0,281.0,165.0,198.0,483.0,106.0,88.0,148.0,212.0,98.0,0.9176647,"1,839 / 2,004"
264.0,298.0,125.0,518.0,132.0,806.0,953.0,265.0,212.0,208.0,220.0,297.0,0.8124709,"3,492 / 4,298"
382.0,342.0,137.0,758.0,200.0,636.0,2871.0,344.0,258.0,345.0,278.0,391.0,0.5864304,"4,071 / 6,942"
310.0,246.0,63.0,454.0,75.0,320.0,662.0,377.0,258.0,152.0,135.0,232.0,0.8852010,"2,907 / 3,284"
175.0,180.0,66.0,469.0,66.0,307.0,563.0,311.0,294.0,85.0,160.0,249.0,0.8994872,"2,631 / 2,925"


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.2598541
2,0.3912828
3,0.4927274
4,0.5772687
5,0.6556543
6,0.7276857
7,0.7938592
8,0.8483916
9,0.8990022



ModelMetricsMultinomial: drf
** Reported on cross-validation data. **

MSE: 0.716854302682089
RMSE: 0.8466724884405357
LogLoss: 2.5336223253155468
Mean Per-Class Error: 0.7690721384925537
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11,12,13
Drug Violation,Investigate Person,Investigate Property,Larceny,Larceny From Motor Vehicle,Medical Assistance,Motor Vehicle Accident Response,Other,Simple Assault,Towed,Vandalism,Verbal Disputes,Error,Rate
6782.0,472.0,233.0,702.0,144.0,673.0,1589.0,675.0,307.0,149.0,220.0,342.0,0.4480794,"5,506 / 12,288"
994.0,1448.0,354.0,1792.0,355.0,1708.0,2872.0,967.0,670.0,585.0,668.0,1290.0,0.8943297,"12,255 / 13,703"
526.0,550.0,1253.0,826.0,262.0,868.0,1768.0,467.0,319.0,411.0,390.0,606.0,0.8480475,"6,993 / 8,246"
945.0,812.0,318.0,8677.0,532.0,1324.0,2831.0,989.0,824.0,775.0,566.0,598.0,0.5478610,"10,514 / 19,191"
340.0,388.0,244.0,1159.0,754.0,749.0,1813.0,418.0,305.0,574.0,819.0,418.0,0.9055256,"7,227 / 7,981"
1080.0,1287.0,428.0,2000.0,516.0,3225.0,3847.0,1127.0,946.0,805.0,811.0,1374.0,0.8151439,"14,221 / 17,446"
1517.0,1443.0,662.0,2854.0,827.0,2470.0,11089.0,1415.0,1087.0,1362.0,1111.0,1576.0,0.5954839,"16,324 / 27,413"
1149.0,894.0,311.0,1849.0,315.0,1362.0,2701.0,1716.0,981.0,558.0,623.0,1029.0,0.8727758,"11,772 / 13,488"
733.0,744.0,279.0,1807.0,264.0,1394.0,2319.0,1207.0,1082.0,350.0,646.0,905.0,0.9077579,"10,648 / 11,730"


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.2583709
2,0.389104
3,0.4904062
4,0.5774746
5,0.6561395
6,0.7277061
7,0.7913479
8,0.8483264
9,0.8983311


Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.2583709,0.0011846,0.2614600,0.2575131,0.2564565,0.2583523,0.2580725
err,0.7416291,0.0011846,0.7385399,0.7424869,0.7435435,0.7416478,0.7419274
err_count,23863.4,38.117188,23764.0,23891.0,23925.0,23864.0,23873.0
logloss,2.5336223,0.0102946,2.542618,2.5164566,2.520588,2.5562558,2.532193
max_per_class_error,0.9224130,0.0023989,0.9189785,0.9220608,0.9225011,0.9286658,0.9198591
mean_per_class_accuracy,0.2309280,0.0015050,0.2348265,0.2299240,0.2286013,0.2313036,0.2299846
mean_per_class_error,0.769072,0.0015050,0.7651734,0.7700760,0.7713987,0.7686964,0.7700155
mse,0.7168543,0.0006272,0.7152127,0.7172411,0.7178820,0.7170287,0.716907
r2,0.928929,0.0000609,0.9290890,0.9288934,0.9288298,0.9289115,0.9289213


Scoring History: 


0,1,2,3,4,5,6,7,8,9
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,validation_rmse,validation_logloss,validation_classification_error
,2018-10-15 12:32:32,19 min 11.200 sec,0.0,,,,,,
,2018-10-15 12:32:37,19 min 15.445 sec,1.0,0.8708019,18.3418657,0.8038922,0.8732529,18.5211586,0.8098193
,2018-10-15 12:32:44,19 min 22.320 sec,3.0,0.8676477,16.2544761,0.8005742,0.8560050,10.0905855,0.7880262
,2018-10-15 12:32:50,19 min 29.106 sec,5.0,0.8627564,13.7906751,0.7935999,0.8521939,6.9569940,0.7729349
,2018-10-15 12:33:01,19 min 40.160 sec,8.0,0.8576401,11.0309672,0.7859861,0.8504784,5.1037955,0.7630808
,2018-10-15 12:33:16,19 min 55.254 sec,12.0,0.8539730,8.4846678,0.7767845,0.8498918,4.0483083,0.7584641
,2018-10-15 12:33:42,20 min 20.487 sec,19.0,0.8501414,5.9323887,0.7635890,0.8486307,3.2576250,0.7476420
,2018-10-15 12:34:22,21 min 0.434 sec,30.0,0.8484223,4.3313177,0.7544146,0.8479454,2.7630356,0.7440677
,2018-10-15 12:35:16,21 min 54.455 sec,50.0,0.8473946,3.3094726,0.7470491,0.8475452,2.4844166,0.7401459


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Lat,546196.8750000,1.0,0.1983759
Long,541878.3125000,0.9920934,0.1968074
REPORTING_AREA,451312.7812500,0.8262822,0.1639145
MONTH,392802.8437500,0.7191598,0.1426640
HOUR,381185.4062500,0.6978901,0.1384446
DAY_OF_WEEK,297398.8750000,0.5444903,0.1080137
DISTRICT,115925.1171875,0.2122405,0.0421034
Night,13551.1718750,0.0248101,0.0049217
Day,13091.8681641,0.0239691,0.0047549


<bound method ModelBase.coef_norm of >


In [60]:
bm=stackedensemble_df(aml_leaderboard_df)
bm

['DRF_0_AutoML_20181015_121320']

In [63]:
mods

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_0_AutoML_20181015_121320


ModelMetricsMultinomial: drf
** Reported on train data. **

MSE: 0.7180776305204055
RMSE: 0.8473946132236182
LogLoss: 3.309472598064804
Mean Per-Class Error: 0.7726609595699673
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11,12,13
Drug Violation,Investigate Person,Investigate Property,Larceny,Larceny From Motor Vehicle,Medical Assistance,Motor Vehicle Accident Response,Other,Simple Assault,Towed,Vandalism,Verbal Disputes,Error,Rate
6550.0,504.0,230.0,702.0,174.0,708.0,1620.0,705.0,332.0,162.0,250.0,351.0,0.4669596,"5,738 / 12,288"
969.0,1510.0,399.0,1823.0,402.0,1781.0,2703.0,979.0,700.0,571.0,666.0,1200.0,0.8898052,"12,193 / 13,703"
496.0,587.0,1222.0,849.0,288.0,861.0,1618.0,481.0,362.0,446.0,421.0,615.0,0.8518069,"7,024 / 8,246"
918.0,903.0,341.0,8536.0,579.0,1403.0,2660.0,979.0,880.0,728.0,641.0,623.0,0.5552082,"10,655 / 19,191"
355.0,417.0,226.0,1131.0,786.0,726.0,1689.0,416.0,334.0,570.0,898.0,433.0,0.9015161,"7,195 / 7,981"
1100.0,1370.0,470.0,1994.0,538.0,3175.0,3649.0,1110.0,981.0,814.0,854.0,1391.0,0.8180099,"14,271 / 17,446"
1556.0,1574.0,655.0,2904.0,861.0,2563.0,10568.0,1500.0,1125.0,1397.0,1175.0,1535.0,0.6144895,"16,845 / 27,413"
1169.0,970.0,313.0,1852.0,367.0,1373.0,2549.0,1684.0,1028.0,564.0,683.0,936.0,0.8751483,"11,804 / 13,488"
719.0,834.0,274.0,1837.0,305.0,1260.0,2138.0,1254.0,1148.0,326.0,702.0,933.0,0.9021313,"10,582 / 11,730"


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.2529509
2,0.3794450
3,0.4802250
4,0.566647
5,0.6437269
6,0.7136464
7,0.7764366
8,0.8332412
9,0.8821519



ModelMetricsMultinomial: drf
** Reported on validation data. **

MSE: 0.7183328097884892
RMSE: 0.8475451668132438
LogLoss: 2.484416576567506
Mean Per-Class Error: 0.7683351577463483
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11,12,13
Drug Violation,Investigate Person,Investigate Property,Larceny,Larceny From Motor Vehicle,Medical Assistance,Motor Vehicle Accident Response,Other,Simple Assault,Towed,Vandalism,Verbal Disputes,Error,Rate
1686.0,113.0,51.0,198.0,33.0,172.0,417.0,181.0,68.0,43.0,43.0,71.0,0.4518856,"1,390 / 3,076"
233.0,370.0,119.0,487.0,83.0,449.0,760.0,237.0,170.0,140.0,167.0,260.0,0.8935252,"3,105 / 3,475"
127.0,141.0,343.0,223.0,58.0,215.0,438.0,110.0,71.0,107.0,75.0,166.0,0.8346191,"1,731 / 2,074"
195.0,197.0,74.0,2150.0,127.0,346.0,697.0,248.0,229.0,211.0,164.0,178.0,0.5535714,"2,666 / 4,816"
92.0,85.0,48.0,281.0,165.0,198.0,483.0,106.0,88.0,148.0,212.0,98.0,0.9176647,"1,839 / 2,004"
264.0,298.0,125.0,518.0,132.0,806.0,953.0,265.0,212.0,208.0,220.0,297.0,0.8124709,"3,492 / 4,298"
382.0,342.0,137.0,758.0,200.0,636.0,2871.0,344.0,258.0,345.0,278.0,391.0,0.5864304,"4,071 / 6,942"
310.0,246.0,63.0,454.0,75.0,320.0,662.0,377.0,258.0,152.0,135.0,232.0,0.8852010,"2,907 / 3,284"
175.0,180.0,66.0,469.0,66.0,307.0,563.0,311.0,294.0,85.0,160.0,249.0,0.8994872,"2,631 / 2,925"


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.2598541
2,0.3912828
3,0.4927274
4,0.5772687
5,0.6556543
6,0.7276857
7,0.7938592
8,0.8483916
9,0.8990022



ModelMetricsMultinomial: drf
** Reported on cross-validation data. **

MSE: 0.716854302682089
RMSE: 0.8466724884405357
LogLoss: 2.5336223253155468
Mean Per-Class Error: 0.7690721384925537
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11,12,13
Drug Violation,Investigate Person,Investigate Property,Larceny,Larceny From Motor Vehicle,Medical Assistance,Motor Vehicle Accident Response,Other,Simple Assault,Towed,Vandalism,Verbal Disputes,Error,Rate
6782.0,472.0,233.0,702.0,144.0,673.0,1589.0,675.0,307.0,149.0,220.0,342.0,0.4480794,"5,506 / 12,288"
994.0,1448.0,354.0,1792.0,355.0,1708.0,2872.0,967.0,670.0,585.0,668.0,1290.0,0.8943297,"12,255 / 13,703"
526.0,550.0,1253.0,826.0,262.0,868.0,1768.0,467.0,319.0,411.0,390.0,606.0,0.8480475,"6,993 / 8,246"
945.0,812.0,318.0,8677.0,532.0,1324.0,2831.0,989.0,824.0,775.0,566.0,598.0,0.5478610,"10,514 / 19,191"
340.0,388.0,244.0,1159.0,754.0,749.0,1813.0,418.0,305.0,574.0,819.0,418.0,0.9055256,"7,227 / 7,981"
1080.0,1287.0,428.0,2000.0,516.0,3225.0,3847.0,1127.0,946.0,805.0,811.0,1374.0,0.8151439,"14,221 / 17,446"
1517.0,1443.0,662.0,2854.0,827.0,2470.0,11089.0,1415.0,1087.0,1362.0,1111.0,1576.0,0.5954839,"16,324 / 27,413"
1149.0,894.0,311.0,1849.0,315.0,1362.0,2701.0,1716.0,981.0,558.0,623.0,1029.0,0.8727758,"11,772 / 13,488"
733.0,744.0,279.0,1807.0,264.0,1394.0,2319.0,1207.0,1082.0,350.0,646.0,905.0,0.9077579,"10,648 / 11,730"


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.2583709
2,0.389104
3,0.4904062
4,0.5774746
5,0.6561395
6,0.7277061
7,0.7913479
8,0.8483264
9,0.8983311


Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.2583709,0.0011846,0.2614600,0.2575131,0.2564565,0.2583523,0.2580725
err,0.7416291,0.0011846,0.7385399,0.7424869,0.7435435,0.7416478,0.7419274
err_count,23863.4,38.117188,23764.0,23891.0,23925.0,23864.0,23873.0
logloss,2.5336223,0.0102946,2.542618,2.5164566,2.520588,2.5562558,2.532193
max_per_class_error,0.9224130,0.0023989,0.9189785,0.9220608,0.9225011,0.9286658,0.9198591
mean_per_class_accuracy,0.2309280,0.0015050,0.2348265,0.2299240,0.2286013,0.2313036,0.2299846
mean_per_class_error,0.769072,0.0015050,0.7651734,0.7700760,0.7713987,0.7686964,0.7700155
mse,0.7168543,0.0006272,0.7152127,0.7172411,0.7178820,0.7170287,0.716907
r2,0.928929,0.0000609,0.9290890,0.9288934,0.9288298,0.9289115,0.9289213


Scoring History: 


0,1,2,3,4,5,6,7,8,9
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,validation_rmse,validation_logloss,validation_classification_error
,2018-10-15 12:32:32,19 min 11.200 sec,0.0,,,,,,
,2018-10-15 12:32:37,19 min 15.445 sec,1.0,0.8708019,18.3418657,0.8038922,0.8732529,18.5211586,0.8098193
,2018-10-15 12:32:44,19 min 22.320 sec,3.0,0.8676477,16.2544761,0.8005742,0.8560050,10.0905855,0.7880262
,2018-10-15 12:32:50,19 min 29.106 sec,5.0,0.8627564,13.7906751,0.7935999,0.8521939,6.9569940,0.7729349
,2018-10-15 12:33:01,19 min 40.160 sec,8.0,0.8576401,11.0309672,0.7859861,0.8504784,5.1037955,0.7630808
,2018-10-15 12:33:16,19 min 55.254 sec,12.0,0.8539730,8.4846678,0.7767845,0.8498918,4.0483083,0.7584641
,2018-10-15 12:33:42,20 min 20.487 sec,19.0,0.8501414,5.9323887,0.7635890,0.8486307,3.2576250,0.7476420
,2018-10-15 12:34:22,21 min 0.434 sec,30.0,0.8484223,4.3313177,0.7544146,0.8479454,2.7630356,0.7440677
,2018-10-15 12:35:16,21 min 54.455 sec,50.0,0.8473946,3.3094726,0.7470491,0.8475452,2.4844166,0.7401459


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Lat,546196.8750000,1.0,0.1983759
Long,541878.3125000,0.9920934,0.1968074
REPORTING_AREA,451312.7812500,0.8262822,0.1639145
MONTH,392802.8437500,0.7191598,0.1426640
HOUR,381185.4062500,0.6978901,0.1384446
DAY_OF_WEEK,297398.8750000,0.5444903,0.1080137
DISTRICT,115925.1171875,0.2122405,0.0421034
Night,13551.1718750,0.0248101,0.0049217
Day,13091.8681641,0.0239691,0.0047549


<bound method ModelBase.coef_norm of >