# This notebook will help you to:
* Import training and test data
* Perform univariate analysis
* Run many ML algorithms using H2O
* Compare the performance of all models on the test dataset
* Choose the best model

## The Digit Recognizer dataset will be used for this demonstration

# 1. Parameters

In [None]:
# Identifier embedded in the id of every trained model and in the profiling report title
ModelId = "digit_recognizer_FML_v1"

# Name of the target column
var_target = "label"

# Directory for process outputs such as the MOJO model, images and performance of tested models
OutputPath = "/kaggle/temp"

# Fraction of the data to work with; for a huge dataset, consider a value below 1 for the first execution
pct_sample_size = 1

# 2. Import Libraries

In [None]:
import glob
import functools
import datetime as dt
import pandas as pd
import numpy as np
import h2o
import matplotlib.pyplot as plt
import shap
from pandas_profiling import ProfileReport
from collections import defaultdict
from pandas_profiling.model.base import get_var_type
import seaborn as sns
import os
import random

# 3. Importing Data for Modeling

In [None]:
# Load the Digit Recognizer training file that provides the features for modeling
dataprep_df_full = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

# The target variable must be an integer. The configured target name is used
# instead of a hard-coded column so this cell stays consistent with var_target.
dataprep_df_full[var_target] = dataprep_df_full[var_target].astype(int)

### A variable is needed to indicate which records are used for training and which for testing. Here a random variable is used, but if your data has a reference date you can use a date variable instead, for example to hold out the most recent records as an out-of-time validation set.

In [None]:
# Draw one uniform random number per row (fixed seed => reproducible split) and mark
# the row as 'train' when the draw is <= 0.85, otherwise as 'test'.
# All draws are generated in a single pass, in row order, rather than being written
# one cell at a time through .loc: that is very slow on a wide frame and would add
# new rows whenever the index is not exactly 0..n-1.
random.seed(59354518745)
split_draws = [random.random() for _ in range(len(dataprep_df_full))]
dataprep_df_full['dataset'] = ['train' if draw <= 0.85 else 'test' for draw in split_draws]

In [None]:
# Keep the full frame when pct_sample_size is exactly 1; otherwise draw a
# reproducible sample (without replacement) of that fraction of the rows
dataprep_df = (
    dataprep_df_full
    if pct_sample_size == 1
    else dataprep_df_full.sample(frac=pct_sample_size, replace=False, random_state=1)
)

## 3.1 Feature Engineering

In [None]:
# Cast every feature column to float32 and divide it by 255, leaving the target
# and the train/test marker untouched, then put the two parts back together
key_columns = [var_target, 'dataset']
X_train = dataprep_df.drop(labels=key_columns, axis=1).astype('float32') / 255
dataprep_df = pd.concat([dataprep_df.loc[:, key_columns], X_train], axis=1)

# 4. Univariate Analysis

## 4.1 Pandas Profiling

##### For more details on the pandas profiling library see https://github.com/pandas-profiling/pandas-profiling


In [None]:
# Build the profiling report.
# If the dataset has many records or columns, the report can take a long time.
# The samples, correlations, missing_diagrams, duplicates and interactions options are
# passed as None below (presumably to switch those sections off - confirm against the
# pandas-profiling documentation); remove an argument to fall back to the library default.
profile = ProfileReport(
    dataprep_df,
    # A space separates the fixed text from the model id
    title=f"Pandas Profiling Report {ModelId}",
    explorative=True,
    samples=None,
    correlations=None,
    missing_diagrams=None,
    duplicates=None,
    interactions=None,
)
#profile.to_file("profile.html")
#display(profile)

## 5. Classify the types of variables
#### list all columns to select the ones to be used

In [None]:
# Ask pandas_profiling for the base type of every candidate feature column
# (all columns except the train/test marker and the target)
list_columns = dataprep_df.columns.drop('dataset').drop(var_target)
d = {}
for col in list_columns:
    d[col] = get_var_type(dataprep_df[col])['type'].value

# Invert the mapping: base type -> columns of that type, in column order
fd = defaultdict(list)
for col, base_type in d.items():
    fd[base_type].append(col)
cols_by_base_type = dict(fd)

# Regroup the base types to match typical needs: boolean and unique columns
# are treated as categorical, every other type keeps its own group
cat_num_cols = defaultdict(list)
for base_type, cols in cols_by_base_type.items():
    group = 'CAT' if base_type in ('BOOL', 'UNIQUE') else base_type
    cat_num_cols[group] += cols
#print(dict(cat_num_cols))

### From the variables listed above you can select which ones will be tested in the model and confirm whether the correct type of each is numeric (NUM) or categorical (CAT). Paste the correct information below:

In [None]:
# Declare the type (categorical or numeric) of every feature that goes into the model.
# The next cell casts categorical features to string and numeric features to float.

# No feature is declared as categorical
CAT = []

# Every column grouped as NUM or CAT by the step above is used as a numeric (float) feature
NUM = [*cat_num_cols['NUM'], *cat_num_cols['CAT']]

selected_features = [*CAT, *NUM]

In [None]:
# Numeric features must be float type
for col_name in NUM:
    dataprep_df[col_name] = dataprep_df[col_name].astype(float)

# Categorical features must be string type, with null values replaced by "missing".
# The null mask is taken BEFORE the cast: astype(str) turns NaN/None into the strings
# 'nan'/'None', after which a fillna() would no longer find anything to fill.
for col_name in CAT:
    is_null = dataprep_df[col_name].isna()
    dataprep_df[col_name] = dataprep_df[col_name].astype(str).mask(is_null, 'missing')

# 6. Modeling

## 6.1 Creating the H2O context and importing data into it

In [None]:
# Start H2O:
#   nthreads=-1    -> use all cores on your machine
#   max_mem_size   -> maximum memory (in GB) to allocate to H2O
h2o.init(nthreads=-1, max_mem_size=8)

In [None]:
# Load the rows marked as "train" into the H2O context
data_hdf = h2o.H2OFrame(dataprep_df.query('dataset == "train"'))

# In H2O the target and every categorical feature must be converted to a factor (enum)
for factor_col in [var_target] + CAT:
    data_hdf[factor_col] = data_hdf[factor_col].asfactor()

# Partition the data into 90% / 10% chunks; the seed guarantees reproducibility.
# split_frame() uses approximate (not exact) splitting for efficiency, so the two
# parts are not exactly 90% and 10% of the total rows.
train_hdf, valid_hdf = data_hdf.split_frame(
    ratios=[0.90],
    destination_frames=['train_hdf', 'valid_hdf'],
    seed=1,
)

print(f'Training: {train_hdf.nrow}')
print(f'Validation: {valid_hdf.nrow}')

In [None]:
# Load the rows marked as "test" into the H2O context
test_hdf = h2o.H2OFrame(dataprep_df.query('dataset == "test"'))

# In H2O the target and every categorical feature must be converted to a factor (enum)
for factor_col in [var_target] + CAT:
    test_hdf[factor_col] = test_hdf[factor_col].asfactor()

print(f'Test: {test_hdf.nrow}')

## 6.2 Using H2O to run many ML algorithms

## Logistic Regression (GLM)

In [None]:
vModel = 'GLM_'

# Start the clock for the execution-time report
start = dt.datetime.now()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Define the parameters; the model id is <prefix><ModelId><timestamp as YYYYMMDD_HHMMSS>
glm_timestamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
GLM = H2OGeneralizedLinearEstimator(
    family='multinomial',
    seed=1,
    #auc_type="MACRO_OVR",
    model_id=f'{vModel}{ModelId}{glm_timestamp}',
)

# Train the model
GLM.train(
    x=selected_features,
    y=var_target,
    training_frame=train_hdf,
    validation_frame=valid_hdf,
)

# Report how long the cell took, then the model summary
stop = dt.datetime.now()
execution_time = stop - start
print(f"\nExecution time: {execution_time}\n")
print(GLM)

## GBM - Gradient Boosting Machine

In [None]:
vModel = 'GBM_'

# Start the clock for the execution-time report
start = dt.datetime.now()

from h2o.estimators.gbm import H2OGradientBoostingEstimator

# The model id is <prefix><ModelId><timestamp as YYYYMMDD_HHMMSS>
gbm_timestamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
GBM = H2OGradientBoostingEstimator(
    model_id=f'{vModel}{ModelId}{gbm_timestamp}',
    ntrees=500,
    # The next four settings drive early stopping
    score_tree_interval=5,
    stopping_rounds=3,
    stopping_metric='mean_per_class_error',
    stopping_tolerance=0.0005,
    #auc_type="MACRO_OVR",
    seed=1,
)

# A validation_frame is recommended when early stopping is used
GBM.train(
    x=selected_features,
    y=var_target,
    training_frame=train_hdf,
    validation_frame=valid_hdf,
)

# Report how long the cell took, then the model summary
stop = dt.datetime.now()
execution_time = stop - start
print(f"\nExecution time: {execution_time}\n")
print(GBM)

## GBM - Gradient Boosting Machine with Cross-Validation

In [None]:
vModel = 'GBM_cv_'

# Start the clock for the execution-time report
start = dt.datetime.now()

from h2o.estimators.gbm import H2OGradientBoostingEstimator

# The model id is <prefix><ModelId><timestamp as YYYYMMDD_HHMMSS>
gbm_cv_timestamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
GBM_cv = H2OGradientBoostingEstimator(
    model_id=f'{vModel}{ModelId}{gbm_cv_timestamp}',
    nfolds=5,
    seed=1,
    #auc_type="MACRO_OVR",
)

# Train with 5-fold cross-validation; a validation frame is supplied as well
GBM_cv.train(
    x=selected_features,
    y=var_target,
    training_frame=train_hdf,
    validation_frame=valid_hdf,
)

# Report how long the cell took, then the model summary
stop = dt.datetime.now()
execution_time = stop - start
print(f"\nExecution time: {execution_time}\n")
print(GBM_cv)

## Random Forest

In [None]:
vModel = 'DRF_CV_'

# Start the clock for the execution-time report
start = dt.datetime.now()

from h2o.estimators.random_forest import H2ORandomForestEstimator

# The model id is <prefix><ModelId><timestamp as YYYYMMDD_HHMMSS>
drf_timestamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
DRF = H2ORandomForestEstimator(
    seed=1,
    nfolds=5,
    #auc_type="MACRO_OVR",
    model_id=f'{vModel}{ModelId}{drf_timestamp}',
)

# Train with 5-fold cross-validation; a validation frame is supplied as well
DRF.train(
    x=selected_features,
    y=var_target,
    training_frame=train_hdf,
    validation_frame=valid_hdf,
)

# Report how long the cell took, then the model summary
stop = dt.datetime.now()
execution_time = stop - start
print(f"\nExecution time: {execution_time}\n")
print(DRF)

## H2OAutoML

In [None]:
vModel = 'AUTOML'

# Start the clock for the execution-time report
start = dt.datetime.now()

# Maximum time, in seconds, that H2O AutoML is allowed to run
max_runtime_secs = 60 * 10

# Metric used both for early stopping and for ranking the AutoML leaderboard
sort_metric = 'mean_per_class_error'

from h2o.automl import H2OAutoML

automl_settings = dict(
    seed=1,
    include_algos=['DRF', 'GLM', 'XGBoost', 'GBM', 'DeepLearning', 'StackedEnsemble'],
    max_runtime_secs=max_runtime_secs,
    stopping_metric=sort_metric,
    sort_metric=sort_metric,
)
AUTOML = H2OAutoML(**automl_settings)

# NOTE(review): the leaderboard is scored on test_hdf, the same frame that the
# later comparison cell uses to evaluate every model
AUTOML.train(
    x=selected_features,
    y=var_target,
    training_frame=train_hdf,
    validation_frame=valid_hdf,
    leaderboard_frame=test_hdf,
)

# Show every row of the AutoML leaderboard
lb = AUTOML.leaderboard
print(lb.head(rows=lb.nrows))

# Report how long the cell took
stop = dt.datetime.now()
execution_time = stop - start
print(f"\nExecution time: {execution_time}\n")

In [None]:
# Row of the AutoML leaderboard whose model should be kept (0 = first row)
best_automl_position = 0
if len(AUTOML.leaderboard) > 0:
    # Column 0 of the leaderboard row is passed to h2o.get_model to fetch the model
    leader_model_id = AUTOML.leaderboard[best_automl_position, 0]
    best_AutoML = h2o.get_model(leader_model_id)
    print(best_AutoML)

## 6.3 Compare performance on the TEST dataset for all trained models

In [None]:
# Collect every model that has been trained in this session and should be compared.
# A model cell that was skipped leaves its variable undefined; such variables are set
# to None (as a placeholder) and left out of the list. Placeholders that are already
# None - e.g. because this cell is being re-run - are skipped too, so list_models
# never contains None.
list_models = []
for _model_name in ('GLM', 'GBM', 'GBM_cv', 'DRF', 'best_AutoML'):
    _model = globals().get(_model_name)
    if _model is None:
        globals()[_model_name] = None
    else:
        list_models.append(_model)

In [None]:
# Compare performance on the TEST dataset for all trained models
if not list_models:
    raise RuntimeError("No trained models to compare: run at least one model cell first")

plt.rcParams.update({'font.size': 12})

models_dir = '%s/models/todos/' % OutputPath
os.makedirs(models_dir, exist_ok=True)

# One record per model; the plotting frame is built once after the loop
plot_rows = []
for model in list_models:
    # Save every model in H2O format
    h2o.save_model(model=model, path=models_dir, force=True)

    # Measure the performance of the model on the test frame
    performance = model.model_performance(test_hdf)

    # Save the metrics as a single ';'-separated line, one file per model
    metric_values = [
        model.model_id,
        performance.mean_per_class_error(),
        performance.auc(),
        performance.aucpr(),
        performance.logloss(),
        performance.mse(),
        performance.rmse(),
    ]
    with open("%s/models/todos/performance_%s.csv" % (OutputPath, model.model_id), 'w') as f:
        f.write(";".join(str(value) for value in metric_values))
        f.write('\n')

    plot_rows.append({
        # Short label: the first three '_'-separated parts of the model id
        'Model_id': "_".join(model.model_id.split("_")[:3]),
        # int(x*100)/100 truncates each metric to two decimals
        'mean_per_class_error': int(performance.mean_per_class_error()*100)/100,
        'mse': int(performance.mse()*100)/100,
        'rmse': int(performance.rmse()*100)/100,
    })

df_plot = pd.DataFrame(plot_rows)

ax = df_plot.plot(kind='bar', x="Model_id", title="mean_per_class_error, mse and rmse for Model (Test dataset)", grid=True, figsize=(10,5), legend=1)
# Write the value of each bar just above it
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
plt.legend(loc=3, prop={'size': 10})

## 6.4 Choose the best model among all tested

In [None]:
# All performance_*.csv files found under <OutputPath>/models/todos/ are considered when
# choosing the best model. To disregard the files already there, set
# apagar_modelos = 'S' and they are deleted first.
apagar_modelos = 'N'
if apagar_modelos == 'S':
    # Delete with glob + os.remove rather than shelling out to `rm`: no shell is
    # needed and the path is never interpreted by one
    for old_performance_file in glob.glob('%s/models/todos/performance_*.csv' % OutputPath):
        os.remove(old_performance_file)

In [None]:
sort_metric_best_model = 'mean_per_class_error'

# Read every saved performance file (one ';'-separated line per model, no header)
# and stack them into a single table
performance_files = glob.glob('%s/models/todos/performance_*.csv' % OutputPath)
modelos_testados = pd.concat(
    [pd.read_csv(path, sep=';', header=None) for path in performance_files]
)
modelos_testados.columns = ('model_id', 'mean_per_class_error', 'AUC', 'AUCPR', 'logloss', 'mse', 'rmse')

# Sort ascending by the chosen metric and keep one row per model id
modelos_testados = (
    modelos_testados
    .sort_values(by=sort_metric_best_model, ascending=True)
    .drop_duplicates(subset=["model_id"])
)
print('MBest Models. Sorted by : ' + str(sort_metric_best_model))
modelos_testados.reset_index(0).head(30)

In [None]:
# Position, in the sorted table above, of the model to use (0 = first row);
# change it to choose a model other than the first one on the list
posicao_melhor_modelo = 0

chosen_model_id = modelos_testados.iloc[posicao_melhor_modelo, 0]
melhor_modelo = h2o.load_model('%s/models/todos/%s' % (OutputPath, chosen_model_id))
print("\n"+ "BEST MODEL: " + str(chosen_model_id) + "\n")

# Plot the variable importances when the model provides them
plt.rcParams.update({'font.size': 10})
try:
    melhor_modelo.varimp_plot(50)
except Exception:
    print("Warning: This model doesn't have variable importances")

## 6.4 Stepwise for Analysis of the importance of variables

In [None]:
# List all variables of the current model, ordered by variable importance.
# For variables defined as a factor (which possibly appear as dummies named
# "<variable>.<category>"), the category is removed and only the original name
# of the variable is kept, once.
try:
    df_features_sorted = (
        melhor_modelo.varimp(True)
        .variable.str.split('.', expand=True)
        .drop_duplicates(subset=0)[0]
        .reset_index(drop=True)
    )
except Exception:
    # Reached when the model does not provide variable importances (the case of an
    # H2O ensemble of models). No importance ranking is available then, so the
    # features are kept in the order in which they were selected for modeling.
    df_features_sorted = pd.Series(selected_features).reset_index(drop=True)

In [None]:
# Imported here so the ensemble branch below works even when the GBM cells were not run
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Number of variables added with each new model. Try 10% or 20% of the total,
# as the loop trains one model per step and can take a long time
qt_var = 10
qt_total_var = len(df_features_sorted)

dict_model_tmp = {}
dict_performance = {}

for i in range(qt_var, qt_total_var + qt_var, qt_var):
    # The i most important variables (slicing stops at the end of the list)
    features_subset = df_features_sorted[0:i].values.tolist()

    if melhor_modelo.model_id.lower().find("ensemble") == -1:
        # Not an ensemble: retrain the chosen model itself on the variable subset.
        # NOTE: this is the same object on every step, retrained in place, so all
        # entries of dict_model_tmp stored by this branch refer to that one model.
        dict_model_tmp[i] = melhor_modelo
    else:
        # An ensemble is not retrained here; a GradientBoostingEstimator is used
        # to make the assessment instead
        dict_model_tmp[i] = H2OGradientBoostingEstimator(seed=1, model_id=str('model_tmp_%s' % i))

    dict_model_tmp[i].train(x=features_subset,
                            y=var_target,
                            training_frame=train_hdf,
                            validation_frame=valid_hdf)

    # Collect validation and test metrics for this step, one row per metric
    perform_oot = dict_model_tmp[i].model_performance(test_hdf)
    dict_performance_tmp = {}
    dict_performance_tmp['MSE'] = {'qt_var': i, 'medida': 'MSE', 'Validation_Dataset': dict_model_tmp[i].mse(valid=True), 'Test_Dataset': perform_oot.mse()}
    dict_performance_tmp['RMSE'] = {'qt_var': i, 'medida': 'RMSE', 'Validation_Dataset': dict_model_tmp[i].rmse(valid=True), 'Test_Dataset': perform_oot.rmse()}
    dict_performance_tmp['logloss'] = {'qt_var': i, 'medida': 'logloss', 'Validation_Dataset': dict_model_tmp[i].logloss(valid=True), 'Test_Dataset': perform_oot.logloss()}
    dict_performance[i] = pd.DataFrame(dict_performance_tmp).transpose()

In [None]:
# Plot, for each metric, how performance changes as variables are added.

# Stack the per-step tables into one frame (pd.concat instead of the removed
# DataFrame.append, and in a single call rather than growing the frame step by step)
df_performance = pd.concat(list(dict_performance.values()), ignore_index=True)

lista_metricas_perf = df_performance['medida'].unique()

for metric_name in lista_metricas_perf:
    # Keep only the rows of the metric being analyzed, indexed by number of variables
    metrics_df_tmp = df_performance.query('medida == "%s"' % metric_name)
    metrics_df_tmp = metrics_df_tmp.set_index('qt_var')
    del metrics_df_tmp['medida']

    # Best step on the test dataset: highest value for R2, lowest for any other metric
    if metric_name == 'R2':
        best_test_value = metrics_df_tmp.Test_Dataset.max()
    else:
        best_test_value = metrics_df_tmp.Test_Dataset.min()
    max_oot = metrics_df_tmp[metrics_df_tmp['Test_Dataset'] == best_test_value].index.values

    # The number of variables with the best test logloss is kept for the next cell
    if metric_name == 'logloss':
        max_oot_filtro = max_oot[0]

    ax = metrics_df_tmp.plot(figsize=(15,5), linewidth=2, fontsize=10, marker='D', ms=5,
                             title='Best %s with %s Variables' % (metric_name.upper(), str(max_oot[0])))
    plt.xlabel('Variables Number')
    plt.ylabel('%s' % metric_name.upper())
    plt.grid(axis='y')
    plt.legend(loc=0, prop={'size': 12})
    #display(ax)

In [None]:
# The best step used the first max_oot_filtro variables (positions 0 .. max_oot_filtro-1),
# so every variable from position max_oot_filtro onwards is a candidate for removal
print('Consider removing the following variables: '+ str(df_features_sorted.iloc[int(max_oot_filtro):].values.tolist()))

## 6.5 Exporting the best model to Deploy

In [None]:
# Reload the chosen model from disk, then export it twice under models/melhores/:
# as a MOJO (together with the genmodel jar) and through h2o.save_model
chosen_model_path = '%s/models/todos/%s' % (OutputPath, modelos_testados.iloc[posicao_melhor_modelo, 0])
melhor_modelo = h2o.load_model(chosen_model_path)

best_models_dir = '%s/models/melhores/' % OutputPath
caminho_modelo_mojo = melhor_modelo.download_mojo(best_models_dir, get_genmodel_jar=True)
print(caminho_modelo_mojo)
caminho_modelo_h2o = h2o.save_model(model=melhor_modelo, path=best_models_dir, force=True)

In [None]:
# Save the variable importances of the best model next to the exported model;
# any failure along the way is reported as the model not having variable importances
try:
    features_names = melhor_modelo.varimp(True)
    features_csv_path = '%s/models/melhores/features_names_%s.csv' % (OutputPath, melhor_modelo.model_id)
    features_names.to_csv(features_csv_path, sep=';')
except Exception:
    print("Warning: This model doesn't have variable importances")

# 7. Predict Submission dataset using MOJO or H2O Model

In [None]:
# Load the submission file and apply the same scaling used for the training features
submission_df = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

submission_df = submission_df.astype('float32')
submission_df = submission_df / 255

# Numeric features must be float type
for col_name in NUM:
    submission_df[col_name] = submission_df[col_name].astype(float)

# Categorical features must be string type, with null values replaced by "missing".
# The null mask is taken BEFORE the cast: astype(str) turns NaN/None into the strings
# 'nan'/'None', after which a fillna() would no longer find anything to fill.
for col_name in CAT:
    is_null = submission_df[col_name].isna()
    submission_df[col_name] = submission_df[col_name].astype(str).mask(is_null, 'missing')

In [None]:
# Score the submission data with the exported MOJO; if that fails, fall back to
# predicting with the H2O model inside the cluster
try:
    test_tmp = h2o.mojo_predict_pandas(submission_df, caminho_modelo_mojo)
    predict_df = submission_df.merge(test_tmp, left_index=True, right_index=True)
except Exception as mojo_error:
    # `except Exception` instead of a bare except, so a kernel interrupt is not
    # swallowed, and the reason for the fallback is shown instead of hidden
    print("MOJO prediction failed (%s); predicting with the H2O model instead" % mojo_error)
    submission_hdf = h2o.H2OFrame(submission_df)
    for col_name in CAT:
        submission_hdf[col_name] = submission_hdf[col_name].asfactor()
    h2o_predict = melhor_modelo.predict(submission_hdf)
    predict_df = h2o_predict.cbind(submission_hdf).as_data_frame()

# Build the submission columns: Label is the predicted class and ImageId is the
# 1-based row position
predict_df = predict_df.rename(columns={'predict': 'Label'})
predict_df = predict_df.reset_index(drop=True)
predict_df = predict_df.reset_index(drop=False)
predict_df = predict_df.rename(columns={'index': 'ImageId'})
predict_df['ImageId'] = predict_df['ImageId'] + 1
predict_df.loc[:, ['ImageId', 'Label']]

# 8. Save final dataset with predictions

In [None]:
# Write only the two submission columns, without the DataFrame index
submission_columns = ['ImageId', 'Label']
predict_df[submission_columns].to_csv('/kaggle/working/digit_recognizer_submission.csv', index=False)

## END