# This notebook will help you to do:
* Import training and test data
* Preprocessing Text
* Univariate Analysis
* Bivariate Analysis
* Run many ML algorithms using H2O
* Compare all model performance in test dataset
* Choosing the best model
* Interpret model output with Shapley Value

# 1. Parameters

In [None]:
#Model ID: identifier embedded in the model_id of the estimators trained below
ModelId='NLP_Disaster_Tweets_FML_v1'

#Name of the model target variable
var_target = 'target'

#Directory for process outputs such as the MOJO model, images and the performance of tested models
OutputPath='/kaggle/working'

#Fraction of rows to use (1 = all rows). With a huge dataset, consider a smaller sample for the first execution
pct_sample_size = 1

# 2. Import Libraries

In [None]:
import glob
import functools
import datetime as dt
import pandas as pd
import numpy as np
import h2o
import matplotlib.pyplot as plt
import shap
from pandas_profiling import ProfileReport
from collections import defaultdict
from pandas_profiling.model.base import get_var_type
import seaborn as sns
import os
import random
import re

# 3. Text Preprocessing

In [None]:
#Import the base with the features for modeling
#In this case we use the "nlp-getting-started" training file read below
dataprep_df_full = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
#The target variable must be an integer 0 or 1

dataprep_df_full[var_target] = dataprep_df_full[var_target].astype(int)

### It is necessary to create a variable that indicates which records are used for training and which for testing. Here the split is random, but you can use a date variable instead, for example, if your base has a reference date and you want to hold out the most recent records as an out-of-time test set.

In [None]:
# Seed the generator so the train/test assignment is reproducible.
random.seed(59354518745)
# One uniform draw per row, in row order: draws <= 0.85 are labelled 'train', the rest 'test'.
# The labels are built in a single pass instead of one .loc write per row, which is much
# faster and does not depend on the frame having a default 0..n-1 index.
dataprep_df_full['dataset'] = ['train' if random.random() <= 0.85 else 'test'
                               for _ in range(len(dataprep_df_full))]

In [None]:
#Use every row when pct_sample_size is 1; otherwise draw a reproducible sample without replacement
dataprep_df = (
    dataprep_df_full
    if pct_sample_size == 1
    else dataprep_df_full.sample(frac=pct_sample_size, replace=False, random_state=1)
)

## 3.1 Data Cleaning

In [None]:
def remove_pattern(input_txt,pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
        input_txt (str): Text to clean.
        pattern (str): Regular expression describing what to delete.

    Returns
        str: The text with all matches replaced by the empty string.

    The substitution is done directly with ``pattern``. Feeding the matched
    text back into ``re.sub`` as a new pattern would misread any regex
    metacharacters it contains and would also delete a short match found
    inside a longer one (e.g. '@user' inside '@username').
    """
    return re.sub(pattern, '', input_txt)

In [None]:
#Removing Twitter handles (@user)
# The character class must be [\w] (word characters); [/w] would only match a literal '/' or 'w'
dataprep_df['tidy_text'] = np.vectorize(remove_pattern)(dataprep_df['text'], r'@[\w]*')
dataprep_df.head()

In [None]:
#Removing Punctuations, Numbers, and Special Characters
# regex=True is required: without it recent pandas versions treat the pattern as literal text
dataprep_df['tidy_text'] = dataprep_df['tidy_text'].str.replace(r'[^a-zA-Z#]', ' ', regex=True)
dataprep_df.head()

In [None]:
#Removing Short Words: keep only the whitespace-separated words longer than two characters
dataprep_df['tidy_text'] = [
    ' '.join(word for word in sentence.split() if len(word) > 2)
    for sentence in dataprep_df['tidy_text']
]
dataprep_df.head()

## 3.2 Tokenizer and Lemma

In [None]:
#Tokenizing: split every cleaned tweet on whitespace into a list of words
tokenized_text = dataprep_df['tidy_text'].map(str.split)
tokenized_text.head()

In [None]:
#Stemming
# Import only the class that is used instead of a wildcard import, so no unrelated names leak into the notebook
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# Replace every token by its Porter stem, tweet by tweet
tokenized_text = tokenized_text.apply(lambda tokens: [stemmer.stem(token) for token in tokens])
tokenized_text

In [None]:
# Join every token list back into a single space-separated string.
# Series.apply works value by value, so it does not depend on the index labels being 0..n-1
# (label-based tokenized_text[i] fails when dataprep_df is a sample of the full data).
tokenized_text = tokenized_text.apply(' '.join)
dataprep_df['tidy_text'] = tokenized_text
dataprep_df.head()

## 3.3 TF-IDF features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the cleaned tweets, with English stop words removed and the
# vocabulary restricted by the document-frequency and size limits below
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.90,
    min_df=2,
    max_features=1000,
    stop_words='english',
)
tfidf = tfidf_vectorizer.fit_transform(dataprep_df['tidy_text'])
tfidf.shape

In [None]:
#Set tfidf feature names
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when it exists
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names())
# Rows of tfidf are in the same order as dataprep_df, so reuse its index: an index-based
# merge with dataprep_df then lines up correctly even when dataprep_df is a sample
df_tfidf = pd.DataFrame(tfidf.toarray(), index=dataprep_df.index, columns=tfidf_feature_names)

In [None]:
# Attach the TF-IDF columns by index; a TF-IDF column whose name already exists gets the suffix '_y'
dataprep_df = pd.merge(
    dataprep_df,
    df_tfidf,
    left_index=True,
    right_index=True,
    suffixes=('', '_y'),
)

# 4. Univariate Analysis

## 4.1 Pandas Profiling

##### For more details on the pandas profiling library see https://github.com/pandas-profiling/pandas-profiling


In [None]:
#Generate the profiling report object for dataprep_df
#If the database has many records or columns, the report can take a long time
#In that case, disable the samples, correlations, missing_diagrams, duplicates and interactions options by un-commenting the corresponding "=None" lines below
profile = ProfileReport(dataprep_df, title=f"Pandas Profiling Report{ModelId}"
                        ,explorative=True
                        #,samples=None
                        #,correlations=None
                        #,missing_diagrams=None
                        #,duplicates=None
                        #,interactions=None
                       )
#Exporting and displaying the report are left disabled:
# profile.to_file("profile.html")
# display(profile)

## 4.2 Classify the types of variables
#### list all columns to select the ones to be used

In [None]:
# Ask pandas_profiling for the base type of every candidate column
# (identifier, raw text, target and split-flag columns are excluded)
list_columns = dataprep_df.columns.drop(['dataset',var_target, 'id', 'text', 'tidy_text'])
d = {}
fd = defaultdict(list)
for col in list_columns:
    base_type = get_var_type(dataprep_df[col])['type'].value
    d[col] = base_type
    fd[base_type].append(col)

cols_by_base_type = dict(fd)
# Regroup the base types to match typical needs:
# boolean and unique columns are treated as categorical, other types keep their name
cat_num_cols = defaultdict(list)
for base_type, cols in cols_by_base_type.items():
    group = 'CAT' if base_type in ('BOOL', 'UNIQUE') else base_type
    cat_num_cols[group].extend(cols)
dict(cat_num_cols)

### From the variables listed above, select which ones will be tested in the model and confirm whether the correct type of each one is numeric (NUM) or categorical (CAT). Paste the correct information below:

In [None]:
#It is necessary to define the types of the variables (categorical and numeric) to ensure that the data type used in the modeling is the most suitable.
#For example, categorical variables need to be defined as strings because this prevents them from being treated as numeric variables in the H2O modeling
#Another example: string variables get a missing-value treatment in which null values are placed in the category 'missing'
CAT = ['keyword', 'location']
#Numeric (float) features: the columns classified as NUM in the previous step
NUM = cat_num_cols['NUM']
#Features offered to the models: categorical first, then numeric
selected_features = CAT + NUM

In [None]:
#Numeric features must be float type
for col_name in NUM:
    dataprep_df[col_name] = dataprep_df[col_name].astype(float)

#Categorical features must be string type and null values are filled with "missing".
#The nulls have to be filled BEFORE the cast: astype(str) turns NaN into the text 'nan',
#after which fillna finds nothing left to fill.
for col_name in CAT:
    dataprep_df[col_name] = dataprep_df[col_name].fillna('missing').astype(str)

# 5. Bivariate Analysis

## 5.1 Calculation of the Information Value for all variables defined as selected features

In [None]:
def calculate_IV(dataframe, coluna_feature, coluna_target, cat_goods = None, buckets=20):
    '''
    Calculate the Information Value (IV) of one feature against a 0/1 target.

    Rows are grouped by category (categorical feature) or by quantile bucket
    (numeric feature). For every group:
        perc_goods = goods in group / total goods      (goods: target == 1)
        perc_bads  = bads in group  / total bads       (bads:  target == 0)
        iv_cat     = (perc_goods - perc_bads) * ln(perc_goods / perc_bads)
    The IV is the sum of iv_cat over the groups whose iv_cat is finite; groups
    without goods or without bads produce an infinite/undefined term and are
    left out of the sum (they still appear in the statistics table).

    Parameters
        dataframe: DataFrame with the input and target variables.
        coluna_feature (str): Name of the column with the independent variable.
        coluna_target (str): Name of the column with the 0/1 target.
        cat_goods: Currently unused; kept for interface compatibility.
        buckets (int): Number of quantile partitions for numeric variables
            (duplicated bucket edges are dropped, so fewer may be produced).

    Returns
        list with:
            [0] IV (float)
            [1] pandas DataFrame with the statistics table: the group column
                (plus <feature>_min / <feature>_max for numeric features),
                qty, good_rate, odds and iv_cat.
    '''
    # Same dtypes that are treated as numeric; every other dtype is grouped by value
    numeric_dtypes = ('float64', 'int64', 'int32', 'float32')

    # Work on a private copy so that the bucket column is never written into a view of the caller's frame
    df = dataframe.loc[:, [coluna_feature, coluna_target]].copy()

    if str(df[coluna_feature].dtype) in numeric_dtypes:
        # Numeric feature: discretize into quantile buckets and keep each bucket's value range
        group_col = coluna_feature + "_bucket"
        df[group_col] = pd.qcut(df[coluna_feature], buckets, labels=False, duplicates='drop')
        analyse_df = df.groupby(group_col).agg(**{
            'qty': (coluna_target, 'count'),
            'qty_goods': (coluna_target, 'sum'),
            coluna_feature + '_min': (coluna_feature, 'min'),
            coluna_feature + '_max': (coluna_feature, 'max'),
        })
        leading_cols = [group_col, coluna_feature + '_min', coluna_feature + '_max']
    else:
        # Categorical feature: one group per distinct value
        group_col = coluna_feature
        analyse_df = df.groupby(group_col).agg(
            qty=(coluna_target, 'count'),
            qty_goods=(coluna_target, 'sum'),
        )
        leading_cols = [group_col]

    # IV calculation
    qty_bads = analyse_df['qty'] - analyse_df['qty_goods']
    perc_goods = analyse_df['qty_goods'] / analyse_df['qty_goods'].sum()
    perc_bads = qty_bads / qty_bads.sum()
    analyse_df['good_rate'] = analyse_df['qty_goods'] / analyse_df['qty']
    analyse_df['odds'] = perc_goods / perc_bads
    # log(0) and 0 * inf are expected for groups without goods or bads; silence the numpy warnings
    with np.errstate(divide='ignore', invalid='ignore'):
        ln_odds = np.log(analyse_df['odds'])
        analyse_df['iv_cat'] = (perc_goods - perc_bads) * ln_odds

    tabela_pdf = analyse_df.reset_index()[leading_cols + ['qty', 'good_rate', 'odds', 'iv_cat']]

    # Only finite terms contribute to the IV (drops +inf, -inf and NaN)
    finite_terms = np.isfinite(tabela_pdf['iv_cat'].astype(float))
    df_iv = tabela_pdf.loc[finite_terms, 'iv_cat'].sum()
    return [df_iv, tabela_pdf]

def colunas_dataframe(dataframe):
    """Return the columns of ``dataframe`` that are listed in the global ``selected_features``.

    The result is a list that follows the column order of ``dataframe``.
    """
    return [column for column in dataframe.columns if column in selected_features]

def table_iv(dataframe):
    """Compute the Information Value of every selected feature of ``dataframe``.

    The features are those returned by ``colunas_dataframe``; the target is
    the global ``var_target`` and numeric features are split into 10 buckets.
    A progress line "<percent>:<column>" is printed per feature.

    Returns
        dict: column name -> result of ``calculate_IV`` for that column.
    """
    lista_colunas = colunas_dataframe(dataframe)
    total = len(lista_colunas)
    dict_resultados = {}
    # enumerate gives the position directly (no list.index search per column)
    # and the progress reaches 100% on the last column
    for position, col in enumerate(lista_colunas, start=1):
        print("{0:.0%}".format(position / total) + ":" + col)
        dict_resultados[col] = calculate_IV(dataframe=dataframe, coluna_feature=col, coluna_target=var_target, buckets=10)
    return dict_resultados

In [None]:
start = dt.datetime.now()

result_data = table_iv(dataprep_df)
# One row per variable: column 0 holds the IV and column 1 the statistics table (dropped here)
result_formated = pd.DataFrame.from_dict(data=result_data, orient='index')
result_formated = result_formated.reset_index()
result_formated = result_formated.rename(columns={'index': 'Variable', 0: 'IV'})
result_formated = result_formated.drop(columns=1)
# Highest IV first, for the chart
result_formated_graph = result_formated.sort_values(by=['IV'], ascending=False)

#Execution time
stop = dt.datetime.now()
execution_time = stop-start
print(f"\nExecution time: {execution_time}\n")

In [None]:
fig = plt.figure(figsize=(10,10))
# Horizontal bars with the 40 variables of highest IV
iv_axes = sns.barplot(y="Variable", x="IV", data=result_formated_graph.head(40), palette="Blues_r")
iv_axes.set_title("Information Value (IV)")
# Dashed red reference line at IV = 0.02
plt.axvline(x=0.02, linestyle="--", color='r')

## 5.2 Charts with the good rate (% of target = 1) by categories or by value range for numeric variables (ranges created by decile)

In [None]:
# for i in selected_features:
#     df_plot_tmp = result_data[i][1]
#     df_plot_tmp['Distribution'] = df_plot_tmp.qty / df_plot_tmp.qty.sum()
#     if i in CAT:
# #         print('--------------------------------------- ' + str(i))
#         df_plot_tmp = df_plot_tmp.sort_values(by=i)
#         df_plot_t1 = df_plot_tmp.loc[:, (i,'Distribution')]
#         df_plot_t1 = df_plot_t1.set_index(i)
#         df_plot_t2 = df_plot_tmp.loc[:, (i, 'good_rate')]
#         df_plot_t2 = df_plot_t2.set_index(i)
#         df_plot_t1.Distribution.plot(ylim=[0,1], kind='bar', rot=90, figsize=(15,5), linewidth=2, fontsize=12, grid=True, legend=1, title=i)
#         ax = df_plot_t2.good_rate.plot(secondary_y=True, kind="line", rot=90, figsize=(15,5), linewidth=2, fontsize=12\
#                                        , marker="D", ms=8, grid=True, color='r', legend=1)
#         for p in range(len(df_plot_t2)):
#             ax.annotate(str('{0:.1%}'.format(int(df_plot_t2.reset_index().iloc[p,1]*1000)/1000))\
#                             ,(df_plot_t2.reset_index().index.values[p]\
#                               ,df_plot_t2.reset_index().iloc[p,1]*1))
#         plt.show()
#         #display(ax)
#     else:
#         sort_var = str(i)+"_max"
# #         print('--------------------------------------- ' + str(i))
#         df_plot_tmp[sort_var] = df_plot_tmp[sort_var].astype(float)
#         df_plot_tmp = df_plot_tmp.sort_values(by=sort_var)
#         df_plot_tmp[i] = df_plot_tmp[sort_var].fillna(999999999.99).astype(float)
#         df_plot_tmp[i] = (df_plot_tmp[i]*100).astype(int)/100
#         df_plot_tmp[i] = df_plot_tmp[i].astype(str).replace("999999999.99", "missing")
#         df_plot_t1 = df_plot_tmp.loc[:, (i, 'Distribution')]
#         df_plot_t1 = df_plot_t1.set_index(i)
#         df_plot_t2 = df_plot_tmp.loc[:, (i, 'good_rate')]
#         df_plot_t2 = df_plot_t2.set_index(i)
#         df_plot_t1.Distribution.plot(ylim=[0,1], kind='bar', rot=90, figsize=(15,5), linewidth=2, fontsize=12, grid=True, legend=1, title=(i + ": ranges by decile"))
#         ax = df_plot_t2.good_rate.plot(secondary_y=True, kind="line", rot=90, figsize=(15,5), linewidth=2, fontsize=12, marker="D", ms=8, grid=True, color='r', legend=1)
#         for p in range(len(df_plot_t2)):
#             ax.annotate(str('{0:.1%}'.format(int(df_plot_t2.reset_index().iloc[p,1]*1000)/1000)), (df_plot_t2.reset_index().index.values[p], df_plot_t2.reset_index().iloc[p,1]*1))        
#         plt.show()
#         #display(ax)

# 6. Modeling

## 6.1 Creating the H2O context and importing data into it

In [None]:
# Start (or connect to) the H2O context used by all models below
# nthreads = -1 means use all cores on your machine
# max_mem_size is the maximum memory (in GB) to allocate to H2O
h2o.init(nthreads = -1, max_mem_size = 8)

In [None]:
#Import the TRAINING base into the H2O context
data_hdf = h2o.H2OFrame(dataprep_df[dataprep_df['dataset'] == 'train'])

# In H2O the target and the categorical features have to be converted to factor (enum)
for col_name in [var_target] + CAT:
    data_hdf[col_name] = data_hdf[col_name].asfactor()

# Partition the data into 90% / 10% chunks; the seed guarantees reproducibility
train_hdf, valid_hdf = data_hdf.split_frame(
    ratios=[0.90],
    destination_frames=['train_hdf', 'valid_hdf'],
    seed=1,
)

#Notice that `split_frame()` uses approximate splitting not exact splitting (for efficiency), so these are not exactly 90%, 10% of the total rows.
print(f'Training: {train_hdf.nrow}')
print(f'Validation: {valid_hdf.nrow}')

In [None]:
#Import the TEST base into the H2O context
test_hdf = h2o.H2OFrame(dataprep_df.query('dataset == "test"'))

# Conversion of the target variable and the categorical features to factor (enum)
# In H2O it is necessary that categorical variables are transformed into factors
test_hdf[var_target] = test_hdf[var_target].asfactor()
for col_name in CAT:
    test_hdf[col_name] = test_hdf[col_name].asfactor()

# Row count of the test frame (labelled as such, not as "Training")
print('Test: ' + str(test_hdf.nrow))

## 6.2 Using H2O to run many ML algorithms

## Logistic Regression (GLM)

In [None]:
vModel = 'GLM_'

start = dt.datetime.now()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

#Define parameters: binomial GLM whose id is <prefix><ModelId><YYYYMMDD_HHMMSS>
GLM = H2OGeneralizedLinearEstimator(
    family='binomial',
    seed=1,
    model_id=vModel + ModelId + dt.datetime.now().strftime('%Y%m%d_%H%M%S'),
)

#Run the model
GLM.train(x=selected_features, y=var_target, training_frame=train_hdf, validation_frame=valid_hdf)

#Execution time of the model
stop = dt.datetime.now()
execution_time = stop-start
print(f"\nExecution time: {execution_time}\n")
print(GLM)

## GBM - Gradient Boosting Machine

In [None]:
vModel='GBM_'

#Execution time of the model
start = dt.datetime.now()

from h2o.estimators.gbm import H2OGradientBoostingEstimator
# The score/stopping arguments below configure early stopping on AUCPR;
# the model id is <prefix><ModelId><YYYYMMDD_HHMMSS>
GBM = H2OGradientBoostingEstimator(
    model_id=vModel + ModelId + dt.datetime.now().strftime('%Y%m%d_%H%M%S'),
    ntrees=500,
    score_tree_interval=5,
    stopping_rounds=3,
    stopping_metric='AUCPR',
    stopping_tolerance=0.0005,
    seed=1,
)

# The use of a validation_frame is recommended when using early stopping
GBM.train(x=selected_features, y=var_target, training_frame=train_hdf, validation_frame=valid_hdf)

#Execution time of the model
stop = dt.datetime.now()
execution_time = stop-start
print(f"\nExecution time: {execution_time}\n")
print(GBM)

## GBM - Gradient Boosting Machine with Cross-Validation

In [None]:
vModel='GBM_cv_'

#Execution time of the model
start = dt.datetime.now()

from h2o.estimators.gbm import H2OGradientBoostingEstimator
# GBM with 5-fold cross-validation; the model id is <prefix><ModelId><YYYYMMDD_HHMMSS>
GBM_cv = H2OGradientBoostingEstimator(
    model_id=vModel + ModelId + dt.datetime.now().strftime('%Y%m%d_%H%M%S'),
    nfolds=5,
    seed=1,
)

# The validation frame is passed in addition to the cross-validation folds
GBM_cv.train(x=selected_features, y=var_target, training_frame=train_hdf, validation_frame=valid_hdf)

#Execution time of the model
stop = dt.datetime.now()
execution_time = stop-start
print(f"\nExecution time: {execution_time}\n")
print(GBM_cv)

## Random Forest

In [None]:
vModel='DRF_CV_'

#Execution time of the model
start = dt.datetime.now()

from h2o.estimators.random_forest import H2ORandomForestEstimator

# Random forest with 5-fold cross-validation; the model id is <prefix><ModelId><YYYYMMDD_HHMMSS>
DRF = H2ORandomForestEstimator(
    seed=1,
    nfolds=5,
    model_id=vModel + ModelId + dt.datetime.now().strftime('%Y%m%d_%H%M%S'),
)

# The validation frame is passed in addition to the cross-validation folds
DRF.train(x=selected_features, y=var_target, training_frame=train_hdf, validation_frame=valid_hdf)

#Execution time of the model
stop = dt.datetime.now()
execution_time = stop-start
print(f"\nExecution time: {execution_time}\n")
print(DRF)

## H2OAutoML

In [None]:
vModel='AUTOML'

#Execution time of the model
start = dt.datetime.now()

#Maximum time in seconds for the H2O AutoML run (10 minutes)
max_runtime_secs = 10 * 60

#Metric used by AutoML both for early stopping and for sorting the leaderboard
sort_metric = 'AUCPR'

from h2o.automl import H2OAutoML
AUTOML = H2OAutoML(
    seed=1,
    include_algos=['DRF', 'GLM', 'XGBoost', 'GBM', 'DeepLearning', 'StackedEnsemble'],
    max_runtime_secs=max_runtime_secs,
    stopping_metric=sort_metric,
    sort_metric=sort_metric,
)
# The leaderboard is scored on the test frame
AUTOML.train(
    x=selected_features,
    y=var_target,
    training_frame=train_hdf,
    validation_frame=valid_hdf,
    leaderboard_frame=test_hdf,
)

#View the whole AutoML Leaderboard
lb = AUTOML.leaderboard
print(lb.head(rows=lb.nrows))

#Execution time of the model
stop = dt.datetime.now()
execution_time = stop-start
print(f"\nExecution time: {execution_time}\n")

In [None]:
#Choose the desired AutoML model by its row position in the leaderboard (0 = first row)
best_automl_position=0
#Only fetch a model when the leaderboard has at least one row
if len(AUTOML.leaderboard) > 0:
    #The first leaderboard column is passed to h2o.get_model as the model id
    best_AutoML = h2o.get_model(AUTOML.leaderboard[best_automl_position, 0])
    print(best_AutoML)

## 6.3 Compare performance on the TEST dataset for all trained models

In [None]:
#Create empty model list
list_models = []

#Collect every model that has been trained and should be compared.
#A model whose cell was not run is defined as None so that the name exists afterwards;
#None is never added to the list, so running this cell more than once is safe.
for _model_name in ('GLM', 'GBM', 'GBM_cv', 'DRF', 'best_AutoML'):
    _model = globals().setdefault(_model_name, None)
    if _model is not None:
        list_models.append(_model)

In [None]:
#Compare performance on the TEST dataset for all trained models
plt.rcParams.update({'font.size': 12})
fig = plt.figure(figsize=(10, 10))
#Make sure the output directory exists before the metric files are written into it
os.makedirs('%s/models/todos' % OutputPath, exist_ok=True)
plot_rows = []
for i in list_models:
    #Save all models in H2O format
    h2o.save_model(model=i, path='%s/models/todos/' % OutputPath, force=True)
    #Calculate the KS on the test dataset from buckets of the predicted probability p1
    h2o_predict = i.predict(test_hdf)
    data = h2o_predict.cbind(test_hdf[var_target]).as_data_frame()
    data['target0'] = 1 - data[var_target]
    data['bucket'] = pd.qcut(data['p1'], 10, duplicates='drop', labels=False)
    grouped = data.groupby('bucket', as_index=False)
    kstable = pd.DataFrame()
    kstable['min_prob'] = grouped.min()['p1']
    kstable['max_prob'] = grouped.max()['p1']
    kstable['events'] = grouped.sum()[var_target]
    kstable['nonevents'] = grouped.sum()['target0']
    #Highest probabilities first, so the cumulative rates run from the riskiest bucket down
    kstable = kstable.sort_values(by="min_prob", ascending=False).reset_index(drop=True)
    kstable['event_rate'] = (kstable.events / data[var_target].sum()).apply('{0:.2%}'.format)
    kstable['nonevent_rate'] = (kstable.nonevents / data['target0'].sum()).apply('{0:.2%}'.format)
    kstable['cum_eventrate'] = (kstable.events / data[var_target].sum()).cumsum()
    kstable['cum_noneventrate'] = (kstable.nonevents / data['target0'].sum()).cumsum()
    kstable['KS'] = np.round(kstable['cum_eventrate'] - kstable['cum_noneventrate'], 3)
    ks = kstable['KS'].max()

    #Ascertain the performance of the model on the test base
    performance = i.model_performance(test_hdf)

    #Save metrics as one ';'-separated line: model_id;accuracy;AUC;AUCPR;logloss;KS;F1
    #accuracy() and F1() return [[threshold, value]], so the metric itself is element [0][1]
    with open("%s/models/todos/performance_%s.csv" % (OutputPath, i.model_id), 'w') as f:
        f.write(";".join(str(value) for value in (
            i.model_id,
            performance.accuracy()[0][1],
            performance.auc(),
            performance.aucpr(),
            performance.logloss(),
            ks,
            performance.F1()[0][1],
        )))
        f.write('\n')

    #Short label: the first three '_'-separated parts of the model id
    model_label = "_".join(i.model_id.split("_")[:3])

    #Graph with the ROC curve of all models
    fpr = performance.fprs
    tpr = performance.tprs
    plt.plot(fpr, tpr, lw=2, label=model_label)
    plt.title("ROC Curve for Model (Test dataset)")

    #Metrics for the bar chart, truncated to two decimals
    plot_rows.append({'Model_id': model_label,
                      'AUROC': int(performance.auc()*100)/100,
                      'AUCPR': int(performance.aucpr()*100)/100,
                      'KS': int(ks*100)/100})
#One row per model (built in one go; DataFrame.append no longer exists in recent pandas)
df_plot = pd.DataFrame(plot_rows)
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
plt.xlim([0.0, 1.05])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()
plt.close()
ax = df_plot.plot(kind='bar', x="Model_id", title="AUROC, AUCPR e KS for Model (Test dataset)", grid=True, figsize=(10,5), legend=1)
#Write the value on top of every bar
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
plt.legend(loc=3, prop={'size': 10})

## 6.4 Choose the best model among all tested

In [None]:
#All models in the history ./models/todos/performance_*.csv are considered. To disregard old versions, set apagar_modelos = 'S':
apagar_modelos = 'N'
if apagar_modelos == 'S':
    #Delete the files from Python instead of building a shell command from OutputPath
    for performance_file in glob.glob('%s/models/todos/performance_*.csv' % OutputPath):
        os.remove(performance_file)

In [None]:
sort_metric_best_model='AUCPR'
#Import the metrics of all tested models and show the best 30, ordered by sort_metric_best_model
modelos_testados = pd.concat(
    [pd.read_csv(csv_path, sep=';', header=None)
     for csv_path in glob.glob('%s/models/todos/performance_*.csv' % OutputPath)]
)
modelos_testados.columns = ['model_id', 'accuracy', 'AUC', 'AUCPR', 'logloss', 'KS', 'F1']
#Best first; for a repeated model_id only its best-ranked row is kept
modelos_testados = (
    modelos_testados
    .sort_values(by=sort_metric_best_model, ascending=False)
    .drop_duplicates(subset=["model_id"])
)
print('MBest Models. Sorted by : ' + str(sort_metric_best_model))
modelos_testados.reset_index(0).head(30)

In [None]:
#Position of the chosen model in the ranking above (0 = first on the list)
posicao_melhor_modelo=0

#Load the saved model whose id is in the first column of the chosen row
melhor_modelo = h2o.load_model(f'{OutputPath}/models/todos/{modelos_testados.iloc[posicao_melhor_modelo, 0]}')
print(f"\nBEST MODEL: {modelos_testados.iloc[posicao_melhor_modelo, 0]}\n")

plt.rcParams.update({'font.size': 10})
#Plot up to 50 variable importances; any failure is reported with the warning below
try:
    melhor_modelo.varimp_plot(50)
except Exception:
    print("Warning: This model doesn't have variable importances")

## 6.4 Stepwise for Analysis of the importance of variables

In [None]:
#List all variables in the current model, ordered by variable importance
#For variables defined as a factor (which possibly appear as dummies), remove the category from the name and keep only the original name of the variable:
#the name is cut at the first '.', and only the first occurrence of each resulting name is kept
try:
    df_features_sorted = melhor_modelo.varimp(True).variable.str.split('.', expand=True).drop_duplicates(subset = 0)[0].reset_index(drop=True)
except Exception as e:
    #As the model with ensemble in H2O does not show the importance of variables, the variables are ordered by IV instead (highest first), using result_formated_graph of step 5.1
    df_features_sorted = result_formated_graph.Variable.reset_index(drop=True)

In [None]:
#Define the number of variables to be added with each new model. Try 10% or 20% of the total, as training can take a long time
qt_var=100
qt_total_var = len(df_features_sorted)

dict_model_tmp={}
dict_performance={}

for i in range(qt_var, qt_total_var+qt_var, qt_var):
    #The i most important variables (the slice simply stops at the last variable when i exceeds the total)
    features_i = df_features_sorted[0:i].values.tolist()

    #If the chosen model is not an ensemble, the same model is re-trained with the growing set of variables.
    #NOTE: this re-trains the melhor_modelo object itself, so after this cell it no longer holds the originally loaded model
    melhor_modelo_tmp = melhor_modelo
    if "ensemble" not in melhor_modelo_tmp.model_id.lower():
        dict_model_tmp[i] = melhor_modelo_tmp
    #An ensemble cannot be used for this, so a GradientBoostingEstimator makes the assessment instead
    else:
        dict_model_tmp[i] = H2OGradientBoostingEstimator(seed=1, model_id=str('model_tmp_%s' % i))
    dict_model_tmp[i].train(x = features_i,
                            y = var_target,
                            training_frame=train_hdf,
                            validation_frame=valid_hdf)

    #Validation and test metrics of the model trained with i variables.
    #accuracy() and F1() return [[threshold, value]], so the metric itself is element [0][1]
    perform_oot = dict_model_tmp[i].model_performance(test_hdf)
    dict_performance_tmp = {}
    dict_performance_tmp['AUC'] = {'qt_var': i, 'medida': 'AUC', 'Validation_Dataset': dict_model_tmp[i].auc(valid=True), 'Test_Dataset': perform_oot.auc()}
    dict_performance_tmp['accuracy'] = {'qt_var': i, 'medida': 'accuracy', 'Validation_Dataset': dict_model_tmp[i].accuracy(valid=True)[0][1], 'Test_Dataset': perform_oot.accuracy()[0][1]}
    dict_performance_tmp['AUCPR'] = {'qt_var': i, 'medida': 'AUCPR', 'Validation_Dataset': dict_model_tmp[i].aucpr(valid=True), 'Test_Dataset': perform_oot.aucpr()}
    dict_performance_tmp['F1'] = {'qt_var': i, 'medida': 'F1', 'Validation_Dataset': dict_model_tmp[i].F1(valid=True)[0][1], 'Test_Dataset': perform_oot.F1()[0][1]}
    dict_performance_tmp['logloss'] = {'qt_var': i, 'medida': 'logloss', 'Validation_Dataset': dict_model_tmp[i].logloss(valid=True), 'Test_Dataset': perform_oot.logloss()}
    #One row per metric
    dict_performance[i] = pd.DataFrame(dict_performance_tmp).transpose()

In [None]:
##Plot graphs comparing the change in performance with the increase in variables
#Stack the per-step tables into one frame (DataFrame.append no longer exists in recent pandas)
df_performance = pd.concat(list(dict_performance.values()), ignore_index=True)

lista_metricas_perf = df_performance['medida'].unique()

for metric_name in lista_metricas_perf:
    #Select only the metric to be analyzed, indexed by the number of variables
    metrics_df_tmp = df_performance.query('medida == "%s"' % metric_name)
    metrics_df_tmp = metrics_df_tmp.set_index('qt_var')
    del metrics_df_tmp['medida']
    #The remaining columns are the two metric series; make them numeric so they compare and plot reliably
    metrics_df_tmp = metrics_df_tmp.astype(float)
    #Best step on the test dataset: lowest value for logloss, highest for every other metric
    if metric_name == 'logloss':
        max_oot = metrics_df_tmp[metrics_df_tmp['Test_Dataset'] == metrics_df_tmp.Test_Dataset.min()].index.values
    else:
        max_oot = metrics_df_tmp[metrics_df_tmp['Test_Dataset'] == metrics_df_tmp.Test_Dataset.max()].index.values

    #Remember the best number of variables for the metric used to choose the best model
    if metric_name == sort_metric_best_model:
        max_oot_filtro = max_oot[0]

    ax=metrics_df_tmp.plot(figsize=(15,5), linewidth=2, fontsize=10, marker='D', ms=5,
                            title='Best %s with %s Variables' % (metric_name.upper(), str(max_oot[0])))
    plt.xlabel('Variables Number')
    plt.ylabel('%s' % metric_name.upper())
    plt.grid(axis='y')
    plt.legend(loc=0, prop={'size': 12})
    #display(ax)

In [None]:
#The best model used the variables at positions 0 .. max_oot_filtro-1, so every variable from position max_oot_filtro on is a removal candidate (>=, not >)
print('Consider removing the following variables: '+ str(df_features_sorted[df_features_sorted.index >= int(max_oot_filtro)].values.tolist()))

## 6.5 Exporting the best model to Deploy

In [None]:
#Reload the chosen model from disk, then export it in MOJO format (with the genmodel jar) and in H2O format
melhor_modelo = h2o.load_model(f'{OutputPath}/models/todos/{modelos_testados.iloc[posicao_melhor_modelo, 0]}')
caminho_modelo_mojo = melhor_modelo.download_mojo(f'{OutputPath}/models/melhores/', get_genmodel_jar=True)
print(caminho_modelo_mojo)
caminho_modelo_h2o = h2o.save_model(model=melhor_modelo, path=f'{OutputPath}/models/melhores/', force=True)

In [None]:
#Save the variable-importance table of the best model as a ';'-separated file; any failure is reported with the warning below
try:
    features_names = melhor_modelo.varimp(True)
    features_names.to_csv(f'{OutputPath}/models/melhores/features_names_{melhor_modelo.model_id}.csv', sep=';')
except Exception:
    print("Warning: This model doesn't have variable importances")

# 7. Calculate Shapley Values using SHAP KernelExplainer for H20 models

#### The SHAP library calculates the Shapley value of each variable used in the model and shows the individual impact of each variable on the predicted value for each record. To better understand how the SHAP library works, see https://github.com/slundberg/shap

In [None]:
class H2oProbWrapper:
    """Adapter that lets array-based callers (such as SHAP) score records with an H2O model.

    Attributes set by the constructor:
        h2o_model: the H2O model used for prediction.
        feature_names: column names given to incoming arrays, in order.

    Each call to predict_binary_prob also stores the intermediate objects on the
    instance as ``dataframe``, ``h2oframe`` and ``predictions``.
    """

    def __init__(self, h2o_model, feature_names):
        self.feature_names = feature_names
        self.h2o_model = h2o_model

    def predict_binary_prob(self, X):
        """Score ``X`` with the wrapped model and return the last prediction column as float64.

        ``X`` may be a 2-D array-like with one column per entry of ``feature_names``,
        or a single pandas Series, which is treated as one record.
        The column lists are read from the module-level ``NUM`` and ``CAT`` names.
        The last column is presumably the positive-class probability - confirm against
        the prediction frame layout of the model in use.
        """
        # A lone record arrives as a Series; reshape it into a one-row 2-D array
        if isinstance(X, pd.Series):
            X = X.values.reshape(1, -1)
        self.dataframe = pd.DataFrame(X, columns=self.feature_names)

        # Continuous explanatory variables are cast to float
        for numeric_col in NUM:
            self.dataframe[numeric_col] = self.dataframe[numeric_col].astype(float)

        # Categorical explanatory variables are cast to string, then nulls are filled.
        # NOTE(review): astype(str) likely turns NaN into the text 'nan' before fillna runs,
        # so the 'missing' fill may never apply - confirm against the training preprocessing.
        for categorical_col in CAT:
            self.dataframe[categorical_col] = self.dataframe[categorical_col].astype(str)
            self.dataframe = self.dataframe.fillna(value={categorical_col: 'missing'})

        # Move the data into H2O and mark the categorical columns as factors
        self.h2oframe = h2o.H2OFrame(self.dataframe)
        for categorical_col in CAT:
            self.h2oframe[categorical_col] = self.h2oframe[categorical_col].asfactor()

        prediction_frame = self.h2o_model.predict(self.h2oframe).as_data_frame()
        self.predictions = prediction_frame.values
        return self.predictions.astype('float64')[:, -1]

In [None]:
# Computing Shapley values for H2O models takes a while, so only 20 test records are explained.
# Increase the sample size to deepen the analysis.
test_features = dataprep_df.query('dataset == "test"').loc[:, selected_features]
# Fixed random_state keeps the sample reproducible; nulls are replaced with 0 for the explainer
shap_sample = test_features.sample(n=20, replace=False, random_state=1).fillna(0)

In [None]:
# Wrap the best model so SHAP can call it as a plain prediction function over selected_features
h2o_wrapper = H2oProbWrapper(melhor_modelo, selected_features)
# shap_sample is passed both to the explainer constructor and as the set of records to explain
h2o_explainer = shap.KernelExplainer(h2o_wrapper.predict_binary_prob, shap_sample)
h2o_shap_values = h2o_explainer.shap_values(shap_sample, nsamples="auto")

## Main SHAP Graphics

In [None]:
# Bar-type summary plot of the SHAP values computed for the sampled records
fig = shap.summary_plot(h2o_shap_values, shap_sample, plot_type="bar", show=True)
display(fig)

In [None]:
# Default summary plot of the same SHAP values; show=False is passed and the result is handed to display()
display(shap.summary_plot(h2o_shap_values, shap_sample, show=False))

In [None]:
# Sort the feature indexes by their importance in the model
# (sum of SHAP value magnitudes over the explained records), most important first
top_inds = np.argsort(-np.sum(np.abs(h2o_shap_values), 0))

# Make SHAP dependence plots for up to the nine most important features.
# Slicing (instead of indexing 0..8) avoids an IndexError when fewer than nine features exist.
for feature_idx in top_inds[:9]:
    shap.dependence_plot(feature_idx, h2o_shap_values, shap_sample, show=False)

In [None]:
# Tabulate the SHAP values and append the row-wise total as 'sum_shap'
# (rows are presumably the explained records and columns the features - confirm)
df_shap_values = pd.DataFrame(h2o_shap_values)
df_shap_values = df_shap_values.assign(sum_shap=df_shap_values.sum(axis=1))

### Shap Force Plot

In [None]:
# Force plots for the three records with the smallest total SHAP contribution
lowest_sum_rows = df_shap_values.sort_values(by='sum_shap').iloc[0:3, :].index.values
for i in lowest_sum_rows:
    display(
        shap.force_plot(
            h2o_explainer.expected_value,
            h2o_shap_values[i, :],
            shap_sample.iloc[i, :],
            matplotlib=True,
            show=True,
        )
    )

### Shap Waterfall Plot

In [None]:
# Waterfall plots for the three records with the smallest total SHAP contribution
for i in df_shap_values.sort_values(by='sum_shap').iloc[0:3,:].index.values:
    # NOTE(review): waterfall_legacy is reached through a private module (shap.plots._waterfall),
    # so it may move or disappear between SHAP versions
    fig = shap.plots._waterfall.waterfall_legacy(h2o_explainer.expected_value, h2o_shap_values[i,:], shap_sample.iloc[i,:].to_numpy(), selected_features, show=True)
    display(fig)

# 8. Predict test dataset using MOJO or H2O Model

In [None]:
# Import the Kaggle test dataset that will be scored to build the submission file
submission_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Text preprocessing of the submission tweets, followed by TF-IDF features computed with
# the vectorizer that already exists in the notebook (tfidf_vectorizer).

# Explicit import instead of a wildcard so the only name brought in is the one used here
from nltk.stem.porter import PorterStemmer

# Removing Twitter handles (@user)
# NOTE(review): '[/w]' matches only the characters '/' and 'w'; '\w' was probably intended.
# The pattern is left untouched because it must stay identical to the one applied to the
# training data - confirm there before changing it.
submission_df['tidy_text'] = np.vectorize(remove_pattern)(submission_df['text'], '@[/w]*')

# Removing Punctuations, Numbers, and Special Characters
# regex=True is explicit: newer pandas versions treat the pattern as a literal string by default,
# which would silently leave the text unchanged
submission_df['tidy_text'] = submission_df['tidy_text'].str.replace('[^a-zA-Z#]', ' ', regex=True)

# Removing Short Words
submission_df['tidy_text'] = submission_df['tidy_text'].apply(
    lambda text: ' '.join([word for word in text.split() if len(word) > 2])
)

# Tokenizing
tokenized_text = submission_df['tidy_text'].apply(lambda text: text.split())

# Stemming, then joining the stemmed tokens back into one string per tweet
stemmer = PorterStemmer()
tokenized_text = tokenized_text.apply(
    lambda tokens: ' '.join(stemmer.stem(token) for token in tokens)
)
submission_df['tidy_text'] = tokenized_text

# TF-IDF
# Only transform here: re-fitting on the submission data would learn a new vocabulary and
# IDF weights, producing columns that do not match the features the model was trained on.
tfidf = tfidf_vectorizer.transform(submission_df['tidy_text'])

# Set tfidf feature names
# get_feature_names() was removed from recent scikit-learn releases; prefer the newer
# accessor when it exists and fall back to the old one otherwise
get_feature_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None)
if get_feature_names is None:
    get_feature_names = tfidf_vectorizer.get_feature_names
df_tfidf = pd.DataFrame(tfidf.toarray(), columns=list(get_feature_names()))

# Join on the row index; a TF-IDF term that collides with an existing column gets the '_y' suffix
submission_df = submission_df.merge(df_tfidf, left_index=True, right_index=True, suffixes=('', '_y'))

In [None]:
# Numeric features must be float type (columns absent from the submission data are skipped)
for col_name in [name for name in NUM if name in submission_df.columns]:
    submission_df[col_name] = submission_df[col_name].astype(float)

# Categorical features must be string type and null values will be filled with "missing"
# NOTE(review): astype(str) likely turns NaN into the text 'nan' before fillna runs, so the
# 'missing' fill may never apply - confirm against the training preprocessing.
for col_name in [name for name in CAT if name in submission_df.columns]:
    submission_df[col_name] = submission_df[col_name].astype(str)
    submission_df = submission_df.fillna(value={col_name: 'missing'})

In [None]:
# Score the submission data. The exported MOJO is tried first; if that fails, the
# in-memory H2O model (melhor_modelo) is used instead.
try:
    test_tmp = h2o.mojo_predict_pandas(submission_df, caminho_modelo_mojo)
    predict_df = submission_df.merge(test_tmp, left_index=True, right_index=True)
except Exception as mojo_error:
    # Catch Exception rather than everything so a kernel interrupt still stops the cell,
    # and report why the fallback was taken instead of hiding the failure
    print('MOJO prediction failed (%s); falling back to the H2O model' % mojo_error)
    submission_hdf = h2o.H2OFrame(submission_df)
    for col_name in CAT:
        # Same presence guard as the dtype-coercion cell: skip columns missing from the data
        if col_name in submission_hdf.columns:
            submission_hdf[col_name] = submission_hdf[col_name].asfactor()
    h2o_predict = melhor_modelo.predict(submission_hdf)
    predict_df = h2o_predict.cbind(submission_hdf).as_data_frame()

# Remove any existing 'target' column so the predicted label can take that name;
# errors='ignore' keeps the cell from failing when no such column is present
predict_df = predict_df.drop(columns=['target'], errors='ignore')
predict_df = predict_df.rename(columns={'predict': 'target'})
predict_df.loc[:, ['id', 'target']]

# 9. Save final dataset with predictions

In [None]:
# Write only the id and target columns to the submission CSV, without the DataFrame index
predict_df.loc[:, ('id', 'target')].to_csv('/kaggle/working/disaster_tweets_submission.csv', index=False)

## END