# This notebook is derived from Kaggle Grandmaster Bojan Tunguz's notebook:
https://www.kaggle.com/tunguz/elo-with-h2o-automl
This notebook merely builds on his fantastic feature engineering (194 features!) and tries to build a Deep Learning model using a new AutoML library known as Deep AutoViML. You can see its github here:
Github: https://github.com/AutoViML/deep_autoviml

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test CSV files of the "fathers-day-specials-just-the-features" input dataset.
train = pd.read_csv("/kaggle/input/fathers-day-specials-just-the-features/new_train.csv")
test = pd.read_csv("/kaggle/input/fathers-day-specials-just-the-features/new_test.csv")
print(train.shape)
# Last expression of the cell: the notebook displays the first rows of train.
train.head()

In [None]:
#train.to_csv('elo_new_train.csv',index=False)
#test.to_csv('elo_new_test.csv', index=False)

In [None]:
# Draw a reproducible 1% random sample of train (random_state is fixed).
# NOTE(review): trf is not used by any later cell visible here, apart from a commented-out export.
trf = train.sample(frac=0.01, random_state=99)
trf.shape

In [None]:
#trf.to_csv('elo_train.csv', index=False)

In [None]:
!pip install deep_autoviml --upgrade

In [None]:
from deep_autoviml import deep_autoviml as deepauto

In [None]:
# Inspect the size and the first rows of the test table.
print(test.shape)
test.head()

In [None]:
################################################################################################
def EDA_find_columns_with_infinity(df):
    """
    This function finds all columns in a dataframe that have infinite values (np.inf or -np.inf).

    Inputs:
        df: pandas DataFrame to scan. It is not modified.
    Returns:
        A list of column names, in the order of df.columns, that hold at least one
        np.inf or -np.inf value. If the list is empty, it means no columns were found.
    """
    add_cols = []
    for col in df.columns:
        values = df[col]
        ### a boolean mask is enough: no need to copy the matching rows just to count them
        inf_mask = (values == np.inf) | (values == -np.inf)
        ### to_numpy() gives one plain boolean answer whatever the shape of the mask
        if inf_mask.to_numpy().any():
            add_cols.append(col)
    return add_cols
#####################################################################################################
import copy
def FE_drop_rows_with_infinity(df, cols_list, fill_value=None):
    """
    This feature engineering function removes infinite values (np.inf and -np.inf) from the
    given columns, either by capping them or by dropping the rows that contain them.
    You might need this function during deep_learning models where infinite values don't work.

    Inputs:
        df: pandas DataFrame to clean. It is left untouched: the cleaned data is returned.
        cols_list: list of column names to clean (for example the list returned by
            EDA_find_columns_with_infinity).
        fill_value: works as an on/off switch, its actual value is never written into the data.
            Any truthy value (such as True) caps +inf with the largest finite value of the
            column and -inf with the smallest finite value of the column.
            None (the default) or any other falsy value drops the rows with infinity instead.
    Returns:
        The cleaned dataframe. When capping, it has the same rows as df. When dropping,
        it has only the rows without infinite values in the columns of cols_list.
    """
    print('    Shape of dataset initial: %s' %(df.shape[0]))
    ### a shallow copy is enough to protect the caller's list of names ###
    cols_to_fix = list(cols_list)
    init_rows = df.shape[0]
    if fill_value:
        ### work on a copy so the caller's dataframe is not modified (same as the drop branch) ###
        df = df.copy()
        for col in cols_to_fix:
            pos_inf = df[col] == np.inf
            neg_inf = df[col] == -np.inf
            has_pos_inf = pos_inf.any()
            has_neg_inf = neg_inf.any()
            if not (has_pos_inf or has_neg_inf):
                continue
            ### Only finite, non-missing values are valid caps. Sorting the unique values is
            ### not reliable here because NaN breaks the sort order, and a column without
            ### any finite value has nothing to cap with.
            finite_vals = df.loc[~(pos_inf | neg_inf), col].dropna()
            if finite_vals.empty:
                print('        could not cap %s since it has no finite values' %col)
                continue
            if has_pos_inf:
                df.loc[pos_inf, col] = finite_vals.max()  ## +inf is now the largest finite value
            if has_neg_inf:
                df.loc[neg_inf, col] = finite_vals.min()  ## -inf is now the smallest finite value
        print('        capped all rows with infinite values in data')
    else:
        for col in cols_to_fix:
            ### NaN rows are kept: only exact +inf / -inf matches are removed
            is_inf = (df[col] == np.inf) | (df[col] == -np.inf)
            df = df[~is_inf]
        dropped_rows = init_rows - df.shape[0]
        print('        dropped %d rows due to infinite values in data' %dropped_rows)
        print('    Shape of dataset after dropping rows: %s' %(df.shape[0]))
    ###  Double check that all columns have been fixed ###############
    ###  This scans every column of df, not only the ones in cols_list
    cols_with_infinity = EDA_find_columns_with_infinity(df)
    if cols_with_infinity:
        print('There are still %d columns with infinite values. Returning...' %len(cols_with_infinity))
    else:
        print('There are no columns with infinite values.')
    return df
##################################################################################

In [None]:
### There are 13 columns with Infinity and Negative Infinity in dataset
# Collect the names of the train columns holding +/-inf; the list is reused when cleaning train and test.
inf_cols = EDA_find_columns_with_infinity(train)
len(inf_cols)

In [None]:
# Name of the column that is passed to deepauto.fit as the prediction target.
target = "target"

# Too many inf and -inf values in train (~50%), which prevents Deep Learning models from working
### We will cap them with the next highest (or lowest) value of each column ##

In [None]:
### This is going to fill infinity rows with capped values
# The third argument is only tested for truthiness: True selects capping instead of dropping rows.
train1 = FE_drop_rows_with_infinity(train,inf_cols, True)
train1.head(2)

In [None]:
# Sanity check: this list should be empty if every infinite value in train1 was capped.
infcols2 = EDA_find_columns_with_infinity(train1)
infcols2

In [None]:
### we are going to do the same with test data ##
# NOTE(review): inf_cols was computed on train, so a test column that has infinite values
# but is not in that list is not cleaned by this call.
test1 = FE_drop_rows_with_infinity(test,inf_cols, True)
test1.head(2)

In [None]:
# Replace every remaining missing value with 0 in both tables.
train1 = train1.fillna(0)
test1 = test1.fillna(0)

# Run Deep_AutoViML using the cleaned train1 and test1 data

In [None]:
######   D E F A U L T S    S E T T I N G S   F O R   D E E P    A U T O  V I M L ###
# NOTE(review): the value used here is "fast1", which the advice in the trailing comment does not list - confirm.
keras_model_type =  "fast1" ## always try "fast" first, then "fast2", "auto", etc.
### always set early_stopping to True first and then change it to False
#### You always need 15 max_trials to get something decent #####
# NOTE(review): max_trials is set to 10 below, less than the 15 recommended above - confirm.
#### always set tuner to "storm" and then "optuna". 
### NLP char limit kicks off NLP processing. Feature Cross later.
project_name = "Elo"
# Options passed to deepauto.fit as model_options.
model_options = {'nlp_char_limit':50, 'cat_feat_cross_flag':False,
                 'max_trials': 10, "tuner": "storm"}
# Options passed to deepauto.fit as keras_options; lr_scheduler and optimizer are left as empty strings.
keras_options = {"patience":10, 'class_weight': True, 'early_stopping': True, 
                 'lr_scheduler': '', "optimizer": ''}

In [None]:
# Fit on the cleaned training table with the settings defined in the previous cell.
# The call returns the model together with cat_vocab_dict, both of which are passed to predict later.
model, cat_vocab_dict = deepauto.fit(train1, target, keras_model_type=keras_model_type,
                                     project_name=project_name, keras_options=keras_options,  
                                     model_options=model_options, save_model_flag=False, use_my_model='',
                                     model_use_case='', verbose=1)

In [None]:
# Predict on the cleaned test table, reusing the model and cat_vocab_dict returned by fit.
predictions = deepauto.predict(model, project_name, test_dataset=test1,
                                 keras_model_type=keras_model_type, 
                                 cat_vocab_dict=cat_vocab_dict)

In [None]:
# Keep the last element of what predict returned.
# NOTE(review): presumably the array of predicted target values - confirm against the deep_autoviml docs.
y_preds = predictions[-1]
y_preds[:15]

In [None]:
# Wrap the predictions in a Series and set every missing prediction to 0.
preds = pd.Series(y_preds)
missing_mask = preds.isnull()
preds.loc[missing_mask] = 0
preds

In [None]:
# Load the competition's sample submission file; its target column is overwritten further down.
sample_submission = pd.read_csv('/kaggle/input/elo-merchant-category-recommendation/sample_submission.csv')

In [None]:
# Overwrite the target column with the predictions by position (.values ignores the index of preds).
# NOTE(review): assumes preds is in the same row order as sample_submission - confirm.
sample_submission['target'] = preds.values
sample_submission.to_csv('submission_1.csv', index=False)

In [None]:
# Preview the first rows of the submission table.
sample_submission.head()