In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training split (it carries the `target` column) and preview its first rows.
train_csv = "/kaggle/input/cat-in-the-dat-ii/train.csv"
train = pd.read_csv(train_csv)
train.head()
                    

In [None]:
# Read the test split (the rows to be scored) and preview its first rows.
test_csv = "/kaggle/input/cat-in-the-dat-ii/test.csv"
test = pd.read_csv(test_csv)
test.head()

In [None]:
# Summarise the training frame: column names, dtypes and non-null counts
train.info()


In [None]:
# Keep the label and the test row identifiers aside before removing them
# from the feature frames.
train_target = train["target"]
test_id = test["id"]

# Name the columns via the `columns=` keyword: passing `axis` positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises a TypeError in
# pandas 2.0+. Rebinding instead of `inplace=True` leaves the frames that the
# earlier cells displayed untouched.
train = train.drop(columns=["target", "id"])
test = test.drop(columns=["id"])


# Few important points about the data we have in the dataset
The dataset contains both numeric and categorical variables:

1. The following kinds of categorical variables are present in the dataset:

    1.1 Binary Variables - values like Male & Female.

    1.2 Ordinal Variables (low and high cardinality) - ordered values like "Good" "V Good" "Excellent".

    1.3 Nominal Variables (low and high cardinality) - do not have any intrinsic order, values like blood groups "A" "B" etc

We will convert all the non-categorical variables to categorical variables. Let's first get a list of the numeric variables.

In [None]:
# Collect the names of every column whose dtype is not `object`,
# i.e. the columns pandas parsed as numeric.
num_vars = [name for name in train if train[name].dtypes != 'O']
num_vars

Let's convert all numeric variables to categorical variables.

In [None]:
# Cast each numeric column to the pandas `category` dtype, in both the
# training and the test frame.
for frame in (train, test):
    for name in num_vars:
        frame[name] = frame[name].astype('category')


Let's cross-verify whether we have successfully converted the numeric fields to categorical fields.

In [None]:
# Re-inspect the column dtypes of the training frame to confirm that the
# formerly numeric columns are now reported as `category`
train.info()

We have successfully converted all the variables to categorical type.

Let's check the number of unique values for each variable.

In [None]:
# Print the cardinality (count of distinct non-null values) of every column.
unique_counts = train.nunique()
for name, n_unique in unique_counts.items():
    print(name, ":\n", n_unique, "\n")
    

There are lots of high cardinality features in the dataset.

# Model Based on DeepTables

DeepTables (DT) is an easy-to-use toolkit that enables deep learning to unleash great power on tabular data.

MLPs (also known as fully-connected neural networks) have been shown to be inefficient at learning distribution representations. The “add” operations of the perceptron layer have been shown to perform poorly at exploring multiplicative feature interactions. In most cases, manual feature engineering is necessary, and this work requires extensive domain knowledge and is very cumbersome.

DT follows these steps to build a neural network:

1. Category features -> Embedding Layer.

2. Continuous feature -> Dense Layer or to Embedding Layer after discretization/categorization.

3. Embedding/Dense layers -> Feature Interactions/Extractions nets.

4. Stacking(add/concat) outputs of nets as the output of the model.

# ModelConfig

ModelConfig is the most important parameter in DT. It is used to set how to clean and preprocess the data automatically, how to assemble the various network components into a neural network for prediction tasks, and how to set the hyper-parameters of the nets. If you do not change any settings in ModelConfig, DT will still work in most cases. However, you can get better performance by tuning the parameters in ModelConfig.

In [None]:
!pip install deeptables


In [None]:
from deeptables.models.deeptable import DeepTable, ModelConfig
from tensorflow.keras.utils import plot_model


In [None]:
# Training hyper-parameters, passed to `fit_cross_validation` further down.
n_folds = 5        # number of cross-validation folds
epochs = 1         # training epochs
batch_size = 128   # mini-batch size

In [None]:
%%time

# Settings for the fully connected (DNN) component.
# Each `hidden_units` entry is presumably (units, dropout rate, use batch-norm)
# -- confirm against the DeepTables ModelConfig documentation.
dnn_settings = {
    'hidden_units': ((300, 0.3, True), (300, 0.3, True),),
    'dnn_activation': 'relu',
}

# Settings for the CIN ('cin_nets') component.
cin_settings = {
    'cross_layer_size': (200, 200),
    'activation': 'relu',
    'use_residual': False,
    'use_bias': True,
    'direct': True,
    'reduce_D': False,
}

# Combine the linear, CIN and DNN nets; their outputs are stacked with 'add'.
conf = ModelConfig(
    nets=['linear', 'cin_nets', 'dnn_nets'],
    stacking_op='add',
    output_use_bias=False,
    fixed_embedding_dim=True,
    embeddings_output_dim=20,
    dnn_params=dnn_settings,
    cin_params=cin_settings,
)

dt = DeepTable(config=conf)

# Cross-validated fit on the training frame. The call returns three prediction
# arrays; only `test_prob` (predictions for `test`) is used later on.
oof_proba, eval_proba, test_prob = dt.fit_cross_validation(
    train,
    train_target,
    X_eval=None,
    X_test=test,
    num_folds=n_folds,
    stratified=False,
    iterators=None,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[],
    n_jobs=1,
)

In [None]:
# Build the submission table: the saved test ids in an 'Id' column next to the
# predicted probabilities, then write it to disk without the index.
sub = test_id.to_frame(name='Id').assign(target=test_prob)
sub.to_csv('submission_dt.csv', index=False)

Using the above model, I got a Private Score of 0.78676 and a Public Score of 0.78527.
I used epochs = 1 and n_folds = 5 just to make this submission quickly.

The model will likely produce much better results if you use n_folds = 50 and epochs = 100.



# Kindly upvote if you liked my kernel!