## Deep Learning Approaches for Tabular Datasets

Author : Antoreep Jana, 2021 <br><br>
Please find the relevant presentation [here](https://docs.google.com/presentation/d/1fQT_5swBVRRUwWoDvdps_2plWiBLAVeGCUeny2ZmKS4/edit?usp=sharing).

Necessary Imports

In [None]:
import pandas as pd 
import os
import numpy as np

### Table of Contents

1. NNs & CNNs
2. Embeddings
3. DAE
4. TabNets
5. Deep Tables

#### 1. NNs & CNNs

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data -O sonar.csv

In [None]:
# Read the file from the current working directory: the download cell saves it
# with `-O sonar.csv`, so a relative path works on Kaggle and elsewhere alike.
df = pd.read_csv("sonar.csv", header = None)
dataset = df.values

# Columns 0-59 are cast to float and used as features; column 60 is the label.
X = dataset[:, 0:60].astype(float)
Y = dataset[:, 60]

In [None]:
from keras.models import Sequential
from keras.layers import Dense 
from keras.wrappers.scikit_learn import KerasClassifier 
from sklearn.model_selection import cross_val_score 
from sklearn.preprocessing import LabelEncoder 

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=21,
    test_size=0.33,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Display the shape of the scaled training matrix
X_train.shape

In [None]:
# Display the training labels as they come out of the split (not yet encoded)
y_train

In [None]:
# Display the shape of the scaled test matrix
X_test.shape

In [None]:
def encode(x):
    """Map a class label to an integer: 1 for "M", 0 for anything else."""
    return 1 if x == "M" else 0
    
# Turn the string labels of both splits into 0/1 integer arrays via encode()
y_train = np.array(list(map(encode, y_train)))
y_test = np.array(list(map(encode, y_test)))

In [None]:
# Display the training labels after encoding to 0/1
y_train

In [None]:
from keras.layers import Dense
from keras.models import Sequential

# Fully connected binary classifier: 60 inputs -> 50 (relu) -> 8 (relu) -> 1 (sigmoid)
model = Sequential([
    Dense(50, activation='relu', input_shape=(60,)),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output unit
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the scaled features and 0/1 labels for 10 epochs with a batch size of 1
model.fit(X_train, y_train, epochs = 10, batch_size = 1, verbose = 1)

#### 2. Embeddings

Check out this notebook by Colin Morris: https://www.kaggle.com/colinmorris/embedding-layers <br>
In the meantime, let me do another implementation of embeddings.

#### 3. DAE

In [None]:
!pip install -q -U kaggler

In [None]:
import kaggler
from kaggler.preprocessing import DAE

In [None]:
# Settings for the DAE section
encoding_dim = 128  # passed to DAE as `encoding_dim` in a later cell
seed = 42  # passed to DAE as `random_state` in a later cell
n_fold = 5  # NOTE(review): not referenced in any visible cell
n_class = 4  # NOTE(review): not referenced in any visible cell

In [None]:
# Load the cat-in-the-dat training CSV and preview its first rows
train = pd.read_csv('../input/cat-in-the-dat/train.csv')
train.head()

In [None]:
# Remove the `id` column from `train` in place
train.drop(columns=['id'], inplace=True)

In [None]:
# `DataFrame.pop` removes the column from `train` and returns it, so `target`
# actually holds the labels (`drop(..., inplace=True)` would return None).
target = train.pop('target')

# Keep only the three columns that are fed to the DAE below
features = train[['bin_0', 'bin_1', 'bin_2']]

In [None]:
# Display the selected bin_0, bin_1 and bin_2 columns
features

In [None]:
# Cast the selected columns to int64 before handing them to the DAE
df_all = features.astype("int64")

# All three columns are declared categorical; there are no numeric columns
dae = DAE(
    cat_cols=['bin_0', 'bin_1', 'bin_2'],
    num_cols=[],
    encoding_dim=encoding_dim,
    n_layer=3,
    swap_prob=0.3,
    random_state=seed,
)
X = dae.fit_transform(df_all)

# Wrap the encoded matrix in a frame with one `dae1_<i>` column per output column
df_dae = pd.DataFrame(
    X,
    columns=[f'dae1_{idx}' for idx in range(X.shape[1])],
)
print(df_dae.shape)

In [None]:
# Save the DAE features to df_dae.csv without writing the index
df_dae.to_csv("df_dae.csv", index = False)

#### 4. TabNets

In [None]:
!pip install -q pytorch-tabnet

In [None]:
# Load the Santander Customer Satisfaction train, test and sample-submission CSVs
train_data = pd.read_csv('../input/santander-customer-satisfaction/train.csv')
test_data = pd.read_csv('../input/santander-customer-satisfaction/test.csv')
# NOTE(review): `sample` is not referenced in any visible cell
sample = pd.read_csv("../input/santander-customer-satisfaction/sample_submission.csv")

In [None]:
# Features: every column of the training frame except the last one
# NOTE(review): assumes TARGET is the last column of train.csv — confirm
X_train = train_data.iloc[:, :-1].to_numpy()
# Labels: the TARGET column as a NumPy array
y_train = train_data['TARGET'].to_numpy().squeeze()
# The test frame is converted as-is, with all of its columns
X_test = test_data.to_numpy()

Learn more about model parameters here -> https://github.com/dreamquark-ai/tabnet#model-parameters <br>
For demonstration purposes, we'll proceed without much configuration.

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split

# Hold out a stratified validation split. Without an `eval_set`, TabNet has no
# metric to monitor, so `eval_metric` and `patience` have no effect and no
# early stopping is performed.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size = 0.2, stratify = y_train, random_state = 21
)

classifier = TabNetClassifier(verbose = 0, seed = 21)
classifier.fit(
    X_train = X_fit,
    y_train = y_fit,
    eval_set = [(X_valid, y_valid)],  # AUC on this split drives early stopping
    eval_metric = ['auc'],
    patience = 5,
    max_epochs = 100,
)

In [None]:
# Keep column 1 of the predicted probabilities for the test rows
# (presumably the positive class — confirm the classifier's class order)
predictions = classifier.predict_proba(X_test)[:, 1]

#### 5. Deep Tables

Binary Classification

In [None]:
!pip install -q deeptables[gpu]

In [None]:
from deeptables.models.deeptable import DeepTable, ModelConfig
from deeptables.models.deepnets import WideDeep 
from deeptables.datasets import dsutils 
from sklearn.model_selection import train_test_split 

In [None]:
# Load the Adult dataset shipped with DeepTables; column 14 is used as the label
df_train = dsutils.load_adult()
X = df_train
y = df_train.pop(14)  # pop mutates df_train in place, so X no longer has the label

# WideDeep network, reporting AUC and accuracy
conf = ModelConfig(
    nets=WideDeep,
    metrics=['AUC', 'accuracy'],
    auto_discrete=True,
)
dt = DeepTable(config=conf)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=21,
    test_size=0.2,
)

model, history = dt.fit(X_train, y_train, epochs=100)

# Score the fitted model on the held-out rows
score = dt.evaluate(X_test, y_test)



In [None]:
# Show the evaluation result returned by dt.evaluate for the classification model
print("Score -> ", score)

Regression

In [None]:
# Load the Boston housing data from DeepTables and split off the "target" column
df_train = dsutils.load_boston()
X = df_train
y = df_train.pop("target")  # pop mutates df_train in place, so X no longer has the target

In [None]:
# Two 256-unit hidden layers with relu activation.
# NOTE(review): each hidden_units tuple is presumably (units, dropout, batch_norm) —
# confirm against the DeepTables docs.
dnn_settings = {
    'hidden_units': ((256, 0.3, True), (256, 0.3, True)),
    'dnn_activation': 'relu',
}
conf = ModelConfig(
    nets=['dnn_nets'],
    metrics=['RootMeanSquaredError'],
    dnn_params=dnn_settings,
    earlystopping_patience=5,
)

dt = DeepTable(config=conf)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=21,
    test_size=0.2,
)

model, history = dt.fit(X_train, y_train, epochs=100)

# Score the fitted model on the held-out rows
score = dt.evaluate(X_test, y_test)

In [None]:
# Show the evaluation result returned by dt.evaluate for the regression model
print(score)