In [3]:
import numpy as np
import torch
import pandas as pd
import matplotlib as plt
import torch.nn as nn
from torch.nn import Linear, ReLU, BatchNorm1d

In [4]:
class GBN(nn.Module):
    """Ghost Batch Normalization.

    Splits the incoming batch along dim 0 into "virtual" batches of at most
    `vbs` rows and runs the same BatchNorm1d over each one separately, then
    stitches the results back together in the original order.

    Args:
        inp: number of features (passed to BatchNorm1d).
        vbs: virtual batch size, i.e. the maximum number of rows per chunk.
        momentum: momentum passed to BatchNorm1d.
    """
    def __init__(self,inp,vbs=128,momentum=0.01):
        super().__init__()
        self.bn = nn.BatchNorm1d(inp,momentum=momentum)
        self.vbs = vbs
    def forward(self,x):
        # Ceil division: a trailing partial virtual batch gets its own chunk, and a
        # batch smaller than vbs still yields one chunk (floor division gave 0
        # chunks there, which torch.chunk rejects).
        n_chunks = max(1, -(-x.size(0) // self.vbs))
        chunks = torch.chunk(x, n_chunks, 0)
        return torch.cat([self.bn(chunk) for chunk in chunks], 0)

In [5]:
class AttentionTransformer(nn.Module):
    """Produces the feature-selection mask for one decision step.

    Pipeline: Linear(d_a -> inp_dim) -> ghost batch norm -> multiply by the
    prior scales -> Sparsemax.

    Args:
        d_a: width of the attention features coming from the previous step.
        inp_dim: number of input features; the mask has this many columns.
        relax: relaxation factor, stored as `self.r`.
        vbs: virtual batch size forwarded to GBN.

    NOTE(review): `Sparsemax` is not defined in the visible part of this
    notebook; it must be in scope before this class is instantiated.
    """
    def __init__(self,d_a,inp_dim,relax,vbs=128):
        super().__init__()
        self.fc = nn.Linear(d_a,inp_dim)
        # The linear layer emits inp_dim features, so the norm is sized by inp_dim
        # (the previous `out_dim` was an undefined name and raised NameError).
        self.bn = GBN(inp_dim,vbs=vbs)
        self.smax = Sparsemax()
        self.r = relax
    #a:feature from previous decision step
    def forward(self,a,priors):
        a = self.bn(self.fc(a))
        mask = self.smax(a*priors)
        # NOTE(review): the old `priors = priors*(self.r-mask)` only rebound a local
        # name and never reached the caller, so it was dropped. The caller has to
        # apply `priors * (self.r - mask)` itself to carry the prior forward.
        return mask

In [6]:
class GLU(nn.Module):
    """Gated linear unit: linear layer -> ghost batch norm -> sigmoid gate.

    The linear layer produces 2*out_dim columns; the first half is the value
    and the second half, squashed by a sigmoid, is the gate.

    Args:
        inp_dim: input width (only used when no `fc` is supplied).
        out_dim: output width of the block.
        fc: optional ready-made layer used instead of a new nn.Linear.
        vbs: virtual batch size forwarded to GBN.
    """
    def __init__(self,inp_dim,out_dim,fc=None,vbs=128):
        super().__init__()
        self.od = out_dim
        # Reuse the caller's layer when one is given, otherwise build a private one.
        self.fc = fc if fc else nn.Linear(inp_dim,out_dim*2)
        self.bn = GBN(out_dim*2,vbs=vbs)
    def forward(self,x):
        normed = self.bn(self.fc(x))
        value = normed[:,:self.od]
        gate = normed[:,self.od:]
        return value*torch.sigmoid(gate)
class FeatureTransformer(nn.Module):
    """Stack of GLU blocks: optional shared blocks followed by independent ones.

    The first block of the stack maps inp_dim -> out_dim and is applied directly.
    Every later block maps out_dim -> out_dim and is applied as a residual,
    `(x + glu(x)) * sqrt(0.5)`.

    Args:
        inp_dim: width of the incoming features.
        out_dim: width produced by every GLU block.
        shared: sequence of linear layers reused inside the shared GLU blocks,
            or a falsy value for no shared blocks.
        n_ind: number of independent GLU blocks (at least one is always built
            when there are no shared blocks).
        vbs: virtual batch size forwarded to every GLU.
    """
    def __init__(self,inp_dim,out_dim,shared,n_ind,vbs=128):
        super().__init__()
        first = True
        self.shared = nn.ModuleList()
        if shared:
            self.shared.append(GLU(inp_dim,out_dim,shared[0],vbs=vbs))
            first= False
            for fc in shared[1:]:
                self.shared.append(GLU(out_dim,out_dim,fc,vbs=vbs))
        else:
            self.shared = None
        self.independ = nn.ModuleList()
        if first:
            # No shared blocks: the first independent block has to do the
            # inp_dim -> out_dim projection (was `inp`, an undefined name).
            self.independ.append(GLU(inp_dim,out_dim,vbs=vbs))
        # `first` is a bool used as 0/1: skip one slot if the projection block was added.
        for x in range(first, n_ind):
            self.independ.append(GLU(out_dim,out_dim,vbs=vbs))
        # Registered as a buffer so it follows the module across .to()/.cuda() instead
        # of depending on a global `device`; non-persistent keeps state_dict unchanged.
        self.register_buffer("scale", torch.sqrt(torch.tensor([.5])), persistent=False)
    def forward(self,x):
        independ = list(self.independ)
        if self.shared:
            x = self.shared[0](x)
            for glu in self.shared[1:]:
                x = torch.add(x, glu(x))
                x = x*self.scale
        else:
            # The projection block changes the width, so it cannot be a residual.
            x = independ[0](x)
            independ = independ[1:]
        for glu in independ:
            x = torch.add(x, glu(x))
            x = x*self.scale
        return x

In [7]:
class DecisionStep(nn.Module):
    """One decision step: build a mask, then transform the masked input.

    Args:
        inp_dim: number of input features.
        n_d: width of the decision part of the step output.
        n_a: width of the attention part of the step output.
        shared: shared linear layers handed to the FeatureTransformer.
        n_ind: number of independent GLU blocks in the FeatureTransformer.
        relax: relaxation factor handed to the AttentionTransformer.
        vbs: virtual batch size.

    forward(x, a, priors) returns `(features, sparse_loss)`, where `features`
    has n_d+n_a columns and `sparse_loss` is the mean of -mask*log(mask+1e-10).
    """
    def __init__(self,inp_dim,n_d,n_a,shared,n_ind,relax,vbs=128):
        super().__init__()
        step_width = n_d+n_a
        self.fea_tran = FeatureTransformer(inp_dim,step_width,shared,n_ind,vbs)
        self.atten_tran = AttentionTransformer(n_a,inp_dim,relax,vbs)
    def forward(self,x,a,priors):
        mask = self.atten_tran(a,priors)
        # Entropy of the mask; the small epsilon keeps log() finite where mask is 0.
        neg_entropy = mask*torch.log(mask+1e-10)
        sparse_loss = torch.neg(neg_entropy).mean()
        masked_input = x*mask
        return self.fea_tran(masked_input),sparse_loss

In [8]:
class TabNet(nn.Module):
    """TabNet-style network: an initial feature transformer plus n_steps-1 decision steps.

    Args:
        inp_dim: number of input features.
        final_out_dim: width of the final linear head.
        n_d: width of the decision part of each step output.
        n_a: width of the attention part of each step output.
        n_shared: number of linear layers shared by every feature transformer.
        n_ind: number of step-specific GLU blocks per feature transformer.
        n_steps: total step count; the first one is `first_step`, the remaining
            n_steps-1 are DecisionStep modules.
        relax: relaxation factor used when updating the prior scales.
        vbs: virtual batch size for ghost batch norm.

    forward(x) returns `(output, sparse_loss)`: the head applied to the summed
    ReLU'd decision features, and a tensor of shape (1,) holding the summed
    per-step sparsity loss.
    """
    def __init__(self,inp_dim,final_out_dim,n_d=64,n_a=64,
                 n_shared=2,n_ind=2,n_steps=5,relax=1.2,vbs=128):
        super().__init__()
        if n_shared>0:
            self.shared = nn.ModuleList()
            self.shared.append(nn.Linear(inp_dim,2*(n_d+n_a)))
            for x in range(n_shared-1):
                self.shared.append(nn.Linear(n_d+n_a,2*(n_d+n_a)))
        else:
            self.shared=None
        # vbs is forwarded here too; previously first_step silently used the default 128.
        self.first_step = FeatureTransformer(inp_dim,n_d+n_a,self.shared,n_ind,vbs)
        self.steps = nn.ModuleList()
        for x in range(n_steps-1):
            self.steps.append(DecisionStep(inp_dim,n_d,n_a,self.shared,n_ind,relax,vbs))
        self.fc = nn.Linear(n_d,final_out_dim)
        self.bn = nn.BatchNorm1d(inp_dim)
        self.n_d = n_d
    def forward(self,x):
        x = self.bn(x)
        # Only the attention half of the first transformer output is used.
        x_a = self.first_step(x)[:,self.n_d:]
        sparse_loss = torch.zeros(1,device=x.device)
        out = torch.zeros(x.size(0),self.n_d,device=x.device)
        priors = torch.ones(x.shape,device=x.device)
        for step in self.steps:
            # The step is unrolled here (same operations as DecisionStep.forward)
            # because the mask is needed to update the prior scales and
            # DecisionStep.forward does not return it.
            mask = step.atten_tran(x_a,priors)
            l = ((-1)*mask*torch.log(mask+1e-10)).mean()
            x_te = step.fea_tran(x*mask)
            # Features already selected get down-weighted in later steps;
            # without this update `relax` had no effect at all.
            priors = priors*(step.atten_tran.r-mask)
            # torch.relu instead of F.relu: `F` was never imported.
            out = out + torch.relu(x_te[:,:self.n_d])
            x_a = x_te[:,self.n_d:]
            sparse_loss = sparse_loss + l
        return self.fc(out),sparse_loss

In [9]:
pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [10]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

pytorch_tabnet.tab_model.

In [11]:
# Read the raw file; header=None means no row is treated as column names,
# so the columns start out with integer labels.
dfs = pd.read_csv(
    "adult.data",
    header=None,
)

In [12]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [13]:
# Number of distinct values in each column (columns still carry integer labels here).
dfs.nunique()

0        73
1         9
2     21648
3        16
4        16
5         7
6        15
7         6
8         5
9         2
10      119
11       92
12       94
13       42
14        2
dtype: int64

In [14]:
# Human-readable names for the 15 columns, in file order; 'earn' is the last one.
cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'earn',
]
dfs.columns = cols

In [15]:
# Preview the first rows now that the columns have names.
dfs.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,earn
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [16]:
### Preprocessing
from sklearn.preprocessing import LabelEncoder

# Cardinalities and dtypes are captured once, before the loop rewrites columns in place.
nunique = dfs.nunique()
types = dfs.dtypes

categorical_columns = []
categorical_dims = {}

## Label Encoding
# A column is encoded when it holds strings or has fewer than 200 distinct values;
# everything else is left as it is.
for col in dfs.columns:
    if types[col] != 'object' and nunique[col] >= 200:
        continue
    print(col, dfs[col].nunique())
    l_enc = LabelEncoder()
    # Missing entries become their own "Fill Na" category before encoding.
    dfs[col] = l_enc.fit_transform(dfs[col].fillna("Fill Na").values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)


age 73
workclass 9
education 16
education-num 16
marital-status 7
occupation 15
relationship 6
race 5
sex 2
capital-gain 119
capital-loss 92
hours-per-week 94
native-country 42
earn 2


In [17]:
# Show which columns were encoded and how many classes each one has, one object per line.
print(categorical_columns, categorical_dims, sep="\n")

['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'earn']
{'age': 73, 'workclass': 9, 'education': 16, 'education-num': 16, 'marital-status': 7, 'occupation': 15, 'relationship': 6, 'race': 5, 'sex': 2, 'capital-gain': 119, 'capital-loss': 92, 'hours-per-week': 94, 'native-country': 42, 'earn': 2}


In [18]:
# Preview the frame after label encoding: encoded columns now hold integer codes.
dfs.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,earn
0,22,7,77516,9,12,4,1,1,4,1,25,0,39,39,0
1,33,6,83311,9,12,2,4,0,4,1,0,0,12,39,0
2,21,4,215646,11,8,0,6,1,4,1,0,0,39,39,0
3,36,4,234721,1,6,2,6,0,2,1,0,0,39,39,0
4,11,4,338409,9,12,2,10,5,2,0,0,0,39,5,0


In [19]:
# Fill remaining NaN values with the mean of the column they belong to.
# The previous version used `col` left over from the encoding loop, so every NaN
# in every column would have been filled with the mean of that one last column.
dfs_indexes = dfs.index
dfs = dfs.fillna(dfs.mean(numeric_only=True))

In [20]:
# Model inputs: every named column except the label.
feature_columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
]
X = dfs[feature_columns]

# Prediction target.
y = dfs['earn']

In [21]:
from sklearn.model_selection import train_test_split

train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.10
# Fixed seed so both shuffles, and every metric computed downstream, are reproducible.
SPLIT_SEED = 42

# Divide to 80/10/10: hold out 20% first, then split that hold-out into validation and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 - train_ratio, random_state=SPLIT_SEED)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=SPLIT_SEED)

# Keep the original row labels of each split for later lookups into dfs.
train_indices = X_train.index
valid_indices = X_val.index
test_indices = X_test.index

In [22]:
# Convert all six splits to plain NumPy arrays in one pass.
X_train, y_train, X_val, y_val, X_test, y_test = (
    np.asarray(split)
    for split in (X_train, y_train, X_val, y_val, X_test, y_test)
)

In [25]:
# from sklearn.metrics import mean_squared_error
# inp_dim = X_train.shape[1]
# final_out_dim = 1

# clf = TabNet(inp_dim, 1)
# clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
# y_pred = clf.predict(X_test)

# u = mean_squared_error(y_test, y_pred, squared=False)
# print(u)

(26048, 14)

In [27]:
from sklearn.metrics import accuracy_score, mean_squared_error

# Adam at lr=2e-2, with the learning rate multiplied by 0.9 every 50 scheduler steps.
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size":50, "gamma":0.9},
    mask_type='entmax',
)

clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
y_pred = clf.predict(X_test)

# RMSE between true and predicted labels. The root is taken explicitly because the
# `squared=False` argument is deprecated and removed in newer scikit-learn releases.
u = np.sqrt(mean_squared_error(y_test, y_pred))
print(u)
# Accuracy is the more natural headline metric for a classifier.
print("test accuracy:", accuracy_score(y_test, y_pred))

Device used : cuda
epoch 0  | loss: 0.47204 | val_0_auc: 0.59224 |  0:00:01s
epoch 1  | loss: 0.37133 | val_0_auc: 0.59898 |  0:00:03s
epoch 2  | loss: 0.3564  | val_0_auc: 0.62016 |  0:00:04s
epoch 3  | loss: 0.34879 | val_0_auc: 0.65889 |  0:00:06s
epoch 4  | loss: 0.3412  | val_0_auc: 0.82206 |  0:00:07s
epoch 5  | loss: 0.33696 | val_0_auc: 0.86525 |  0:00:09s
epoch 6  | loss: 0.33505 | val_0_auc: 0.88116 |  0:00:10s
epoch 7  | loss: 0.33188 | val_0_auc: 0.8868  |  0:00:11s
epoch 8  | loss: 0.33182 | val_0_auc: 0.88876 |  0:00:13s
epoch 9  | loss: 0.32806 | val_0_auc: 0.89395 |  0:00:14s
epoch 10 | loss: 0.32625 | val_0_auc: 0.89697 |  0:00:16s
epoch 11 | loss: 0.32312 | val_0_auc: 0.89597 |  0:00:17s
epoch 12 | loss: 0.32562 | val_0_auc: 0.89893 |  0:00:19s
epoch 13 | loss: 0.32277 | val_0_auc: 0.89823 |  0:00:20s
epoch 14 | loss: 0.32167 | val_0_auc: 0.89926 |  0:00:21s
epoch 15 | loss: 0.32227 | val_0_auc: 0.89987 |  0:00:23s
epoch 16 | loss: 0.32264 | val_0_auc: 0.90016 |  0:00