## TabNet experiment using the loan approval dataset

Adapted from the GeeksforGeeks TabNet example: https://www.geeksforgeeks.org/tabnet/

In [1]:
# Install TabNet
! pip install pytorch-tabnet
  


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [2]:
# imports necessary modules
from pytorch_tabnet.tab_model import TabNetClassifier
  
import os
import torch
import pandas as pd
import numpy as np
  
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be reached through ordinary paths.
# NOTE(review): google.colab is presumably only importable inside Colab, which
# would explain why this import is not in the main import cell — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Make the experiment folder on the mounted Drive the working directory, so
# the relative data directory used when reading the CSV files resolves under it.
%cd /content/drive/MyDrive/tabnet_experiment

/content/drive/MyDrive/tabnet_experiment


In [5]:
# Overall parameters: folder (relative to the current working directory)
# from which train.csv and test.csv are read.
data_directory = "data"

In [6]:
# Load the training data (the test set is loaded in its own cell) and show
# how many values are missing in each column. Only the last expression of a
# cell is displayed, so the preview of the first rows lives in a separate cell.
data = pd.read_csv(os.path.join(data_directory, 'train.csv'))
data.isna().sum()


Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
# Preview the first five rows of the training data
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [8]:
# load test data
test_data = pd.read_csv(data_directory+'/test.csv')

test_data.isna().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [9]:
# Preview the first five rows of the test data
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [10]:
# (rows, columns) of the training frame; Loan_ID is still a regular column here
data.shape

(614, 13)

In [11]:
# (rows, columns) of the test frame; Loan_ID is still a regular column here
test_data.shape

(367, 12)

In [12]:
# Use the loan identifier as the row index so it is kept out of the feature
# columns. Reassigning (instead of inplace=True) keeps the lineage explicit.
data = data.set_index('Loan_ID')
test_data = test_data.set_index('Loan_ID')

# Replace missing values: back-fill each gap from the next row, then
# forward-fill so gaps in the trailing rows (which have no later value to copy
# from) are covered as well. DataFrame.bfill()/ffill() are the supported
# replacements for the deprecated fillna(method=...) form.
data = data.bfill().ffill()
test_data = test_data.bfill().ffill()

In [13]:
# Encode each categorical training column as integer labels.
# Every encoder is kept under its own name because the test-set cell reuses
# the fitted encoders to apply the same mapping.
gen = LabelEncoder()
data['Gender'] = gen.fit_transform(data['Gender'])

s_type = LabelEncoder()
data['Married'] = s_type.fit_transform(data['Married'])

n_dep = LabelEncoder()
data['Dependents'] = n_dep.fit_transform(data['Dependents'])

edu = LabelEncoder()
data['Education'] = edu.fit_transform(data['Education'])

s_emp = LabelEncoder()
data['Self_Employed'] = s_emp.fit_transform(data['Self_Employed'])

c_history = LabelEncoder()
data['Credit_History'] = c_history.fit_transform(data['Credit_History'])

p_area = LabelEncoder()
data['Property_Area'] = p_area.fit_transform(data['Property_Area'])

# Target column: the fitted encoder holds the label <-> integer mapping
l_status = LabelEncoder()
data['Loan_Status'] = l_status.fit_transform(data['Loan_Status'])

In [14]:
# Apply the encoders fitted on the training data to the matching test columns,
# so both sets share the same category -> integer mapping.
fitted_encoders = {
    'Gender': gen,
    'Married': s_type,
    'Dependents': n_dep,
    'Education': edu,
    'Self_Employed': s_emp,
    'Credit_History': c_history,
    'Property_Area': p_area,
}
for column_name, encoder in fitted_encoders.items():
    test_data[column_name] = encoder.transform(test_data[column_name])

In [15]:
# Split the training frame into a 2-D feature matrix (every column except the
# target) and a 1-D label vector, both as NumPy arrays.
target_column = 'Loan_Status'
X = data.drop(columns=target_column).to_numpy()
y = data[[target_column]].to_numpy().flatten()

In [16]:
# 5-fold cross-validation: a fresh TabNet classifier is trained on every split
# and one score per fold is collected in CV_score_array.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
CV_score_array = []
for train_index, test_index in kf.split(X):
    # Rows held out by this split serve as the validation fold
    X_train, y_train = X[train_index], y[train_index]
    X_valid, y_valid = X[test_index], y[test_index]

    tb_cls = TabNetClassifier(
        optimizer_fn=torch.optim.Adam,
        optimizer_params={"lr": 1e-3},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        scheduler_params=dict(step_size=10, gamma=0.9),
        mask_type='entmax',  # alternative: "sparsemax"
    )
    tb_cls.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['accuracy'],
        max_epochs=1000,
        patience=100,
        batch_size=28,
        drop_last=False,
    )
    # NOTE(review): best_cost is presumably the best validation accuracy seen
    # during fit — confirm against the pytorch-tabnet documentation.
    CV_score_array.append(tb_cls.best_cost)

Device used : cuda
epoch 0  | loss: 0.86096 | train_accuracy: 0.34012 | valid_accuracy: 0.43902 |  0:00:00s
epoch 1  | loss: 0.80519 | train_accuracy: 0.41955 | valid_accuracy: 0.47154 |  0:00:01s
epoch 2  | loss: 0.74189 | train_accuracy: 0.52749 | valid_accuracy: 0.5122  |  0:00:01s
epoch 3  | loss: 0.70653 | train_accuracy: 0.66599 | valid_accuracy: 0.63415 |  0:00:02s
epoch 4  | loss: 0.68186 | train_accuracy: 0.70468 | valid_accuracy: 0.63415 |  0:00:02s
epoch 5  | loss: 0.65176 | train_accuracy: 0.71079 | valid_accuracy: 0.63415 |  0:00:03s
epoch 6  | loss: 0.63351 | train_accuracy: 0.70672 | valid_accuracy: 0.63415 |  0:00:03s
epoch 7  | loss: 0.62282 | train_accuracy: 0.70672 | valid_accuracy: 0.63415 |  0:00:04s
epoch 8  | loss: 0.5952  | train_accuracy: 0.6945  | valid_accuracy: 0.63415 |  0:00:04s
epoch 9  | loss: 0.62598 | train_accuracy: 0.69857 | valid_accuracy: 0.63415 |  0:00:05s
epoch 10 | loss: 0.58432 | train_accuracy: 0.70061 | valid_accuracy: 0.63415 |  0:00:05s
ep

In [17]:
# One score per cross-validation fold, in the order the folds were run
CV_score_array

[0.8048780487804879,
 0.7886178861788617,
 0.8455284552845529,
 0.7723577235772358,
 0.819672131147541]

In [19]:
# Build the test feature matrix from exactly the training feature columns, in
# the same order, so every position lines up with what the model was fitted on
# (a missing column now fails here with a KeyError instead of at predict time).
feature_columns = [col for col in data.columns if col != 'Loan_Status']
X_test = test_data[feature_columns].to_numpy()

# The test file loaded above has no Loan_Status column, so there are no labels
# to score against: y_test stays an empty array in that case (as before) and is
# only populated when a labelled test set is supplied.
if 'Loan_Status' in test_data.columns:
    y_test = test_data['Loan_Status'].to_numpy()
else:
    y_test = np.empty(0)

In [20]:
# Predict on the test set with the classifier left over from the final
# cross-validation fold, then decode the integer class predictions back to the
# original Loan_Status labels with the encoder fitted on the training target.
# predict() yields class labels rather than probabilities, so no threshold is
# applied; decoding through l_status keeps the label mapping in one place.
predictions = l_status.inverse_transform(tb_cls.predict(X_test)).tolist()

In [21]:
# Display the predicted label for every test row.
# NOTE(review): this echoes the full list; a slice or a value count would keep
# the notebook output short.
predictions

['Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'N',
 'N',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'N',
 'Y',
 'N',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'N',
 'N',
 'Y',
 'Y',
 'Y',
 'N',
 'N',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'N',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'N',
 'N',
 'N'