<a href="https://colab.research.google.com/github/qAp/soil/blob/develop/nbs/01_tabnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model-fitting using TabNet

In [None]:
! pip install pytorch-tabnet

In [29]:
from google.colab import drive
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
from pytorch_tabnet.tab_model import TabNetRegressor

In [28]:
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
Path.ls = lambda src: list(src.iterdir())

In [None]:
! wget https://www.dropbox.com/s/paqtujasjz8lwr8/iscn_env_dataset.csv

In [5]:
iscn = pd.read_csv('iscn_env_dataset.csv')
iscn.dropna(inplace=True)
iscn.shape

(30932, 167)

In [6]:
covariate_families = ['EarthEnv_', 'WorldClim2_', 'SG_', 'humandev_']
other_covariates   = ['FanEtAl_Depth_to_Water_Table_AnnualMean', 'WCS_Human_Footprint_2009']

In [11]:
target = 'orgc_stock_organic'
features = [v for v in iscn.columns.values if v.startswith('WorldClim2_')]

In [8]:
iscn['split'] = np.random.choice(['train', 'valid', 'test'], size=len(iscn), p=[.98, .01, .01])

In [9]:
model = TabNetRegressor()

Device used : cuda


In [17]:
def get_data_split(df, split, features, target):
    df = df[df.split==split]
    X = df[features].values
    y = df[target].values.reshape(-1, 1)
    return X, y

In [18]:
X_train, y_train = get_data_split(iscn, 'train', features, target)
X_valid, y_valid = get_data_split(iscn, 'valid', features, target)
X_test , y_test  = get_data_split(iscn, 'test',  features, target)

In [19]:
model.fit(X_train, y_train, 
          eval_set=[(X_train, y_train), (X_valid, y_valid)],
          eval_name=['train', 'valid'],
          eval_metric=['mse', 'mae'],
          patience=50,
          batch_size=1024, virtual_batch_size=128,
          num_workers=0,
          drop_last=False)

epoch 0  | loss: 0.18589 | train_mse: 2.61971 | train_mae: 0.76108 | valid_mse: 2.19816 | valid_mae: 0.72735 |  0:00:02s
epoch 1  | loss: 0.05301 | train_mse: 0.11129 | train_mae: 0.18615 | valid_mse: 0.05921 | valid_mae: 0.18508 |  0:00:03s
epoch 2  | loss: 0.05122 | train_mse: 0.06416 | train_mae: 0.08857 | valid_mse: 0.01344 | valid_mae: 0.0792  |  0:00:05s
epoch 3  | loss: 0.05094 | train_mse: 0.05417 | train_mae: 0.05278 | valid_mse: 0.00499 | valid_mae: 0.04611 |  0:00:07s
epoch 4  | loss: 0.05055 | train_mse: 0.05747 | train_mae: 0.068   | valid_mse: 0.01023 | valid_mae: 0.06217 |  0:00:08s
epoch 5  | loss: 0.04972 | train_mse: 0.05522 | train_mae: 0.05617 | valid_mse: 0.00661 | valid_mae: 0.05053 |  0:00:10s
epoch 6  | loss: 0.04964 | train_mse: 0.05139 | train_mae: 0.03828 | valid_mse: 0.00282 | valid_mae: 0.03172 |  0:00:12s
epoch 7  | loss: 0.04906 | train_mse: 0.04984 | train_mae: 0.03389 | valid_mse: 0.00281 | valid_mae: 0.02808 |  0:00:13s
epoch 8  | loss: 0.04883 | train

In [25]:
y_pred = model.predict(X_test)
test_score = mean_absolute_error(y_test, y_pred)

print(f'Best validation score: {model.best_cost}')
print(f'Best test score: {test_score}')

Best validation score: 0.0016631230570341353
Best test score: 0.00019286763070214543


In [34]:
saving_pth = Path('/content/drive/MyDrive/soilshot') / 'simple_tabnet'
saved_pth = model.save_model(saving_pth)

Successfully saved model at /content/drive/MyDrive/soilshot/simple_tabnet.zip


In [36]:
loaded_model = TabNetRegressor()
loaded_model.load_model(saved_pth)

Device used : cuda
Device used : cuda


In [37]:
loaded_y_pred = loaded_model.predict(X_test)
loaded_test_score = mean_absolute_error(y_test, loaded_y_pred)

print(f'Best validation score: {model.best_cost}')
print(f'Best test score: {test_score}')

Best validation score: 0.0016631230570341353
Best test score: 0.00019286763070214543


# Useful Links

- https://github.com/dreamquark-ai/tabnet