# Importing Libraryies

* System Append to set proper path

In [1]:
sys.path.append('../')

* Default

In [2]:
import lasio
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

* Pandas Option

In [3]:
pd.set_option('max_columns', None)

* Tqdm Progress Bar

In [4]:
%%capture
from tqdm import tqdm_notebook

# Checkpoint Import

In [5]:
df = pd.read_csv('../checkpoints/total_df.csv.gz', compression='gzip')

In [6]:
df.head()

Unnamed: 0,CALI,NPHI,RHOB,GR,DTC,RDEP,LITHOLOGY_GEOLINK,DEPTH,WELL_NAME
0,19.406,0.475704,1.789,64.497482,167.582153,1.574993,,493.493134,15_9-12
1,19.406,0.479429,1.754,62.406261,167.425064,1.569011,,493.645538,15_9-12
2,19.406,0.474963,1.778,62.629055,167.808395,1.57801,,493.797943,15_9-12
3,19.452999,0.504394,1.642,65.998596,169.244873,1.586024,,493.950348,15_9-12
4,19.452999,0.480163,1.563,64.997223,170.635086,1.603011,,494.102722,15_9-12


# Lithology Code Prediction

In [7]:
litho_data = df[pd.notnull(df['LITHOLOGY_GEOLINK'])].drop(columns=['WELL_NAME']) # not null dataframe (model training)

In [8]:
len(litho_data)

1294715

    * Converting Lithology Data to Integer

In [9]:
litho_data['LITHOLOGY_GEOLINK'] = litho_data['LITHOLOGY_GEOLINK'].astype(int)

    * Reducing Dataset Size

In [10]:
litho_data["Set"] = np.random.choice(["train_red", "rest"], p =[.6, .4], size=(litho_data.shape[0],))

train = litho_data[litho_data.Set == 'train_red']

In [11]:
print(train.shape, litho_data.shape)

(777065, 9) (1294715, 9)


    * Dataset Split

In [12]:
from sklearn.model_selection import train_test_split

X = train.drop(columns=['LITHOLOGY_GEOLINK', 'Set'])

Y = train['LITHOLOGY_GEOLINK']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state=42)

In [13]:
print('### Training Dataset: ### ', x_train.shape, y_train.shape)
print('### Validation Dataset: ### ', x_val.shape, y_val.shape)
print('### Test Dataset: ### ', x_test.shape, y_test.shape)

### Training Dataset: ###  (497321, 7) (497321,)
### Validation Dataset: ###  (124331, 7) (124331,)
### Test Dataset: ###  (155413, 7) (155413,)


In [14]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

x_train = std_scaler.fit_transform(x_train)
x_val = std_scaler.fit_transform(x_val)
x_test = std_scaler.fit_transform(x_test)

    * Hyper-Parameter Tunning --> Random Search

        * Parameter Grid Definition

In [15]:
grid = {
    "n_independent": [1, 4, 5, 10],
    "n_shared": [1, 2, 5], 
    "mask_type": ['entmax', 'sparsemax']
    #"n_a": [4, 16, 32, 64],
    #"n_d": [4, 16, 32, 64],
    #"gamma": [1, 1.2, 1.5, 2],
    #"n_steps": [3, 5, 7]}
     #"momentum": [0.6, 0.9, 0.98],
     #"lambda_sparse": [0, 1e-6, 1e-2, 0.1]
}

        * TabNet Call

In [16]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

        * Grid Search (n_experiment)

In [17]:
from sklearn.model_selection import ParameterGrid

test_grid =list(ParameterGrid(grid))

grid_result = pd.DataFrame()

mean_loss = []

mean_train_acc = []

mean_valid_acc = []

max_loss = []

max_train_acc = []

max_valid_acc = []

for param_grid in tqdm_notebook(test_grid, desc='Parameter Grid Search:'):

    print(param_grid)

    tabnet_rand = TabNetClassifier(**param_grid) # generating model

    tabnet_rand.fit(X_train=x_train, y_train=y_train,eval_set=[(x_train, y_train),(x_val, y_val)], eval_name=['train', 'valid'], eval_metric=['balanced_accuracy'], max_epochs=20, num_workers=25, drop_last=False, batch_size=32768, virtual_batch_size=16384, pin_memory=False) # fitting

    if len(grid_result) == 0:
        grid_result = pd.DataFrame.from_records([param_grid]) # storing first configuration

    else:
        grid_result = grid_result.append(param_grid, ignore_index=True) # storing configuration used on this iteration

    mean_loss.append(np.mean(tabnet_rand.history['loss']))

    mean_train_acc.append(np.mean(tabnet_rand.history['train_balanced_accuracy']))

    mean_valid_acc.append(np.mean(tabnet_rand.history['valid_balanced_accuracy']))

    max_loss.append(np.max(tabnet_rand.history['loss']))

    max_train_acc.append(np.max(tabnet_rand.history['train_balanced_accuracy']))

    max_valid_acc.append(np.max(tabnet_rand.history['valid_balanced_accuracy']))

    del tabnet_rand

grid_result['Mean Loss'] = mean_loss

grid_result['Mean Train Acc'] = mean_train_acc

grid_result['Mean Valid Acc'] = mean_train_acc

grid_result['Max Loss'] = max_loss

grid_result['Max Train Acc'] = max_train_acc

grid_result['Max Valid Acc'] = max_valid_acc

HBox(children=(FloatProgress(value=0.0, description='Parameter Grid Search:', max=24.0, style=ProgressStyle(de…

{'mask_type': 'entmax', 'n_independent': 1, 'n_shared': 1}
Device used : cuda
epoch 0  | loss: 2.95952 | train_balanced_accuracy: 0.08916 | valid_balanced_accuracy: 0.08899 |  0:00:08s
epoch 1  | loss: 2.11915 | train_balanced_accuracy: 0.089   | valid_balanced_accuracy: 0.08848 |  0:00:15s
epoch 2  | loss: 1.90716 | train_balanced_accuracy: 0.12235 | valid_balanced_accuracy: 0.12267 |  0:00:24s
epoch 3  | loss: 1.81055 | train_balanced_accuracy: 0.16373 | valid_balanced_accuracy: 0.16425 |  0:00:32s
epoch 4  | loss: 1.74031 | train_balanced_accuracy: 0.18597 | valid_balanced_accuracy: 0.18531 |  0:00:39s
epoch 5  | loss: 1.6936  | train_balanced_accuracy: 0.20433 | valid_balanced_accuracy: 0.20367 |  0:00:47s
epoch 6  | loss: 1.65425 | train_balanced_accuracy: 0.221   | valid_balanced_accuracy: 0.22076 |  0:00:55s
epoch 7  | loss: 1.61634 | train_balanced_accuracy: 0.2303  | valid_balanced_accuracy: 0.23017 |  0:01:03s
epoch 8  | loss: 1.57543 | train_balanced_accuracy: 0.22459 | vali

In [18]:
grid_result.sort_values(by='Mean Valid Acc', ascending=False).head(n=10)

Unnamed: 0,mask_type,n_independent,n_shared,Mean Loss,Mean Train Acc,Mean Valid Acc,Max Loss,Max Train Acc,Max Valid Acc
10,entmax,10,2,1.475405,0.288635,0.288635,2.790438,0.41251,0.408902
3,entmax,4,1,1.489784,0.28476,0.28476,2.608547,0.381962,0.380496
11,entmax,10,5,1.543819,0.282147,0.282147,3.055278,0.405126,0.405117
9,entmax,10,1,1.440152,0.282012,0.282012,2.545085,0.388066,0.385473
7,entmax,5,2,1.483659,0.280874,0.280874,2.704068,0.388913,0.380776
15,sparsemax,4,1,1.531047,0.274068,0.274068,2.620872,0.376786,0.37747
6,entmax,5,1,1.501074,0.273802,0.273802,2.752722,0.389606,0.38603
4,entmax,4,2,1.527184,0.266305,0.266305,2.846228,0.358849,0.357932
5,entmax,4,5,1.510017,0.265836,0.265836,2.811839,0.352707,0.353543
16,sparsemax,4,2,1.56802,0.2615,0.2615,2.864651,0.341376,0.341946


In [None]:
grid_result.to_csv('../models/results/grid_performance_search.csv')