In [1]:
# NOTE: absolute, machine-specific path. The relative paths used later in this
# notebook (data, model and save directories) are resolved against it.
%cd /home/Reguformer/notebooks

/home/Reguformer/notebooks


In [2]:
import json
import os
import warnings

import numpy as np
import pandas as pd
import torch

from copy import deepcopy


# Silence every warning category for the rest of the session. This runs before
# the project import below, so warnings emitted while importing it are hidden too.
warnings.filterwarnings("ignore")

from trans_oil_gas import utils_clustering, utils_fix_seeds, utils_model

Use the GPU if one is available; otherwise fall back to the CPU.

In [3]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# `gpu` is the list [0] on a CUDA device and the plain integer 0 on the CPU.
gpu = 0 if 'cuda' not in device.type else [0]
print(device, gpu)

cpu 0


Fix all possible seeds so that the results are reproducible.

In [4]:
# Fix the random seeds through the project helper; the selected `device` is
# passed along to it.
utils_fix_seeds.fix_seeds(device=device)

# Read data

Read the already preprocessed data (a synthetic dataset with the same columns as the original one is used here).

In [5]:
# The path is relative to the working directory set by the `%cd` cell at the top.
df = pd.read_csv('../data/synthetic_well_log_data.csv')

# Load models

In [6]:
# Larger alternative settings, currently disabled:
#   results_len = 5000
#   slice_len = 100
#   input_size = 4
#   n_splits = 5
#   n_times = 5

# Reduced settings that are actually in effect for this run.
results_len, slice_len, input_size = 500, 100, 4
n_splits = n_times = 2

In [7]:
# Directory from which the tuned-hyperparameter JSON files and the trained
# weight checkpoints are loaded in the evaluation loop below.
path_to_models = "./saves_all_models/"

In [8]:
# Keyword arguments passed to every model constructor, whatever the encoder variant.
# NOTE(review): enc_in=4 and n_seq=100 repeat the values of `input_size` and
# `slice_len` from the configuration cell -- confirm they are meant to stay in sync.
shared_fixed_params = dict(
    enc_in=4,
    distil=None,
    device=device,
    output_attention=False,
    n_seq=100,
)

# Extra keyword arguments for the transformer baseline (model_type "None").
fixed_params_tr = dict(attn="full", activation="relu")

# Extra keyword arguments for every Reguformer variant.
fixed_params_reguformer = dict(attn="prob", activation="gelu")

In [9]:
# Output locations: clustering metrics and the summary CSV are written under
# `path_to_save`; `path_to_logs` is only created here.
path_to_save = './saves_emb_quality-clustering/'
# NOTE(review): "emd" looks like a typo for "emb"; the string is left unchanged
# so that any existing directory keeps being used.
path_to_logs = './logs_emd_quality-clustering'

# makedirs(..., exist_ok=True) replaces the racy "check, then mkdir" pattern:
# it is safe to re-run and also creates missing parent directories.
os.makedirs(path_to_save, exist_ok=True)
os.makedirs(path_to_logs, exist_ok=True)

# Obtain embeddings and cluster them

In [10]:
# Summary of clustering quality: "<loss_type>_<model_type>" -> (mean ARI, std ARI).
# The key includes the loss type; keying by model type alone would let the
# "triplet" pass overwrite every "siamese" result.
ari = dict()

# "None" selects the transformer baseline; all other entries are Reguformer variants.
model_types = [
    "None",
    "topQ", "randQ",
    "topK", "randK",
    "topQ_topK", "topQ_randK",
    "randQ_topK", "randQ_randK",
]

for loss_type in ["siamese", "triplet"]:
    for model_type in model_types:
        print(loss_type, model_type)

        # Tuned hyperparameters stored for this (loss, variant) pair.
        params_file = "best_params_{}_reguformer_{}.json".format(loss_type, model_type)
        with open(os.path.join(path_to_models, params_file), "r") as f:
            best_params = json.load(f)

        # Pick the variant-specific fixed arguments and the checkpoint name parts.
        if "None" in model_type:
            fixed_params = deepcopy(fixed_params_tr)
            enc_type = "transformer"
            reg_type = ""
        else:
            fixed_params = deepcopy(fixed_params_reguformer)
            enc_type = "reguformer"
            reg_type = "_" + model_type

        # NOTE(review): encoder_type is "reguformer" for every variant, including
        # the "None" baseline whose checkpoint file is named "transformer";
        # `enc_type` only affects the checkpoint file name. Confirm this is intended.
        if "siamese" in loss_type:
            model = utils_model.SiameseArchitecture(
                encoder_type="reguformer",
                fc_hidden_size=64,
                **shared_fixed_params,
                **fixed_params,
                **best_params,
            ).float()

        elif "triplet" in loss_type:
            model = utils_model.TripletArchitecture(
                encoder_type="reguformer",
                **shared_fixed_params,
                **fixed_params,
                **best_params,
            ).float()

        # Load the trained weights onto the selected device.
        weights_file = "res_{}_{}{}.pth".format(loss_type, enc_type, reg_type)
        model.load_state_dict(
            torch.load(os.path.join(path_to_models, weights_file), map_location=device)
        )

        model.eval()
        metrics = utils_clustering.emb_clustering_wells(
            model,
            df,
            slice_len=slice_len,
            n_splits=n_splits,
            results_len_test=results_len,
            n_times=n_times,
            path_to_saves=path_to_save,
        )

        # Keep the raw metric values on disk and the aggregate in memory.
        np.save(
            os.path.join(path_to_save, "ari_{}_{}.npy".format(loss_type, model_type)),
            metrics,
        )
        ari["{}_{}".format(loss_type, model_type)] = (metrics.mean(), metrics.std())

    # the same for Performer

siamese None


500it [00:06, 79.31it/s] 


ARI = 1.0 ± 0.0


500it [00:04, 103.43it/s]


ARI = 1.0 ± 0.0
siamese topQ


500it [00:10, 47.32it/s]


ARI = 1.0 ± 0.0


500it [00:11, 44.75it/s]


ARI = 1.0 ± 0.0
siamese randQ


500it [00:46, 10.87it/s]


ARI = 1.0 ± 0.0


500it [00:16, 29.95it/s]


ARI = 1.0 ± 0.0
siamese topK


500it [00:07, 69.77it/s] 


ARI = 1.0 ± 0.0


500it [00:05, 94.67it/s] 


ARI = 1.0 ± 0.0
siamese randK


500it [00:07, 62.79it/s]


ARI = 1.0 ± 0.0


500it [00:07, 64.02it/s]


ARI = 1.0 ± 0.0
siamese topQ_topK


500it [00:10, 47.54it/s]


ARI = 1.0 ± 0.0


500it [00:10, 46.20it/s]


ARI = 1.0 ± 0.0
siamese topQ_randK


500it [00:05, 88.61it/s]


ARI = 1.0 ± 0.0


500it [00:05, 88.64it/s] 


ARI = 1.0 ± 0.0
siamese randQ_topK


500it [00:08, 60.13it/s]


ARI = 1.0 ± 0.0


500it [00:08, 58.43it/s]


ARI = 1.0 ± 0.0
siamese randQ_randK


500it [00:14, 35.65it/s]


ARI = 1.0 ± 0.0


500it [00:06, 73.00it/s] 


ARI = 1.0 ± 0.0
triplet None


500it [00:05, 90.15it/s] 


ARI = 1.0 ± 0.0


500it [00:05, 92.79it/s] 


ARI = 1.0 ± 0.0
triplet topQ


500it [00:19, 26.23it/s]


ARI = 1.0 ± 0.0


500it [00:18, 27.38it/s]


ARI = 1.0 ± 0.0
triplet randQ


500it [00:07, 65.90it/s]


ARI = 1.0 ± 0.0


500it [00:07, 68.48it/s]


ARI = 1.0 ± 0.0
triplet topK


500it [00:12, 40.00it/s]


ARI = 1.0 ± 0.0


500it [00:12, 40.71it/s]


ARI = 1.0 ± 0.0
triplet randK


500it [00:10, 45.79it/s]


ARI = 1.0 ± 0.0


500it [00:11, 45.41it/s]


ARI = 1.0 ± 0.0
triplet topQ_topK


500it [00:11, 43.11it/s]


ARI = 1.0 ± 0.0


500it [00:12, 41.02it/s] 


ARI = 1.0 ± 0.0
triplet topQ_randK


500it [00:11, 43.85it/s]


ARI = 1.0 ± 0.0


500it [00:11, 42.00it/s]


ARI = 1.0 ± 0.0
triplet randQ_topK


500it [00:06, 73.40it/s]


ARI = 1.0 ± 0.0


500it [00:06, 76.76it/s]


ARI = 1.0 ± 0.0
triplet randQ_randK


500it [00:10, 48.48it/s]


ARI = 1.0 ± 0.0


500it [00:10, 47.97it/s]


ARI = 1.0 ± 0.0


In [11]:
# One row per key of `ari`, with the (mean, std) pair split into two columns.
ans_df = pd.DataFrame(ari, index=['mean', 'std']).T

# Human-readable "mean $\pm$ std" column, rounded to three decimals.
# The separator is a raw string because "\p" is not a valid escape sequence in
# a normal string literal; the resulting text is unchanged.
ans_df['mean ± std'] = [
    str(round(mean, 3)) + r' $\pm$ ' + str(round(std, 3))
    for mean, std in zip(ans_df['mean'], ans_df['std'])
]

# Only the formatted column is written out; the index holds the `ari` keys.
ans_df[['mean ± std']].to_csv(os.path.join(path_to_save, "clustering.csv"))