In [8]:
# @title Import Libraries
import pandas as pd
import tracemalloc
import time
import psutil
from realtabformer import REaLTabFormer
from transformers import GPT2Config
from src.data_processing import csv_data_split

In [9]:
train_data, test_data, sample_data = csv_data_split("../data/breast-cancer-wisconsin.csv")

In [10]:
# Initialize an empty DataFrame to store results
results = pd.DataFrame(columns=["Model", "Time (s)", "Memory Current (MB)", "Memory Peak (MB)"])

In [36]:
results["System Memory Usage (MB)"] = 0
results

Unnamed: 0,Model,Time (s),Memory Current (MB),Memory Peak (MB),Layers,Heads,Embedding Dim,DS Size,System Memory Usage (MB)
0,small_model,566.675813,42.637881,47.304494,4,8,512,273,0
1,regular_model,1790.28255,1.735398,2.29164,6,12,768,273,0
2,large_model,4983.763235,1.624654,2.558731,12,12,768,273,0


In [42]:
new_results = pd.DataFrame(columns=["Model", "Time (s)", "Memory Current (MB)", "Memory Peak (MB)", "System Memory Usage (MB)"])
new_results

Unnamed: 0,Model,Time (s),Memory Current (MB),Memory Peak (MB),System Memory Usage (MB)


In [43]:
def fit_and_track_memory(model, data, model_name):
    tracemalloc.start()
    start_time = time.time()

    process = psutil.Process()
    initial_memory = process.memory_info().rss

    model.fit(data)

    end_time = time.time()
    elapsed_time = end_time - start_time

    final_memory = process.memory_info().rss
    current, peak = tracemalloc.get_traced_memory()

    print(f"Model: {model_name}")
    print(f"System Memory Usage: {((final_memory - initial_memory) / 1024**2):.2f} MB")
    print(f"Python Current Memory: {current / 1024**2:.2f} MB")
    print(f"Python Peak Memory: {peak / 1024**2:.2f} MB")

    tracemalloc.stop()

    new_results.loc[len(results)] = [
        model_name,
        elapsed_time,
        current / 1024**2,
        peak / 1024**2,
        (final_memory - initial_memory) / 1024**2
    ]

    model.save(f"../models/{model_name}")

In [43]:
def fit_and_track(model, data, model_name):
    start_time = time.time()
    model.fit(data)
    end_time = time.time()
    elapsed_time = end_time - start_time


    new_results.loc[len(results)] = [
        model_name,
        elapsed_time
    ]

    model.save(f"../models/{model_name}")

In [44]:
config_small = GPT2Config(
    n_embd=512,
    n_layer=4,
    n_head=8
)

rtf_model_small = REaLTabFormer(
    model_type="tabular",
    tabular_config=config_small,
    batch_size=16,
    epochs=75,
    mask_rate=0.1,
)

rtf_model_reg = REaLTabFormer(
    model_type="tabular",
    batch_size=16,
    epochs=75,
    mask_rate=0.1,
)

# rtf_model_large = REaLTabFormer(
#     model_type="tabular",
#     tabular_config=config_large,
#     batch_size=16,
#     epochs=75,
#     mask_rate=0.3,
# )

# Fit models and track performance
fit_and_track_memory(rtf_model_small, train_data, "small_model_full")
fit_and_track_memory(rtf_model_reg, train_data, "regular_model_full")
# fit_and_track_memory(rtf_model_large, sample_df, "large_model")

# Display the results
print(results)

Computing the sensitivity threshold...
Using parallel computation!!!




Bootstrap round:   0%|          | 0/500 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    500.000000
mean       0.004391
std        0.013035
min       -0.021667
25%       -0.005000
50%        0.002500
75%        0.011806
max        0.057778
dtype: float64
Sensitivity threshold: 0.02722222222222222 qt_max: 0.05


Map:   0%|          | 0/546 [00:00<?, ? examples/s]

Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.01725925925925926,                             val_sensitivities: [-0.025, -0.014444444444444446, -0.025, -0.02388888888888889, -0.019444444444444445, -0.01, -0.019444444444444445, -0.001666666666666667, -0.015, -0.012777777777777779, -0.007222222222222222, -0.025, -0.02277777777777778, -0.015555555555555557, -0.021666666666666667]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 10,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.01488888888888889,                             val_sensitivities: [-0.025, -0.009444444444444445, -0.025, -0.02388888888888889, -0.020555555555555556, -0.017222222222222222, -0.01888888888888889, -0.008333333333333333, -0.01611111111111111, -0.016666666666666666, 0.006666666666666667, -0.017777777777777778, -0.015, 0.0005555555555555548, -0.01666666666666667]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 15,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.015370370370370373,                             val_sensitivities: [-0.025, -0.015000000000000001, -0.02277777777777778, -0.02388888888888889, -0.021666666666666667, -0.01888888888888889, -0.023333333333333334, -0.017222222222222222, -0.02, -0.018333333333333333, 0.011111111111111112, -0.01888888888888889, -0.015, 0.008888888888888889, -0.010555555555555556]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 20,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.009888888888888888,                             val_sensitivities: [-0.025, -0.009444444444444445, -0.019444444444444445, -0.020555555555555556, -0.01666666666666667, -0.00888888888888889, -0.01888888888888889, -0.0011111111111111113, -0.013333333333333334, -0.015555555555555555, 0.02666666666666667, -0.011111111111111112, -0.015, 0.012222222222222223, -0.012222222222222223]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 25,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.012703703703703705,                             val_sensitivities: [-0.022222222222222223, -0.0011111111111111105, -0.02, -0.016666666666666666, -0.006666666666666666, -0.012222222222222221, -0.02, -0.007222222222222222, -0.015000000000000001, -0.020555555555555556, 0.008333333333333333, -0.017777777777777778, -0.01611111111111111, -0.01, -0.013333333333333334]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 30,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.016185185185185188,                             val_sensitivities: [-0.02388888888888889, -0.012222222222222223, -0.022222222222222223, -0.02388888888888889, -0.013333333333333334, -0.02277777777777778, -0.025, -0.014444444444444446, -0.021111111111111112, -0.025, 0.006666666666666667, -0.01388888888888889, -0.017222222222222222, 0.002222222222222221, -0.01666666666666667]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 35,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.011888888888888892,                             val_sensitivities: [-0.017222222222222222, -0.008888888888888889, -0.013888888888888888, -0.015, -0.010555555555555556, -0.009444444444444445, -0.021111111111111112, -0.0011111111111111118, -0.012222222222222223, -0.011666666666666667, 0.004444444444444444, -0.019444444444444445, -0.02277777777777778, -0.001666666666666667, -0.01777777777777778]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 40,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.013592592592592594,                             val_sensitivities: [-0.025, -0.006666666666666666, -0.02, -0.019444444444444445, -0.013333333333333334, -0.011111111111111112, -0.021666666666666667, 0.0011111111111111113, -0.01611111111111111, -0.015555555555555557, 0.006666666666666667, -0.021666666666666667, -0.021666666666666667, 0.0011111111111111113, -0.020555555555555556]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 45,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.012740740740740743,                             val_sensitivities: [-0.02277777777777778, -0.012777777777777779, -0.02, -0.017777777777777778, -0.015555555555555555, -0.01611111111111111, -0.01888888888888889, -0.011111111111111112, -0.015, -0.018333333333333333, 0.012222222222222223, -0.015555555555555557, -0.012777777777777777, 0.0038888888888888888, -0.010555555555555556]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 50,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.013888888888888888,                             val_sensitivities: [-0.016666666666666666, -0.001111111111111111, -0.017777777777777778, -0.005555555555555555, -0.004999999999999999, -0.020555555555555556, -0.023333333333333334, -0.009444444444444445, -0.017222222222222222, -0.02, -0.0022222222222222235, -0.021666666666666667, -0.020555555555555556, -0.0077777777777777776, -0.019444444444444445]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 55,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.011703703703703706,                             val_sensitivities: [-0.02388888888888889, -0.007222222222222223, -0.015555555555555557, -0.018333333333333333, -0.0061111111111111106, -0.006111111111111111, -0.019444444444444445, 0.005555555555555556, -0.012777777777777779, -0.009444444444444445, 0.0016666666666666661, -0.019444444444444445, -0.020555555555555556, -0.005555555555555555, -0.018333333333333333]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 60,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.006925925925925927,                             val_sensitivities: [-0.021666666666666667, -0.0038888888888888888, -0.017222222222222222, -0.018333333333333333, -0.011666666666666667, -0.0022222222222222227, -0.015555555555555557, 0.004999999999999999, -0.004444444444444445, -0.008888888888888889, 0.021111111111111112, -0.008888888888888889, -0.012777777777777777, 0.006666666666666666, -0.011111111111111112]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 65,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.0064444444444444445,                             val_sensitivities: [-0.017777777777777778, 0.006666666666666666, -0.013888888888888888, -0.012777777777777777, -0.0038888888888888888, -0.011666666666666665, -0.02, -0.0027777777777777783, -0.008333333333333333, -0.015555555555555557, 0.025555555555555554, -0.011666666666666667, -0.013888888888888888, 0.01388888888888889, -0.010555555555555556]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 70,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.00974074074074074,                             val_sensitivities: [-0.02277777777777778, 0.00111111111111111, -0.012777777777777779, -0.012222222222222223, -0.008333333333333333, -0.012777777777777779, -0.02277777777777778, -0.007222222222222222, -0.011111111111111112, -0.012777777777777779, 0.011111111111111112, -0.012222222222222223, -0.011666666666666667, -0.0011111111111111113, -0.010555555555555556]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 75,                     sensitivity_threshold: 0.02722222222222222,                         val_sensitivity: -0.010703703703703705,                             val_sensitivities: [-0.02388888888888889, -0.010555555555555556, -0.019444444444444445, -0.019444444444444445, -0.01666666666666667, -0.018333333333333333, -0.020555555555555556, -0.010555555555555556, -0.013888888888888888, -0.017777777777777778, 0.02722222222222222, -0.01, -0.003333333333333333, 0.01, -0.013333333333333334]
Model: small_model_full
System Memory Usage: -104.24 MB
Python Current Memory: 23.20 MB
Python Peak Memory: 26.57 MB
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model
Computing the sensitivity threshold...
Using parallel computation!!!




Bootstrap round:   0%|          | 0/500 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    500.000000
mean       0.004157
std        0.012505
min       -0.020556
25%       -0.005000
50%        0.002778
75%        0.011806
max        0.047222
dtype: float64
Sensitivity threshold: 0.028333333333333332 qt_max: 0.05


Map:   0%|          | 0/546 [00:00<?, ? examples/s]

Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: -0.013703703703703708,                             val_sensitivities: [-0.02277777777777778, -0.0038888888888888888, -0.01611111111111111, -0.01611111111111111, -0.003333333333333334, -0.021666666666666667, -0.02388888888888889, -0.01388888888888889, -0.019444444444444445, -0.02277777777777778, 0.012777777777777779, -0.017222222222222222, -0.021666666666666667, -0.0016666666666666679, -0.01388888888888889]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 10,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: -0.013555555555555559,                             val_sensitivities: [-0.02388888888888889, -0.0033333333333333322, -0.019444444444444445, -0.017222222222222222, -0.005000000000000001, -0.019444444444444445, -0.021666666666666667, -0.010555555555555556, -0.017777777777777778, -0.022222222222222223, 0.016111111111111114, -0.020555555555555556, -0.021666666666666667, 0.0027777777777777775, -0.019444444444444445]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 15,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: -0.010185185185185184,                             val_sensitivities: [-0.022222222222222223, -0.0033333333333333327, -0.017222222222222222, -0.01888888888888889, -0.012222222222222223, -0.016666666666666666, -0.021111111111111112, -0.007222222222222222, -0.018333333333333333, -0.01611111111111111, 0.027777777777777776, -0.020555555555555556, -0.008333333333333333, 0.012222222222222223, -0.010555555555555556]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 20,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: -0.011333333333333332,                             val_sensitivities: [-0.020555555555555556, 0.0016666666666666657, -0.018333333333333333, -0.009444444444444445, -0.012222222222222223, -0.014444444444444444, -0.02277777777777778, -0.010555555555555556, -0.017222222222222222, -0.020555555555555556, 0.012777777777777779, -0.016666666666666666, -0.012777777777777779, 0.0077777777777777776, -0.01666666666666667]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 25,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: -0.005740740740740741,                             val_sensitivities: [-0.017222222222222222, -0.0011111111111111105, -0.011666666666666665, -0.013333333333333332, -0.008333333333333333, -0.013888888888888888, -0.017222222222222222, -0.008333333333333333, -0.013333333333333334, -0.01611111111111111, 0.042777777777777776, -0.01, -0.009444444444444445, 0.015, -0.0038888888888888888]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 30,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: -0.00025925925925925937,                             val_sensitivities: [-0.015555555555555555, 0.0016666666666666653, -0.013888888888888888, -0.010555555555555556, 0.007222222222222222, -0.004444444444444445, -0.021666666666666667, 0.007222222222222222, -0.010555555555555556, -0.012777777777777779, 0.03222222222222222, 0.0016666666666666653, 0.0077777777777777776, 0.026111111111111113, 0.001666666666666666]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 35,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: 0.007962962962962965,                             val_sensitivities: [-0.015, 0.0027777777777777766, -0.013333333333333332, -0.006666666666666667, -0.000555555555555555, 0.009444444444444445, -0.011111111111111112, 0.01777777777777778, 0.0005555555555555544, -0.004444444444444444, 0.06277777777777778, 0.012222222222222223, 0.012777777777777779, 0.043333333333333335, 0.008888888888888889]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 40,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: 0.010925925925925926,                             val_sensitivities: [-0.004999999999999999, 0.009999999999999998, 0.0011111111111111122, 0.006111111111111112, 0.012777777777777777, 0.030555555555555555, -0.0061111111111111106, 0.03166666666666667, 0.01888888888888889, 0.008333333333333333, 0.04166666666666667, -0.0027777777777777783, -0.005555555555555556, 0.029444444444444447, -0.007222222222222222]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 45,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: 0.00848148148148148,                             val_sensitivities: [-0.020555555555555556, 0.01388888888888889, -0.0044444444444444444, -0.010555555555555556, 0.005555555555555555, 0.015, -0.01611111111111111, 0.028333333333333332, 0.011111111111111112, 0.007777777777777778, 0.03944444444444444, 0.0077777777777777776, 0.0077777777777777776, 0.04, 0.0022222222222222214]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 50,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: 0.008925925925925926,                             val_sensitivities: [-0.013333333333333332, 0.021111111111111112, -0.008333333333333333, -0.0005555555555555554, 0.012777777777777779, 0.007222222222222223, -0.009444444444444445, 0.017222222222222222, 0.008888888888888887, 0.0033333333333333322, 0.06, -0.0077777777777777776, 0.011111111111111112, 0.028333333333333332, 0.003333333333333334]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 55,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: 0.02511111111111111,                             val_sensitivities: [-0.01611111111111111, 0.03666666666666667, 0.0, 0.006666666666666666, 0.014444444444444446, 0.04111111111111111, 0.0011111111111111115, 0.05, 0.02888888888888889, 0.025, 0.06722222222222222, 0.016666666666666666, 0.02722222222222222, 0.05, 0.027777777777777776]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Saving not-best model...
Critic round: 60,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: 0.02907407407407407,                             val_sensitivities: [0.004444444444444444, 0.052777777777777785, 0.018333333333333333, 0.019444444444444445, 0.03833333333333333, 0.04, 0.009444444444444445, 0.058333333333333334, 0.02888888888888889, 0.023333333333333334, 0.05666666666666666, 0.013333333333333334, 0.013333333333333332, 0.042777777777777776, 0.01666666666666667]


Step,Training Loss


  0%|          | 0/270 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 384 samples generated. Sampling efficiency is: 100.0000%
Critic round: 65,                     sensitivity_threshold: 0.028333333333333332,                         val_sensitivity: 0.04222222222222223,                             val_sensitivities: [0.004999999999999999, 0.02611111111111111, 0.02277777777777778, 0.019999999999999997, 0.034999999999999996, 0.04166666666666667, 0.01388888888888889, 0.05333333333333333, 0.03277777777777778, 0.018333333333333333, 0.12833333333333335, 0.03944444444444445, 0.05444444444444445, 0.09888888888888889, 0.043333333333333335]
Stopping training, no improvement in critic...
Model: regular_model_full
System Memory Usage: -328.23 MB
Python Current Memory: 1.09 MB
Python Peak Memory: 4.51 MB
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model
           Model     Time (s)  Memory Current (MB)  Mem

In [57]:
new_results.loc[2] = ['small_model_full',1101.653454,23.20,26.57,-104.24]
new_results

Unnamed: 0,Model,Time (s),Memory Current (MB),Memory Peak (MB),System Memory Usage (MB)
3,regular_model_full,2710.346546,1.090179,4.505012,-328.234375
2,small_model_full,1101.653454,23.2,26.57,-104.24


In [None]:
2710.346546

In [16]:
layers = [4,6,12]
heads = [8,12,12]
n_embd = [512,768,768]
train_length = [len(sample_df),len(sample_df),len(sample_df)]

results['Layers'] = layers
results['Heads'] = heads
results['Embedding Dim'] = n_embd
results['DS Size'] = train_length


good_results = results[['Model','Time (s)','Layers','Heads','Embedding Dim','DS Size']]
good_results

Unnamed: 0,Model,Time (s),Layers,Heads,Embedding Dim,DS Size
0,small_model,566.675813,4,8,512,273
1,regular_model,1790.28255,6,12,768,273
2,large_model,4983.763235,12,12,768,273


In [18]:
good_results.to_csv('logs.csv')