In [62]:
# Run configuration for this fine-tuning notebook.
# `configuration` is the dataset/feature tag; it is reused in the run name below
# and passed to the data loader and dataset constructor in later cells.
configuration = "fi-100_ft-15000000_nb-30_ht-100000_di-uni_mx-1500_lg-no"
dataset_path = "datasets"
train_year = 2024
output_dir = "trained_models"
# Take the year from `train_year` instead of repeating the literal, so the run
# name cannot silently disagree with the data that was actually loaded.
fine_tune_run_name = f"fine_tune_{train_year}_full_{configuration}"
num_fine_tune_epochs = 1
fine_tune_per_device_batch_size = 128
fine_tune_per_device_eval_batch_size = 256

In [63]:
from data_loader import *
# Load the two training corpora (benign and "rt") for the configured year.
# NOTE(review): the loader's return order is taken from this unpacking only.
(train_benign_corpus, train_rt_corpus) = load_train_data_2024(
    train_year,
    configuration,
    dataset_path,
)

Loading data from datasets/yr-2024_fi-100_ft-15000000_nb-30_ht-100000_di-uni_mx-1500_lg-no.zip


  0%|          | 0/8859 [00:00<?, ?it/s]

100%|██████████| 8859/8859 [00:04<00:00, 2032.86it/s]


In [64]:
from bert_combined_dataset import *

# Build the fine-tuning dataset from both corpora.
# The "rt" corpus is passed first and the benign corpus second; presumably this
# order determines the 0/1 indices shown in the log output — TODO confirm.
train_data = FlowPairDataset(
    configuration,
    train_rt_corpus,
    train_benign_corpus,
    seq_length=128,
    deduplicate=True,
    coalesce=True,
    shuffle=True,
    balanced=False,
)

Creating dataset for  fi-100_ft-15000000_nb-30_ht-100000_di-uni_mx-1500_lg-no
Normalizing corpus 0
Normalizing corpus 1
Coalescing


100%|██████████| 119/119 [00:00<00:00, 179.74it/s]
100%|██████████| 33370/33370 [00:50<00:00, 655.13it/s] 


Creating sentence pairs


100%|██████████| 119/119 [00:00<00:00, 165.72it/s]
100%|██████████| 33370/33370 [00:01<00:00, 17692.00it/s]


Training pairs 0: 6172
Training pairs 1: 6741035
Deduplicating
Training pairs after deduplication (0): 2087
Training pairs after deduplication (1): 1106090
Extracting tokens


0it [00:00, ?it/s]
100%|██████████| 2087/2087 [00:00<00:00, 78682.55it/s]
100%|██████████| 1106090/1106090 [00:10<00:00, 104568.22it/s]


In [65]:
from fine_tune_bert import *

# Name of the pre-training run whose checkpoint is fine-tuned here.
pre_train_run_name = "pre_train_all_years_ded_coa_med_seq-128_old_fi-100_ft-15000000_nb-30_ht-100000_di-uni_mx-1500_lg-no"

# Load the pre-trained checkpoint with a 2-label sequence-classification head.
pretrained_model_path = os.path.join(output_dir, pre_train_run_name, "pretrained_model")
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_path, num_labels=2)

# Create the output directory (and any missing parents). An already-existing
# directory is fine; real failures such as permission errors are no longer
# silently swallowed.
finetune_model_path = os.path.join(output_dir, fine_tune_run_name)
os.makedirs(finetune_model_path, exist_ok=True)

training_args = TrainingArguments(
    output_dir=finetune_model_path,  # where checkpoints and outputs are written
    overwrite_output_dir=True,
    # Taken from the config cell. Because max_steps below is positive, it takes
    # precedence and training stops after max_steps optimizer steps.
    num_train_epochs=num_fine_tune_epochs,
    per_device_train_batch_size=fine_tune_per_device_batch_size,  # as high as GPU memory allows
    # No eval dataset is passed to the Trainer below; this only wires the
    # configured value through so it is recorded consistently.
    per_device_eval_batch_size=fine_tune_per_device_eval_batch_size,
    gradient_accumulation_steps=32,  # accumulate gradients over 32 batches per weight update
    logging_steps=1,  # log the training loss at every optimizer step
    max_steps=20,  # hard cap on optimizer steps for this short run
    run_name=fine_tune_run_name,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at trained_models/pre_train_all_years_ded_coa_med_seq-128_old_fi-100_ft-15000000_nb-30_ht-100000_di-uni_mx-1500_lg-no/pretrained_model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[34m[1mwandb[0m: Currently logged in as: [33msomm-s[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,0.7571
2,0.4234
3,0.2346
4,0.1334
5,0.0822
6,0.0521
7,0.0431
8,0.0278
9,0.0238
10,0.0215


TrainOutput(global_step=20, training_loss=0.0977258310187608, metrics={'train_runtime': 137.9366, 'train_samples_per_second': 593.896, 'train_steps_per_second': 0.145, 'total_flos': 1603300680007680.0, 'train_loss': 0.0977258310187608, 'epoch': 0.07})

In [66]:
# Save the fine-tuned model; the directory name carries the actual step cap
# used for training rather than a hard-coded number.
save_path = os.path.join(finetune_model_path, f'finetuned_model_{training_args.max_steps}_steps')
trainer.save_model(save_path)
print(f"Model saved to {save_path}")

# Record the hyperparameters that were really in effect by reading them back
# from `training_args`, instead of repeating literals that can drift from (or
# never matched) the values passed to the Trainer.
hyperparameters = {
    'config': configuration,
    'run_name': fine_tune_run_name,
    'output_dir': finetune_model_path,
    'num_epochs': training_args.num_train_epochs,
    'per_device_train_batch_size': training_args.per_device_train_batch_size,
    'per_device_eval_batch_size': training_args.per_device_eval_batch_size,
    'logging_steps': training_args.logging_steps,
    'save_steps': training_args.save_steps,
    'pretrain_run_name': pre_train_run_name,
    # Additional keys: these two settings determined how much training happened.
    'max_steps': training_args.max_steps,
    'gradient_accumulation_steps': training_args.gradient_accumulation_steps,
}

with open(os.path.join(finetune_model_path, 'hyperparameters.json'), 'w') as f:
    json.dump(hyperparameters, f)

Model saved to trained_models/fine_tune_2024_full_fi-100_ft-15000000_nb-30_ht-100000_di-uni_mx-1500_lg-no/finetuned_model_20_steps
