In [16]:
# Run this cell to configure the Google Colab runtime

import os

try:
    # Importing google.colab only succeeds inside a Colab runtime; the ImportError
    # raised elsewhere sends execution to the local branch below.
    from google.colab import drive

    drive.mount('/content/drive')

    # Lazily walk the tree below the current directory and yield every directory
    # that contains the project marker file.
    marker_dirs = (
        dirpath
        for dirpath, _dirnames, filenames in os.walk(".")
        if "dsait4090_project_location" in filenames
    )
    # Use the first match as the working directory; stay in "." if none is found.
    project_root = next(marker_dirs, ".")
    os.chdir(project_root)
    print(f'Google Colab: {os.getcwd()}')
except ImportError:
    print(f'Local: {os.getcwd()}')

Local: /Users/szymong/tud/nlp/fact-checking/notebooks


In [32]:
%load_ext autoreload
%autoreload

from src.early_stopping import EarlyStopping
from src.common import get_device, read_json, DATA_PATH, save_model, MODELS_PATH
from src.quantemp_processor import QuantempProcessor, QT_VERACITY_LABELS
from src.models.gpt2 import Gpt2Tokenizer, Gpt2Classifier
from src.classification_training import ClassificationTraining

import time
import torch
import logging
import os
from torch import nn

# Optional MPS memory setting, intentionally left disabled:
# os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Debugging aid: per the CUDA documentation this makes kernel launches synchronous.
# It must be set before the device is queried below.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

# Configure root logging at ERROR level.
logging.basicConfig(level=logging.ERROR)

# Device selection is delegated to the project helper.
device = get_device()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
MPS: 1


In [33]:
# Read the raw claim files for both splits (train first, then val).
train_data, val_data = (
    read_json(f"{DATA_PATH}/raw_data/{split}_claims_quantemp.json")
    for split in ("train", "val")
)

In [27]:
%autoreload

gpt2_tokenizer = Gpt2Tokenizer()
data_processor = QuantempProcessor(gpt2_tokenizer, claim_decomposition=False)

train_dataset = data_processor.transform(train_data)
val_dataset = data_processor.transform(val_data)

  0%|          | 0/9935 [00:00<?, ?it/s]

  0%|          | 0/3084 [00:00<?, ?it/s]

In [34]:
model = Gpt2Classifier("gpt2", len(QT_VERACITY_LABELS), mlp_dim=1024, dropout=0.1)

# Smoke test: push the first 16 examples through the freshly built model before it is
# moved to the target device. The first two dataset tensors are the model inputs and the
# third is taken as the labels (presumably input ids / attention mask / targets - confirm
# against QuantempProcessor).
# The module is called directly instead of via .forward() so that the regular nn.Module
# call path (including any registered hooks) is used, and autograd is disabled because
# the result is only inspected, never back-propagated.
with torch.no_grad():
    output = model(train_dataset.tensors[0][:16], train_dataset.tensors[1][:16])
labels = train_dataset.tensors[2][:16]

model = model.to(device)

In [None]:
# Optimizer and loss are built up front so the training configuration reads as a flat
# list of already-constructed components.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_function = nn.CrossEntropyLoss()

training = ClassificationTraining(
    model_name="gpt2_basic",
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    model=model,
    optimizer=optimizer,
    loss_function=loss_function,
    batch_size=16,
    device=device,
)

In [None]:
# Manual epoch loop with early stopping and per-epoch checkpointing.
#
# NOTE(review): this cell cannot run from a fresh kernel as written. The names `epochs`,
# `avg_val_loss`, `avg_train_loss`, `avg_val_accuracy`, `training_time`, `validation_time`
# and `format_time` are used below but are not defined in any cell visible in this
# notebook. The recorded output shows per-batch progress lines that no statement in this
# cell prints, so the batch-level training/validation steps appear to have been removed.
import random
import os
import numpy as np

# NOTE(review): `random_state` is assigned but never used in this cell; `random` and
# `numpy` are imported but not used either - presumably RNG seeding was intended.
random_state = 42

# One dict of metrics is appended per completed epoch.
training_stats = []

# Wall-clock start of the whole run, used for the final timing message.
total_t0 = time.time()
early_stopping = EarlyStopping(patience=3, verbose=True)

# NOTE(review): `epochs` is undefined in the visible notebook.
for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Per-epoch timer and accumulators; nothing in this cell reads or updates them.
    t0 = time.time()

    total_train_accuracy = 0
    total_train_loss = 0

    # NOTE(review): `avg_val_loss` is not computed anywhere in this cell, so this
    # early-stopping check has no validation result to act on.
    early_stopping(avg_val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    # NOTE(review): the directory name mentions finqa/roberta while this notebook trains
    # a GPT-2 classifier - looks like a leftover from another experiment; confirm.
    output_dir = 'finqa_roberta_claimdecomp_continued/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Checkpoint after every epoch; the same files are overwritten each time.
    print("Saving model to %s" % output_dir)
    gpt2_tokenizer.save_pretrained(output_dir)
    torch.save(model.state_dict(), os.path.join(output_dir, 'model_weights'))

    # NOTE(review): every metric below except 'epoch' refers to an undefined name.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

# NOTE(review): `format_time` is not defined or imported in the visible notebook.
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))


Training...
  Batch    40  of    621.    Elapsed: 1:14:24.
  Batch    80  of    621.    Elapsed: 2:31:59.
  Batch   120  of    621.    Elapsed: 3:55:02.
  Batch   160  of    621.    Elapsed: 5:29:24.
  Batch   200  of    621.    Elapsed: 7:01:18.


KeyboardInterrupt: 

In [None]:
# Persist the tokenizer. The tokenizer object in this notebook is `gpt2_tokenizer`;
# the previously used name `tokenizer` is not defined in any visible cell.
# NOTE(review): this path is relative to the current working directory, while the
# weights below go to "../models/" - confirm that both locations are intended.
gpt2_tokenizer.save_pretrained("models/gpt2-ft-tokenizer")

# torch.save does not create missing parent directories, so make sure the target exists.
os.makedirs("../models/", exist_ok=True)
torch.save(model.state_dict(), os.path.join("../models/", 'model_weights'))

# NOTE(review): save_model is called with only the models path; its signature is not
# visible here - verify that it does not also expect the model (or a model name).
save_model(f'{MODELS_PATH}')