In [1]:
# Enable IPython's autoreload extension so edits to the local project modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import torch

from CompLex import load_dataset, preprocess_data, tokenize_complex_dataset, create_base_model, apply_lora, create_trainer_complex
from train import train_model
from schema import TrainingConfig, TrainingTask

In [None]:
data = load_dataset()
data

In [None]:
data = preprocess_data(data)
data

In [None]:
# Tokenize the preprocessed dataset. The helper returns both the tokenized
# dataset and the tokenizer; the tokenizer is reused below for inference
# and saved alongside the model.
tokenized_dataset, tokenizer = tokenize_complex_dataset(data)
tokenized_dataset

In [6]:
# Single place for every tunable of this run; `config` is passed to both
# apply_lora and create_trainer_complex further down.
config = TrainingConfig(
    # Task / input settings
    task=TrainingTask.CompLexV1,
    max_input_length=128,
    # Adapter settings (presumably consumed by apply_lora -- verify)
    target_modules=["query", "key", "value"],
    rank=16,
    alpha=16,
    lora_dropout=0.1,
    # Optimisation settings (presumably consumed by the trainer -- verify)
    learning_rate=2e-4,
    batch_size=8,
    num_epochs=3,
)

In [None]:
# Build the base model and wrap it with the LoRA settings from `config`
# in one step, so `model` is only ever bound to the adapted model.
model = apply_lora(create_base_model(), config)

In [8]:
# Assemble the trainer from the adapted model, the run configuration and the
# tokenized "train" / "test" splits.
# NOTE(review): the "test" split is passed as eval_dataset. If the same split
# is later used for final reporting, those numbers may be optimistic --
# confirm whether a separate validation split is available.
trainer = create_trainer_complex(
    model=model,
    config=config,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    output_dir="./outputs/first"
)

In [None]:
# Run training. train_model returns two values; judging by the names they are
# presumably the elapsed training time and a VRAM usage figure -- verify
# units against train.py.
train_time, vram = train_model(trainer)

In [13]:
def predict_complexity(model, tokenizer, sentence: str, word: str, max_length: int = 128) -> float:
    """Predict the complexity score of ``word`` within ``sentence``.

    The sentence and the target word are encoded together as a text pair,
    moved to the model's device, and passed through ``model`` with gradient
    tracking disabled. The single resulting logit is returned.

    Args:
        model: Model exposing ``device``, ``training``, ``eval()`` and
            ``train()``; calling it must return an object whose ``logits``
            tensor holds exactly one element.
        tokenizer: Callable accepting ``(sentence, word, **kwargs)`` and
            returning a mapping of tensors.
        sentence: Full sentence providing the context.
        word: Target word whose complexity is scored.
        max_length: Length the encoded pair is padded/truncated to.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        The model's single output logit as a Python float.
    """
    inputs = tokenizer(
        sentence,
        word,
        return_tensors="pt",
        padding="max_length",
        # Only the sentence may be truncated; the target word is kept intact.
        truncation="only_first",
        max_length=max_length,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # A model coming straight out of training can still be in train mode, in
    # which case dropout stays active and repeated calls give different
    # scores. Force eval mode for the forward pass, then restore whatever
    # mode the caller had so an ongoing training loop is not disturbed.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(**inputs)
    finally:
        model.train(was_training)

    return output.logits.squeeze().item()

In [None]:
from visualization import visualize_complexity

sentence = "The protagonist exhibited extraordinary perspicacity"

# Score each space-separated token of the sentence in the context of the
# whole sentence; a token that appears more than once maps to a single entry.
scores = {
    token: predict_complexity(
        model=model,
        tokenizer=tokenizer,
        sentence=sentence,
        word=token,
    )
    for token in sentence.split(" ")
}

visualize_complexity(sentence, scores)

In [None]:
# Persist the trained model and its tokenizer side by side under
# ~/models/bert so both can be reloaded from one directory.
# NOTE(review): if `model` is an adapter-wrapped model, save_model may write
# only the adapter weights rather than a full checkpoint -- confirm before
# relying on this directory as a standalone model.
MODEL_PATH = Path.home() / "models/bert" 

trainer.save_model(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)