In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from datasets import Dataset

In [3]:
model_path = "/model-weights/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    device_map="auto",
    use_safetensors=True,
    trust_remote_code=True
)

In [5]:
df = pd.read_csv("/h/emzed/data/qa_discharge_masked.csv", nrows=5)

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [6]:
class QuestionGenDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self._data = data
        self._tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self._data)
    
    def __getitem__(self, idx):
        item = self._data[idx]
        
        # Format the input-output pair
        prompt = f"### Text:\n{item['masked_text']}\n### Question:\n"
        target = f"{item['q']}</s>"
        
        # First tokenize prompt to check its length
        prompt_tokens = self._tokenizer(prompt, return_tensors='pt')
        prompt_length = prompt_tokens['input_ids'].shape[1]
        
        # Calculate remaining space for target
        remaining_length = self.max_length - prompt_length
        
        # Tokenize target separately and truncate if needed
        target_tokens = self._tokenizer(target, return_tensors='pt')
        
        # Combine truncated sequences
        if remaining_length > 0:
            target_ids = target_tokens['input_ids'][0, :remaining_length]
            input_ids = torch.cat([prompt_tokens['input_ids'][0], target_ids])
            attention_mask = torch.ones_like(input_ids)
        else:
            # If prompt is too long, truncate it
            input_ids = prompt_tokens['input_ids'][0, :self.max_length]
            attention_mask = torch.ones_like(input_ids)
        
        # Pad if necessary
        if len(input_ids) < self.max_length:
            padding_length = self.max_length - len(input_ids)
            input_ids = torch.cat([input_ids, torch.full((padding_length,), self._tokenizer.pad_token_id)])
            attention_mask = torch.cat([attention_mask, torch.zeros(padding_length)])
        
        # Create labels
        labels = input_ids.clone()
        labels[:prompt_length] = -100
        labels[labels == self._tokenizer.pad_token_id] = -100
        
        return {
            'input_ids': input_ids.unsqueeze(0),
            'attention_mask': attention_mask.unsqueeze(0),
            'labels': labels.unsqueeze(0)
        }

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    remove_unused_columns=False,
    report_to="none",
    per_device_train_batch_size=1,      # Reduced from 32 due to model size
    gradient_accumulation_steps=8,       # To maintain effective batch size of 32
    gradient_checkpointing=True,         # Enable to save memory
    fp16=True,                          # Keep this for memory efficiency
    optim='adamw_torch',                # Memory efficient optimizer
    max_grad_norm=1.0,                  # Help prevent gradient explosion
    
    # Other parameters can remain the same
    learning_rate=2e-5,
    num_train_epochs=1,
    warmup_steps=1,
    weight_decay=0.01,
    logging_steps=1,
    evaluation_strategy="no",
    eval_steps=1,
    save_steps=1,
)



In [8]:
# 1. Initialize tokenizer and model
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

# 2. Prepare datasets
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = QuestionGenDataset(train_test_split['train'], tokenizer, max_length=64)
val_dataset = QuestionGenDataset(train_test_split['test'], tokenizer, max_length=64)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 22.16 GiB of which 47.88 MiB is free. Including non-PyTorch memory, this process has 22.11 GiB memory in use. Of the allocated memory 21.83 GiB is allocated by PyTorch, and 79.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [17]:
print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
print(f"Allocated GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Cached GPU memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

Total GPU memory: 22.16 GB
Allocated GPU memory: 20.82 GB
Cached GPU memory: 20.89 GB
