In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
import numpy as np
import os

In [3]:
model_path = "/model-weights/Llama-3.2-1B"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
    trust_remote_code=True
)

In [20]:
# Load the masked data
df = pd.read_csv("/h/emzed/data/qa_discharge_masked.csv")

# Find the longest text
longest_text = df['masked_text'].str.len().idxmax()
longest_text_content = df.loc[longest_text, 'masked_text']

# Tokenize the longest text
tokenized_text = tokenizer(longest_text_content, return_tensors="pt")

print(f"Length of longest text: {len(longest_text_content)}")
print(f"Number of tokens: {len(tokenized_text['input_ids'][0])}")
print("\nFirst 100 tokens:")
print(tokenized_text['input_ids'][0][:100])


Length of longest text: 42591
Number of tokens: 11125

First 100 tokens:
tensor([128000,    720,    678,     25,    220,   7588,   1078,   8113,   2360,
            25,    256,   7588,    198,    720,   2654,   2796,   2696,     25,
           220,   7588,   1078,   4185,  14215,   2696,     25,    256,   7588,
           198,    720,   1956,    315,  36127,     25,    220,   7588,   1835,
          6834,     25,    256,    386,  27907,   1898,     25,  52699,   1341,
          4069,  27907,   2460,   2431,    552,     25,    720,  29305,    292,
           484,   1354,    611,  37833,  17757,  91073,  27907,  10673,   2518,
            25,   1328,   5056,  14681,  68538,    512,     37,   2099,    323,
         69393,  27907,  35575,  71212,    477,    763,  78134,  45546,    512,
          6101,   3092,  74378,    532,  99647,    323,   1314,   4851,  31747,
          1430,   2065,    198,   6101,   3092,  74378,    532,  99647,    198,
          6101])


In [None]:
# Load the data
# df = pd.read_csv("/h/emzed/data/qa_discharge.csv")

# Initialize empty list to store tokenized texts
tokenized_texts = []

# Tokenize each text and append to list
for text in df['text']:
    tokens = tokenizer(text, return_tensors="pt")
    tokenized_texts.append(tokens)

print(f"Tokenized {len(tokenized_texts)} texts")
print(f"First text tokens shape: {tokenized_texts[0]['input_ids'].shape}")


In [24]:
# Calculate average length of text column
avg_length = df['text'].str.len().mean()
print(f"Average length of text: {avg_length:.2f} characters")

# Calculate average number of tokens
# avg_tokens = df['text'].apply(lambda x: len(tokenizer(x)['input_ids'])).mean()
# print(f"Average number of tokens: {avg_tokens:.2f} tokens")


Average length of text: 12149.97 characters


In [25]:
# Find text with length closest to mean
mean_length = df['text'].str.len().mean()
closest_idx = (df['text'].str.len() - mean_length).abs().idxmin()
closest_text = df.loc[closest_idx, 'text']

print(f"Mean length: {mean_length:.2f}")
print(f"Length of closest text: {len(closest_text)}")


Mean length: 12149.97
Length of closest text: 12150


In [26]:
tokens = tokenizer(closest_text, return_tensors="pt")
print(f"Number of tokens: {len(tokens['input_ids'][0])}")

Number of tokens: 3424


In [29]:
# Count entries with text length below 1500
count_below_1500 = len(df[df['text'].str.len() < 12149])
print(f"Number of entries with text length below 1500: {count_below_1500}")
print(f"Percentage: {(count_below_1500/len(df))*100:.2f}%")


Number of entries with text length below 1500: 64338
Percentage: 56.88%


In [28]:
import json

with open("/model-weights/Llama-3.2-1B/config.json", "r") as f:
    config = json.load(f)
    print(f"Maximum position embeddings: {config['max_position_embeddings']}")

Maximum position embeddings: 131072


In [30]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    torch_dtype=torch.float16,
    device_map="auto",
    use_safetensors=True,
    trust_remote_code=True
)

In [31]:
# Generate output from the model using the closest text
inputs = tokenizer(closest_text, return_tensors="pt").to(model.device)
outputs = model.generate(
    inputs["input_ids"],
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id
)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [5]:
# Get max input length for the model
max_length = tokenizer.model_max_length
print(f"Maximum input length: {max_length} tokens")


Maximum input length: 131072 tokens


In [10]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [11]:
print(f"Model vocabulary size: {model.config.vocab_size}")
print(f"Tokenizer vocabulary size: {len(tokenizer)}")

Model vocabulary size: 128256
Tokenizer vocabulary size: 128256


In [12]:
df = pd.read_csv("/h/emzed/data/qa_discharge_masked.csv", nrows=5)

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [67]:
class QuestionGenDataset(Dataset):
    def __init__(self, data, tokenizer):
        self._data = data
        self._tokenizer = tokenizer

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        item = self._data[idx]
        
        # Format the input-output pair
        prompt = f"### Text:\n{item['masked_text']}\n### Question:\n"
        target = f"{item['q']}</s>"
        
        # Combine prompt and target for full sequence
        full_sequence = prompt + target
        
        # Tokenize without return_tensors first
        model_inputs = self._tokenizer(
            full_sequence,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_attention_mask=True
        )
        
        # Convert to tensors manually and add batch dimension
        input_ids = torch.tensor(model_inputs['input_ids']).unsqueeze(0)
        attention_mask = torch.tensor(model_inputs['attention_mask']).unsqueeze(0)
        
        # Create labels
        labels = input_ids.clone()
        
        # Get prompt length (tokenize without return_tensors)
        prompt_tokens = self._tokenizer(prompt)['input_ids']
        prompt_length = len(prompt_tokens)
        
        # Set prompt tokens to -100
        labels[:, :prompt_length] = -100
        
        # Set padding tokens to -100
        labels[labels == self._tokenizer.pad_token_id] = -100
        
        return {
            'input_ids': input_ids.squeeze(0),  # Remove batch dimension before returning
            'attention_mask': attention_mask.squeeze(0),
            'labels': labels.squeeze(0)
        }

In [63]:
longest_text = df['masked_text'].str.len().idxmax()
longest_text_content = df.loc[longest_text, 'masked_text']
tokenized_text = tokenizer(longest_text_content, return_tensors="pt")
max_length = tokenized_text['input_ids'].shape[1]

In [68]:
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = QuestionGenDataset(train_test_split['train'], tokenizer)
val_dataset = QuestionGenDataset(train_test_split['test'], tokenizer)

In [69]:
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    remove_unused_columns=False,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    gradient_accumulation_steps=4,
    fp16=True,
    learning_rate=2e-5,
)

In [72]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


ValueError: not enough values to unpack (expected 3, got 2)

In [73]:
class QuestionGenDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self._data = data
        self._tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        item = self._data[idx]
        
        # Format the input-output pair
        prompt = f"### Text:\n{item['masked_text']}\n### Question:\n"
        target = f"{item['q']}</s>"
        
        # Combine prompt and target
        full_sequence = prompt + target
        
        # Tokenize
        model_inputs = self._tokenizer(
            full_sequence,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'  # Important: return pytorch tensors
        )
        
        # Create labels (don't squeeze tensors)
        labels = model_inputs['input_ids'].clone()
        
        # Get prompt length
        prompt_tokens = self._tokenizer(prompt, return_tensors='pt')
        prompt_length = prompt_tokens['input_ids'].shape[1]
        
        # Set prompt tokens to -100
        labels[:, :prompt_length] = -100
        
        # Set padding tokens to -100
        labels[labels == self._tokenizer.pad_token_id] = -100
        
        return {
            'input_ids': model_inputs['input_ids'],
            'attention_mask': model_inputs['attention_mask'],
            'labels': labels
        }

In [74]:
# Add before training
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False  # Important for training

In [75]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    gradient_accumulation_steps=4,
    fp16=True,
    learning_rate=2e-5,
    remove_unused_columns=False,  # Important for custom datasets
    prediction_loss_only=True,    # Add this for more efficient training
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [76]:
# 1. Initialize tokenizer and model
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

# 2. Prepare datasets
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = QuestionGenDataset(train_test_split['train'], tokenizer, max_length=1024)
val_dataset = QuestionGenDataset(train_test_split['test'], tokenizer, max_length=1024)

# 3. Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# 4. Train
trainer.train()

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


KeyboardInterrupt: 