In [1]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
llama_instruction_df = pd.read_csv('data/llama-pretrain-instruction-set.csv', index_col = 0)

In [4]:
llama_instruction_df.head()

Unnamed: 0,Input,Context,Instruction,Output
0,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUG
1,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUG
2,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUG
3,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUU
4,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUU


In [5]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')

# Load the model with pre-training turned off
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# Reinitialize the model's weights (essentially turning off pretraining)
model.init_weights()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Add special tokens
special_tokens = {
    'pad_token': '[PAD]',
    'eos_token': '[EOS]',
    'bos_token': '[SOS]',  # Beginning of Sequence (sometimes also referred to as Start of Sequence)
    'unk_token': '[UNK]'
}

# Add the special tokens to the tokenizer
tokenizer.add_special_tokens(special_tokens)

4

In [7]:
def tokenize_instruction_set(row):
    # Construct the input text by concatenating context, instruction, and input
    input_text = f"[SOS] {row['Context']} {row['Instruction']} {row['Input']} [EOS]"
    # Construct the output text
    output_text = f"{row['Output']} [EOS]"

    # Tokenize the input text
    input_ids = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).input_ids
    # Tokenize the output text
    output_ids = tokenizer(output_text, return_tensors='pt', padding=True, truncation=True).input_ids
    
    return input_ids, output_ids

In [8]:
# Apply the tokenization function to the entire dataset
llama_instruction_df[['input_ids', 'output_ids']] = llama_instruction_df.apply(tokenize_instruction_set, axis=1, result_type='expand')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
llama_instruction_df[['input_ids', 'output_ids']]

Unnamed: 0,input_ids,output_ids
0,"[[tensor(1), tensor(32002), tensor(29871), ten...","[[tensor(1), tensor(501), tensor(29954), tenso..."
1,"[[tensor(1), tensor(32002), tensor(29871), ten...","[[tensor(1), tensor(501), tensor(29954), tenso..."
2,"[[tensor(1), tensor(32002), tensor(29871), ten...","[[tensor(1), tensor(501), tensor(29954), tenso..."
3,"[[tensor(1), tensor(32002), tensor(29871), ten...","[[tensor(1), tensor(501), tensor(29954), tenso..."
4,"[[tensor(1), tensor(32002), tensor(29871), ten...","[[tensor(1), tensor(501), tensor(29954), tenso..."
...,...,...
80011,"[[tensor(1), tensor(32002), tensor(29871), ten...","[[tensor(1), tensor(402), tensor(6344), tensor..."
80012,"[[tensor(1), tensor(32002), tensor(29871), ten...","[[tensor(1), tensor(501), tensor(29954), tenso..."
80013,"[[tensor(1), tensor(32002), tensor(29871), ten...","[[tensor(1), tensor(315), tensor(6344), tensor..."
80014,"[[tensor(1), tensor(32002), tensor(29871), ten...","[[tensor(1), tensor(14614), tensor(29965), ten..."


In [10]:
# First, split the dataset into training + validation and testing sets
train_val_df, test_df = train_test_split(llama_instruction_df, test_size=0.2, random_state=42)

# Then, split the training + validation set into training and validation sets
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

In [11]:
class InstructionDataset(Dataset):
    def __init__(self, df):
        self.input_ids = df['input_ids'].tolist()
        self.output_ids = df['output_ids'].tolist()

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx].squeeze(),
            'labels': self.output_ids[idx].squeeze()
        }

def custom_collate_fn(batch):
    input_ids = [item['input_ids'].squeeze(0) for item in batch]
    labels = [item['labels'].squeeze(0) for item in batch]
    
    # Pad sequences to the length of the longest sequence in the batch
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id)
    
    return {
        'input_ids': input_ids_padded,
        'labels': labels_padded
    }

In [12]:
# Create the Trainning DataLoader
train_dataset = InstructionDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=custom_collate_fn)

# Create the validation DataLoader
val_dataset = InstructionDataset(val_df)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=custom_collate_fn)

In [12]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # Accumulate gradients over 8 steps
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,  # Logs loss every 100 steps
    evaluation_strategy="epoch",  # Evaluate after each epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 18.50 MiB is free. Including non-PyTorch memory, this process has 23.54 GiB memory in use. Of the allocated memory 23.17 GiB is allocated by PyTorch, and 1.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [16]:
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

Embedding(32004, 4096)

In [None]:
# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Set up a learning rate scheduler
num_epochs = 3
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# To store training and validation losses
train_loss_values = []
val_loss_values = []

# Training loop
model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    
    # Training
    total_train_loss = 0
    model.train()  # Set the model to training mode
    
    for step, batch in enumerate(train_dataloader):
        # input_ids = batch['input_ids'].to(device)
        input_ids = batch['input_ids']
        # labels = batch['labels'].to(device)
        labels = batch['labels']
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        
        total_train_loss += loss.item()
        train_loss_values.append(loss.item())
        
        if step % 100 == 0 and step != 0:
            avg_train_loss = total_train_loss / (step + 1)
            print(f"Step {step}, Training Loss: {avg_train_loss}")
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} Training Loss: {avg_train_loss}")
    
    # Validation
    total_val_loss = 0
    model.eval()  # Set the model to evaluation mode
    
    with torch.no_grad():
        for batch in val_dataloader:
            # input_ids = batch['input_ids'].to(device)
            input_ids = batch['input_ids']
            labels = batch['labels']
            # labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            
            total_val_loss += loss.item()
            val_loss_values.append(loss.item())
    
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss}")
    
    # Store epoch loss values
    train_loss_values.append(avg_train_loss)
    val_loss_values.append(avg_val_loss)

Epoch 1/3
