**LLaMA**

In [None]:
# Dataset splits; each file is read line by line as JSON by make_data_list.
TRAIN_PATH = "21.jsonl"
TEST_PATH = "test.jsonl"
VAL_PATH = "val.jsonl"

# Directory containing model_train.py and tokenizer_zeropad.py
# (both modified from Meta's codebase). Appended to sys.path below.
MODEL_PATH = ""

# Directory w/ tokenizer.model (trained sentencepiece tokenizer).
# NOTE(review): this value is handed directly to Tokenizer(...); confirm whether
# the modified tokenizer expects the directory or the path of the .model file itself.
TRAINED_SPM_PATH = ""

**Init**

In [None]:
import torch
import sys
# Make the project-local modules (model_train, tokenizer_zeropad) importable.
# An empty MODEL_PATH adds '' to sys.path, which Python resolves to the current working directory.
sys.path.append(MODEL_PATH) #where transformer and tokenizer are defined

In [None]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.
from model_train import ModelArgs, Transformer #ctrl+f and comment out cuda (no GPU), removed caching, configured Transformer.forward for training
from tokenizer_zeropad import Tokenizer #custom padding value

**Data**

In [None]:
import json
from torch.utils.data import Dataset, DataLoader
from typing import List
from torch.nn.utils.rnn import pad_sequence

Ingestion

In [None]:
def make_data_list(filepath:str, maxiter:int) -> List[dict]:
    '''Read at most `maxiter` records from a JSON Lines file into a list.

    The cap is a tripwire that keeps a large file from exhausting memory
    while prototyping.

    Args:
        filepath: path to a file holding one JSON document per line.
        maxiter: maximum number of records to return; values <= 0 yield [].

    Returns:
        The parsed records, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    '''
    data = []
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if len(data) >= maxiter:
                break
            # Blank lines (e.g. a trailing newline at end of file) are not records
            # and would make json.loads raise, so skip them without counting them.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

Data Model

In [None]:
class TextDataset(Dataset):
    '''Next-token-prediction dataset built from raw strings.

    Every text is encoded once, up front, with beginning- and end-of-sentence
    tokens. The model input is the encoded sequence without its last token and
    the target is the same sequence without its first token, so position i of
    the target is the token that follows position i of the input.

    `tokenizer` must expose `encode(text, bos=..., eos=...)` returning a
    sliceable sequence of integer token ids.
    '''

    def __init__(self, texts, tokenizer):
        token_seqs = [tokenizer.encode(text, bos=True, eos=True) for text in texts]
        # input drops the final token, target drops the first one (shift by one)
        self.inputs = [torch.tensor(seq[:-1], dtype=torch.long) for seq in token_seqs]
        self.targets = [torch.tensor(seq[1:], dtype=torch.long) for seq in token_seqs]

    def __getitem__(self, idx):
        return dict(input_ids=self.inputs[idx], target_ids=self.targets[idx])

    def __len__(self):
        return len(self.inputs)

In [None]:
def collate_fn(batch, max_seq_len=2048, pad_id=0):
    '''Collate tokenized samples into right-padded batch tensors.

    Args:
        batch: list of dicts, each holding 1-D 'input_ids' and 'target_ids' tensors.
        max_seq_len: sequences longer than this are truncated to their first
            `max_seq_len` tokens. Defaults to 2048.
        pad_id: value used to pad shorter sequences up to the longest one in
            the batch. Defaults to 0. It should equal the `ignore_index` of the
            loss so padded positions do not contribute to training.

    Returns:
        Dict with 'input_ids' and 'target_ids', each of shape
        (len(batch), longest truncated sequence in the batch).

    DataLoader calls this with `batch` only; use functools.partial to bind
    non-default `max_seq_len` / `pad_id`.
    '''
    # truncate first so padding is computed against the capped lengths
    input_ids = [item['input_ids'][:max_seq_len] for item in batch]
    target_ids = [item['target_ids'][:max_seq_len] for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_id)
    target_ids = pad_sequence(target_ids, batch_first=True, padding_value=pad_id)
    return {'input_ids': input_ids, 'target_ids': target_ids}

# Load only a handful of records per split to keep prototyping cheap.
train_data = make_data_list(filepath=TRAIN_PATH, maxiter=10)
test_data = make_data_list(filepath=TEST_PATH, maxiter=2)
val_data = make_data_list(filepath=VAL_PATH, maxiter=1)

In [None]:
def extract_texts(data_list):
    '''Return the 'text' field of every record, discarding all other metadata.'''
    texts = []
    for record in data_list:
        texts.append(record['text'])
    return texts

# Keep only the raw text of each split.
train_texts = extract_texts(data_list=train_data)
test_texts = extract_texts(data_list=test_data)
val_texts = extract_texts(data_list=val_data)

Processed Data

In [None]:
# Tokenize the training texts once, then batch them with the padding collate function.
tokenizer = Tokenizer(TRAINED_SPM_PATH)
train_dataset = TextDataset(texts=train_texts, tokenizer=tokenizer)
train_dataloader = DataLoader(
    dataset=train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=32,
)

**Training**

Configure a single-process distributed environment for CPU (suboptimal if a GPU is available; no parallelization)

In [None]:
import torch.distributed as dist
import fairscale.nn.model_parallel.initialize as fs_init

%env RANK=0
%env WORLD_SIZE=1
%env MASTER_ADDR=localhost
%env MASTER_PORT=0

torch.distributed.init_process_group(backend='gloo')
fs_init.initialize_model_parallel(1) #1 worker

Instantiate model

In [None]:
# Prototype-sized hyperparameters; the values are arbitrary, not tuned.
model_args = ModelArgs(
    vocab_size=tokenizer.n_words,
    dim=512,
    n_heads=8,
    n_layers=8,
    multiple_of=256,
    norm_eps=1e-5,
    max_seq_len=2048,
    max_batch_size=32,
)

model = Transformer(model_args)
optimizer = torch.optim.AdamW(params=model.parameters())
# Positions whose target equals the tokenizer's pad id are excluded from the loss.
# collate_fn pads with 0, so tokenizer.pad_id is expected to be 0 as well.
loss_function = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_id)

Training loop

In [None]:
def examine_tensor(tensor):
    '''Debugging helper: print a tensor followed by its key attributes, one per line.'''
    # (label, accessor) pairs; accessors are evaluated lazily, in print order
    probes = (
        ("Type:", lambda t: t.type()),
        ("Data Type:", lambda t: t.dtype),
        ("Shape:", lambda t: t.shape),
        ("Size:", lambda t: t.size()),
        ("Number of Dimensions:", lambda t: t.ndim),
        ("Device:", lambda t: t.device),
        ("Requires Grad:", lambda t: t.requires_grad),
        ("Gradient:", lambda t: t.grad),
    )
    print(tensor)
    for label, probe in probes:
        print(label, probe(tensor))

In [None]:
def train(model, dataloader, optimizer, loss_function, num_epochs):
    '''Run the training loop and print the mean batch loss after every epoch.

    Args:
        model: module called as `model(input_ids, start_pos=0)`; its output is
            flattened to (-1, last_dim) before the loss, so the last dimension
            is treated as the class/vocabulary axis.
        dataloader: iterable of dicts with 'input_ids' and 'target_ids' tensors.
            It does not need to support len(); batches are counted while iterating.
        optimizer: optimizer over the model's parameters.
        loss_function: callable taking (flattened outputs, flattened targets).
        num_epochs: number of passes over `dataloader`.

    Returns:
        None. If an epoch yields no batches its loss is reported as nan.
    '''
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0

        for batch in dataloader:
            optimizer.zero_grad()  #reset gradients

            input_ids = batch['input_ids']
            target_ids = batch['target_ids']
            outputs = model(input_ids, start_pos=0) #forward pass

            # reshape (unlike view) also accepts non-contiguous tensors
            loss = loss_function(outputs.reshape(-1, outputs.size(-1)), target_ids.reshape(-1))
            loss.backward() #backward pass
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # avoid ZeroDivisionError when the dataloader produced nothing
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print("Epoch: {}, Loss: {:.4f}".format(epoch, mean_loss))
        
# A single pass over the training data is enough for a smoke test.
num_epochs = 1
train(
    model=model,
    dataloader=train_dataloader,
    optimizer=optimizer,
    loss_function=loss_function,
    num_epochs=num_epochs,
)