**LLaMA**

In [437]:
# Local Windows locations of the data, model code and tokenizer.
# Raw strings keep every backslash literal, so the values no longer depend on
# Python passing unrecognised escapes such as "\m" or "\H" through unchanged.
TRAIN_PATH = r'C:\Users\mmont\HW2\03.jsonl'
TEST_PATH = r'C:\Users\mmont\HW2\test.jsonl'
VAL_PATH = r'C:\Users\mmont\HW2\val.jsonl'
MODEL_PATH = r'C:\Users\mmont\HW2' #folder with generation.py, model.py, and tokenizer.py
TRAINED_SPM_PATH = r'C:\Users\mmont\HW2\tokenizer.model' #downloaded from Ed post

**Init**

In [438]:
# Run configuration shared by the cells below.
# Alternative sequence length kept for reference: MAX_SEQ_LEN = 2048
MAX_SEQ_LEN = 256           # upper bound on tokens per example (used when truncating)
INGESTED_SAMPLE_CNT = 2500  # number of training records read from disk
MAX_BSZ = 10                # batch size handed to the model args and dataloaders
MINI_MODEL = False          # True selects the smaller model configuration

# INGESTED_SAMPLE_CNT must be a multiple of MAX_BSZ; asymmetric (ragged final)
# batches break something downstream, so fail here instead of mid-training.
assert INGESTED_SAMPLE_CNT % MAX_BSZ == 0, \
    'INGESTED_SAMPLE_CNT must be a multiple of MAX_BSZ'

In [439]:
import sys
sys.path.append(MODEL_PATH)

import time

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.
from model_train import ModelArgs, Transformer #no cuda, no inference mode, no output projection (all sequences),\
                                               #no init_method=lambda x: x (fairscale default),\
                                               #remove self.cache; not for training,
                                               #CONSIDER: fairscale -> torch.nn replacement (not needed; good practice)
from tokenizer_zeropad import Tokenizer #override default padding to 0 to shut tok_embed up about indexing\
                                        #Consider eos() as token

#to-do: model checkpoints (torch.save)

**Data**

In [440]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas
from pathlib import Path
import numpy as np

In [441]:
def examine_data(data):
    '''Debugging helper: print the type and length of ``data`` and describe its first element.

    data: any sized, indexable container (the notebook passes lists).
    Prints the key set as well when the first element is a dict.
    An empty container is reported as such instead of raising IndexError.
    Returns None.
    '''
    print('Input type: {}'.format(type(data)))
    print('Length: {}'.format(len(data)))
    if len(data) == 0:
        # Nothing to index; data[0] would raise on an empty container.
        print('First element type: n/a (empty input)')
        return
    first = data[0]
    print('First element type: {}'.format(type(first)))
    # isinstance also covers dict subclasses (e.g. OrderedDict).
    if isinstance(first, dict):
        print('Keys: {}'.format(first.keys()))
    return

def make_data_list(filepath:str, maxiter:int): #no much better way; maybe webtext for bonus point
    '''Read a JSON-lines file into a list of parsed records.

    filepath: path of a file holding one JSON document per line.
    maxiter: tripwire cap on the number of records returned, so that a huge
        file cannot exhaust memory.
    Returns a list with at most ``maxiter`` parsed objects, in file order.
    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    '''
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if len(data) >= maxiter:
                break
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) hold no record; skip them.
                continue
            data.append(json.loads(line))
    return data

# Read the capped training split, and a validation split capped at one fifth of that size.
train_data_raw = make_data_list(TRAIN_PATH, INGESTED_SAMPLE_CNT)
# Floor division keeps the cap an int, matching make_data_list's ``maxiter:int``
# (plain "/" would hand it a float).
val_data_raw = make_data_list(VAL_PATH, INGESTED_SAMPLE_CNT // 5)
examine_data(train_data_raw) #meta might be a useful extra points and/or ablation (Mitchell doesn't recommend)

Input type: <class 'list'>
Length: 2500
First element type: <class 'dict'>
Keys: dict_keys(['text', 'meta'])


In [442]:
def extract_texts(data_list): #list of strings; BAD!
    '''Return the 'text' field of every record, in order, dropping all other fields (the metadata).'''
    texts = []
    for record in data_list:
        texts.append(record['text'])
    return texts

# Text-only views of the raw records, printed for inspection.
# Note: the Dataset objects built in a later cell wrap train_data_raw / val_data_raw
# directly, not these lists.
train_texts = extract_texts(train_data_raw)
val_texts = extract_texts(val_data_raw)
examine_data(train_texts)

Input type: <class 'list'>
Length: 2500
First element type: <class 'str'>


In [443]:
class TextDataset(Dataset):
    '''Dataset that turns raw ``{'text': ...}`` records into next-token training pairs.

    data: indexable collection of dicts that each hold a 'text' entry.
    tokenizer: object exposing ``encode(text, bos=..., eos=...)`` (returning a list
        of token ids) plus ``bos_id`` and ``eos_id`` attributes.
    max_seq_len: optional cap on the length of each returned sequence. When left
        as None, the module-level MAX_SEQ_LEN is read at item-access time, which
        is the original behaviour.
    '''
    def __init__(self, data, tokenizer, max_seq_len=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        '''Return ``{'input_ids', 'target_ids'}`` as 1-D long tensors of equal length.

        input_ids is BOS + tokens and target_ids is tokens + EOS, so position i
        of the target is the token that follows position i of the input.
        '''
        seq_limit = MAX_SEQ_LEN if self.max_seq_len is None else self.max_seq_len
        text = self.data[idx]['text']
        encoded_text = self.tokenizer.encode(text, bos=False, eos=False)

        # Each sequence gains exactly one special token (BOS on the input, EOS on
        # the target), so keep at most seq_limit - 1 text tokens.
        encoded_text = encoded_text[:seq_limit - 1]

        return {
            'input_ids': torch.tensor([self.tokenizer.bos_id] + encoded_text, dtype=torch.long),
            'target_ids': torch.tensor(encoded_text + [self.tokenizer.eos_id], dtype=torch.long)
        }


def collate_fn(batch, pad_id=None):
    '''Stack variable-length examples into right-padded batch tensors.

    batch: list of dicts with 1-D 'input_ids' and 'target_ids' tensors.
    pad_id: value used to fill the padding positions. When None (the default,
        which is what DataLoader uses when it calls ``collate_fn(batch)``), the
        module-level ``tokenizer.pad_id`` is used.
    Returns a dict with 'input_ids' and 'target_ids', each of shape
    (len(batch), longest sequence in the batch).
    '''
    if pad_id is None:
        pad_id = tokenizer.pad_id

    input_ids = [item['input_ids'] for item in batch]
    target_ids = [item['target_ids'] for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_id)
    target_ids = pad_sequence(target_ids, batch_first=True, padding_value=pad_id)

    return {
        'input_ids': input_ids,
        'target_ids': target_ids
    }

In [444]:
def examine_dataset(dataset):
    '''Debugging helper: summarise a dataset's ``data`` attribute and show its tokenizer.'''
    print('self.data:')
    examine_data(dataset.data)
    tokenizer_line = 'Tokenizer: {}'.format(dataset.tokenizer)
    print(tokenizer_line)

# One tokenizer, loaded from TRAINED_SPM_PATH, is shared by both datasets.
# collate_fn also reads this module-level `tokenizer` for its padding id.
tokenizer = Tokenizer(TRAINED_SPM_PATH) #pretrained spm
# The datasets wrap the raw records (dicts with 'text' and 'meta'); tokenisation happens per item.
train_dataset = TextDataset(train_data_raw, tokenizer) #Dataset obj
val_dataset = TextDataset(val_data_raw, tokenizer)
# Print a summary of each dataset for a quick sanity check.
examine_dataset(train_dataset)
examine_dataset(val_dataset)

self.data:
Input type: <class 'list'>
Length: 2500
First element type: <class 'dict'>
Keys: dict_keys(['text', 'meta'])
Tokenizer: <tokenizer_zeropad.Tokenizer object at 0x000001EB8F322F40>
self.data:
Input type: <class 'list'>
Length: 50
First element type: <class 'dict'>
Keys: dict_keys(['text', 'meta'])
Tokenizer: <tokenizer_zeropad.Tokenizer object at 0x000001EB8F322F40>


**Training**

Configure environment for CPU

In [445]:
import torch
import torch.distributed as dist
import fairscale.nn.model_parallel.initialize as fs_init

# Environment variables describing a single-process run (rank 0, world size 1).
# NOTE(review): presumably these are meant for the process-group / model-parallel
# initialisation below; both calls are commented out, so within this cell the
# variables are only set, not used.
%env RANK=0
%env WORLD_SIZE=1
%env MASTER_ADDR=localhost
%env MASTER_PORT=0

#torch.distributed.init_process_group(backend='gloo')
#fs_init.initialize_model_parallel(1) #1 worker

env: RANK=0
env: WORLD_SIZE=1
env: MASTER_ADDR=localhost
env: MASTER_PORT=0


Instantiate model

In [446]:
# Full-size configuration.
model_args = ModelArgs(
    dim=512,
    n_layers=8,
    n_heads=8,
    vocab_size=tokenizer.n_words,
    multiple_of=256,
    norm_eps=1e-5,
    max_batch_size=MAX_BSZ,
    max_seq_len=MAX_SEQ_LEN,
)

# Reduced configuration (half the width, layers and heads) for quick experiments.
mini_args = ModelArgs(
    dim=256,
    n_layers=4,
    n_heads=4,
    vocab_size=tokenizer.n_words,
    multiple_of=256,
    norm_eps=1e-5,
    max_batch_size=MAX_BSZ, #only works for 32; no idea why
    max_seq_len=MAX_SEQ_LEN,
)

# Choose the configuration once so that the model and both dataloaders agree on it
# (previously the loaders always read model_args, even when MINI_MODEL was set).
active_args = mini_args if MINI_MODEL else model_args #MINI_MODEL: global var (2nd cell)
model = Transformer(active_args)

train_dataloader = DataLoader(train_dataset, batch_size=active_args.max_batch_size, shuffle=True, collate_fn=collate_fn)
# Validation is not shuffled: batch composition stays the same from one evaluation
# to the next, so successive validation losses are directly comparable.
val_dataloader = DataLoader(val_dataset, batch_size=active_args.max_batch_size, shuffle=False, collate_fn=collate_fn)
optimizer = torch.optim.AdamW(model.parameters()) #Ablation idea: 
loss_func = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_id)  #ignores padding token for loss calculation

Training loop

In [447]:
#debugging tools
def examine_tensor(tensor):
    '''Debugging helper: print a tensor's type, dtype, shape, rank, device and gradient state.'''
    template = ('TENSOR OVERVIEW\n'
                'Type: {}\n'
                'Data Type: {}\n'
                'Shape: {}\n'
                'Number of Dimensions: {}\n'
                'Device: {}\n'
                'Requires Grad: {}\n'
                'Gradient: {}\n')
    # Field order matches the placeholders in the template above.
    fields = (
        tensor.type(),
        tensor.dtype,
        tensor.shape,
        tensor.ndim,
        tensor.device,
        tensor.requires_grad,
        tensor.grad,
    )
    print(template.format(*fields))

def flag(msg='unspecified'):
    '''Debugging helper: print ``msg`` behind a "FLAG: " marker.'''
    line = 'FLAG: {}'.format(msg)
    print(line)

def loop_summary(titles:tuple, tensors:tuple):
    '''Debugging helper: for each title, print it as a flag followed by the overview of the tensor at the same index.'''
    for position, title in enumerate(titles):
        flag(title)
        examine_tensor(tensors[position])

def examine_dataloader(dataloader):
    '''Debugging helper: print batch count, batch size, shuffle state and worker count of a DataLoader.

    NOTE: a later cell defines another function with the same name; once that
    cell has run, it replaces this one.
    '''
    # DataLoader does not keep the ``shuffle`` flag it was built with, so it is
    # inferred from the sampler type (shuffle=True installs a RandomSampler).
    shuffled = isinstance(dataloader.sampler, torch.utils.data.RandomSampler)
    print('DATALOADER OVERVIEW\n'
          'Number of Batches: {}\n'
          'Batch Size: {}\n'
          'Shuffle: {}\n'
          'Number of Workers: {}\n'.format(
              len(dataloader), 
              dataloader.batch_size,
              shuffled,
              dataloader.num_workers))
    return


In [448]:
def examine_dataloader(dataloader):
    '''Debugging helper: print a DataLoader's size, the tensor shapes of its first batch, and a sample from its dataset.

    Pulls one batch from the loader, so it expects the loader to yield at least
    one batch and each batch to be a dict of tensors.
    '''
    header = ('DATALOADER OVERVIEW\n'
              'Number of Batches: {}\n'
              'Batch Size: {}\n')
    print(header.format(len(dataloader), dataloader.batch_size))

    # Inspect a single batch rather than iterating over the whole loader.
    sample_batch = next(iter(dataloader))
    print('First batch overview:')
    print('Keys: ', sample_batch.keys())

    for name, value in sample_batch.items():
        print('Shape of {} tensor: {}'.format(name, value.shape))

    # The dataset behind the loader.
    source = dataloader.dataset
    print('\nDataset overview:')
    print('Length of Dataset: ', len(source))

    # Best effort: a dataset that cannot be indexed is reported, not fatal.
    try:
        print('Example item: ', source[0])
    except Exception as err:
        print('Could not retrieve item from dataset: ', str(err))

# Print the overview of both loaders. This uses the examine_dataloader defined
# directly above, which replaces the earlier definition of the same name.
examine_dataloader(train_dataloader)
examine_dataloader(val_dataloader)

DATALOADER OVERVIEW
Number of Batches: 250
Batch Size: 10

First batch overview:
Keys:  dict_keys(['input_ids', 'target_ids'])
Shape of input_ids tensor: torch.Size([10, 256])
Shape of target_ids tensor: torch.Size([10, 256])

Dataset overview:
Length of Dataset:  2500
Example item:  {'input_ids': tensor([    1,  9041,   558,  5921,   478,  1308,   550, 26049, 22873,  3614,
          304,   263,  2532,  1974, 27822,   297,  7660, 29892,   360, 29889,
        29907, 29889,   304, 15905,  9279,   363,  4857,  1747,   478, 29909,
         9045,  2562, 29892, 17231,  1009,  9554,  7014,   723,  5401,  2304,
         1192,   322,  5220,   292,   785,   363,  5314,  4822,   278, 14311,
        29889,    13,    13,    13,    13, 29911,  5086,   263,   274,   434,
          515,   278, 16417,  1510,  1383,   935,   323,   804, 29892,   278,
          478, 29909,   512, 13715,   362, 27819,  8373,  6296,  1438, 19001,
        29899, 14056,   558,  1600,  1295,  1434,   263,  9451,   310,   478,

In [449]:
import os
import shutil

# Directory that train() writes its outputs into.
# NOTE: the name shadows the built-in dir(); it is kept because train() reads it.
dir = './ckpts'
# exist_ok=True makes re-running this cell safe without a separate existence check.
os.makedirs(dir, exist_ok=True)



def _mean_validation_loss():
    '''Return the mean per-batch loss of ``model`` over ``val_dataloader`` (NaN if the loader is empty).'''
    if len(val_dataloader) == 0:
        return float('nan')

    was_training = model.training
    model.eval()
    running_loss = 0.0
    # No gradients are needed for evaluation, so skip building the autograd graph.
    with torch.no_grad():
        for val_batch in val_dataloader:
            logits = model(val_batch['input_ids'], start_pos=0) #bsz x seq_len x vocab_size
            flat_logits = logits.view(-1, logits.size(-1)) #(bsz*seq_len) x vocab_size
            flat_targets = val_batch['target_ids'].view(-1) #(bsz*seq_len)
            running_loss += loss_func(flat_logits, flat_targets).item()
    # Put the model back into whichever mode the caller had it in.
    model.train(was_training)
    return running_loss / len(val_dataloader)


def train():
    '''Train the module-level ``model`` and write the loss log, checkpoint and hyperparameters to disk.

    Reads the module-level ``model``, ``optimizer``, ``loss_func``,
    ``train_dataloader``, ``val_dataloader``, ``INGESTED_SAMPLE_CNT``,
    ``MINI_MODEL``, ``model_args``, ``mini_args`` and ``dir``.

    After every training batch the full validation set is evaluated. Both losses
    are stored in the row whose index equals the 1-based global step counter;
    row 0 is unused and stays NaN.

    Writes:
      * out.csv - per-step training and validation loss,
      * <dir>/model<INGESTED_SAMPLE_CNT>pts.pth - model and optimizer state,
      * <dir>/params<INGESTED_SAMPLE_CNT>pts.json - hyperparameters of the trained model.
    Returns None.
    '''
    # Same epoch rule as before: int(2500 / INGESTED_SAMPLE_CNT) passes over the data.
    n_epochs = int(2500/INGESTED_SAMPLE_CNT)
    steps_per_epoch = len(train_dataloader)

    # One row per training step plus the unused row 0, sized from the actual run
    # and NaN-filled so that unwritten rows never hold garbage.
    data = np.full((n_epochs * steps_per_epoch + 1, 2), np.nan)

    # Debugging aid: anomaly detection adds checks that slow training down.
    torch.autograd.set_detect_anomaly(True)
    model.train()
    counter = 0 #global step counter (1-based once training starts)
    for epoch in range(n_epochs):
        total_loss = 0
        last_val_loss = float('nan')

        print("Epoch " + str(epoch))
        for batch in train_dataloader:
            counter += 1
            print('Loop {}'.format(counter))

            inputs = batch['input_ids'] #bsz x seq_len
            targets = batch['target_ids'] #bsz x seq_len

            outputs = model(inputs, start_pos=0) #bsz x seq_len x vocab_size
            flat_outputs = outputs.view(-1, outputs.size(-1)) #(bsz*seq_len) x vocab_size
            flat_targets = targets.view(-1) #(bsz*seq_len)

            loss = loss_func(flat_outputs, flat_targets) #flattening confirmed by TA
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            step_loss = loss.item()
            total_loss += step_loss
            print(step_loss)
            data[counter][0] = step_loss

            #this would be out of the loop for per-epoch validation loss
            last_val_loss = _mean_validation_loss()
            data[counter][1] = last_val_loss
            print(data[counter][1])

        #summarise this epoch; the validation figure is the one measured after the last batch
        avg_train_loss = total_loss / steps_per_epoch if steps_per_epoch else float('nan')
        avg_val_loss = last_val_loss
        print("Average training loss: {0:.2f}".format(avg_train_loss))
        print("Average validation loss: {0:.2f}".format(avg_val_loss))

    # output data to csv file
    df = pandas.DataFrame(data)
    df.to_csv(Path('out.csv'))

    # save final model (same directory as the params file below)
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, dir + "/model" + str(INGESTED_SAMPLE_CNT) + "pts.pth")

    # Record the hyperparameters of the configuration that was actually trained.
    trained_args = mini_args if MINI_MODEL else model_args
    json_params = json.dumps({
        "dim": trained_args.dim,
        "n_layers": trained_args.n_layers,
        "n_heads": trained_args.n_heads,
        "vocab_size": trained_args.vocab_size,
        "multiple_of": trained_args.multiple_of,
        "norm_eps": trained_args.norm_eps,
    }, indent=4)

    # The with-block closes the file; no explicit close() is needed.
    with open(dir + "/params" + str(INGESTED_SAMPLE_CNT) + "pts.json", "w") as outfile:
        outfile.write(json_params)


# Run training; per-step training and validation losses are printed as it goes.
train()

Epoch 0
Loop 1
10.387748718261719
10.188249588012695
Loop 2
10.18424129486084
9.693785095214844
Loop 3
9.71471881866455
9.287421035766602
Loop 4
9.31380558013916
8.922862434387207
Loop 5
9.003082275390625
8.608777236938476
Loop 6
8.740099906921387
8.349223709106445
Loop 7
8.569924354553223
8.153986358642578
Loop 8
8.31257438659668
8.008385467529298
Loop 9
8.079536437988281
7.914016628265381
Loop 10
7.903517723083496
7.857735347747803
Loop 11
7.899935722351074
7.837017250061035
Loop 12
7.9402337074279785
7.822018241882324
Loop 13
7.844095706939697
7.815658187866211
Loop 14
7.902446269989014
7.774166488647461
Loop 15
7.800807952880859
7.735138034820556
Loop 16
7.682217121124268
7.76634635925293
Loop 17
7.945694923400879
7.723428344726562
Loop 18
7.956690788269043
7.702315998077393
Loop 19
7.314047336578369
7.716390323638916
Loop 20
7.783333778381348
7.68515043258667
Loop 21
7.953357696533203
7.632419109344482
Loop 22
7.376928329467773
7.639599800109863
Loop 23
7.5660624504089355
7.603056