In [1]:
import os
# Restrict this process to the GPU with index 1. The variable is written here,
# in the first cell, i.e. before torch is imported in the next cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [2]:
import numpy as np

import torch
import torch.nn.functional as F

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [3]:
def get_default_supported_precision(training: bool, tpu: bool = False) -> str:
    """Choose the default precision string for the available hardware.

    Args:
        training: when true the ``-mixed`` variant is returned, otherwise the
            ``-true`` variant
        tpu: whether a TPU device is used; TPUs always get ``"32-true"``

    Returns:
        ``"32-true"`` for TPU; a ``bf16-*`` string when CUDA is unavailable or
        the CUDA device reports bf16 support; a ``16-*`` string otherwise
    """
    if tpu:
        return "32-true"

    variant = "mixed" if training else "true"
    # Keep the short-circuit: bf16 support is only queried when CUDA is available.
    use_bf16 = (not torch.cuda.is_available()) or torch.cuda.is_bf16_supported()
    width = "bf16" if use_bf16 else "16"
    return f"{width}-{variant}"


In [4]:
import lightning as L

# Single-device Fabric instance without loggers.
tpu = False
strategy = "auto"
# Hardware-dependent default precision, training ("-mixed") variant.
precision = get_default_supported_precision(training=True, tpu=tpu)

fabric = L.Fabric(
    devices=1,
    strategy=strategy,
    precision=precision,
    loggers=[],
)

Using bfloat16 Automatic Mixed Precision (AMP)


In [5]:
import glob
from pathlib import Path
from typing import Optional, Tuple
import torch

from torch.utils.data import DataLoader
from functools import partial
import random

from TinyLlama.lit_gpt.packed_dataset import CombinedDataset, PackedDataset
# from TinyLlama.pretrain.tinyllama import create_dataloaders

# Each entry is a (filename prefix, sampling weight) pair.
# Both splits currently use the same "train_ind" prefix.
train_data_config = [("train_ind", 1.0)]
val_data_config = [("train_ind", 1.0)]

def create_dataloader(
    batch_size: int, block_size: int, data_dir: Path, fabric, shuffle: bool = True, seed: int = 12345, split="train"
) -> DataLoader:
    """Build a DataLoader over the packed shards found in ``data_dir``.

    For every ``(prefix, weight)`` entry of the split's data config, the files
    matching ``{prefix}*`` are collected, sorted, shuffled with ``seed`` and
    wrapped in a ``PackedDataset``. The per-prefix datasets are then mixed by a
    ``CombinedDataset`` using the normalised weights.

    Args:
        batch_size: number of samples per batch.
        block_size: length of each sample handed to ``PackedDataset``.
        data_dir: directory that contains the shard files (``Path`` or ``str``).
        fabric: object exposing ``global_rank`` and ``world_size``.
        shuffle: forwarded to ``PackedDataset`` (buffer-level shuffling).
        seed: seed for the file-order shuffle and for the datasets.
        split: ``"train"`` selects ``train_data_config``; any other value
            selects ``val_data_config``.

    Returns:
        A ``DataLoader`` over the combined dataset.

    Raises:
        RuntimeError: if the data config is empty or a prefix matches no file
            in ``data_dir``.
    """
    data_dir = Path(data_dir)
    data_config = train_data_config if split == "train" else val_data_config

    datasets = []
    for prefix, _ in data_config:
        filenames = sorted(glob.glob(str(data_dir / f"{prefix}*")))
        if not filenames:
            # Fail here: checking `datasets` afterwards cannot detect missing
            # files, because a dataset is appended for every prefix regardless.
            raise RuntimeError(
                f"No data found at {data_dir} for prefix '{prefix}'. "
                "Make sure you ran prepare_redpajama.py to create the dataset."
            )
        # NOTE: this reseeds the process-wide `random` generator (kept as in
        # the original so the resulting file order is unchanged).
        random.seed(seed)
        random.shuffle(filenames)

        dataset = PackedDataset(
            filenames,
            # n_chunks control the buffer size.
            # Note that the buffer size also impacts the random shuffle
            # (PackedDataset is an IterableDataset. So the shuffle is done by prefetch a buffer and shuffle the buffer)
            n_chunks=8,
            block_size=block_size,
            shuffle=shuffle,
            seed=seed+fabric.global_rank,
            num_processes=fabric.world_size,
            process_rank=fabric.global_rank,
        )
        datasets.append(dataset)

    if not datasets:
        # Only reachable when the data config itself is empty.
        raise RuntimeError(
            f"No data found at {data_dir}. Make sure you ran prepare_redpajama.py to create the dataset."
        )

    # Normalise the sampling weights so they sum to one.
    weights = [weight for _, weight in data_config]
    sum_weights = sum(weights)
    weights = [el / sum_weights for el in weights]

    combined_dataset = CombinedDataset(datasets=datasets, seed=seed, weights=weights)

    # Shuffling is handled inside the datasets, hence shuffle=False here.
    return DataLoader(combined_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)


def create_dataloaders(
    batch_size: int,
    block_size: int,
    fabric,
    train_data_dir: Path = Path("data/redpajama_sample"),
    val_data_dir: Optional[Path] = None,
    seed: int = 12345,
) -> Tuple[DataLoader, DataLoader]:
    """Create the training dataloader and, optionally, the validation one.

    Args:
        batch_size: number of samples per batch for both loaders.
        block_size: number of tokens per sample; loaders are built with
            ``block_size + 1`` so the next token is included as well.
        fabric: forwarded unchanged to ``create_dataloader``.
        train_data_dir: directory holding the training shards.
        val_data_dir: directory holding the validation shards; when falsy no
            validation loader is built.
        seed: seed forwarded to both loaders.

    Returns:
        ``(train_dataloader, val_dataloader)``; the second element is ``None``
        when ``val_data_dir`` is falsy.
    """
    # Arguments shared by both splits. One extra token per sample because the
    # next word is needed as well.
    shared_kwargs = dict(
        batch_size=batch_size,
        block_size=block_size + 1,
        fabric=fabric,
        seed=seed,
    )

    train_dataloader = create_dataloader(
        data_dir=train_data_dir, shuffle=True, split="train", **shared_kwargs
    )
    if not val_data_dir:
        return train_dataloader, None

    val_dataloader = create_dataloader(
        data_dir=val_data_dir, shuffle=False, split="validation", **shared_kwargs
    )
    return train_dataloader, val_dataloader


In [6]:
# Load the model
from model import Mamba, ModelArgs
from transformers import AutoTokenizer

from S5.dataloading import create_wikitext_dataset


pretrained_model_name_list = ["state-spaces/mamba-2.8b-slimpj", "state-spaces/mamba-2.8b", "state-spaces/mamba-1.4b", "state-spaces/mamba-790m", "state-spaces/mamba-370m", "state-spaces/mamba-130m"]
pretrained_model_name = pretrained_model_name_list[-2]
print("The pretrained model I'm using is:", pretrained_model_name)

model = Mamba.from_pretrained(pretrained_model_name)
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

loader_names = ["val", "train"]
ppl_over_loaders = {"val":[], "train":[]}

# l_max_power_range = 16 # 120M
l_max_power_range = 15 # 370M or 790M
# l_max_power_range = 14 # up to 2.8B

for l_max_power in range(4, l_max_power_range+1): # 16->32768
    l_max = 2 ** l_max_power

    config = {
        "l_max": l_max, 
        "data_dir": "./",
        "batch_size": int(2**l_max_power_range / l_max),
        "batch_size_eval": int(2**l_max_power_range / l_max), 
        "num_workers": 4, 
        "pin_memory": False, 
        "tokenizer": "EleutherAI/gpt-neox-20b",
        "train_data_dir": Path("/home/aiops/wangsd/TinyLlama/data/the_pile_deduplicated_EleutherAI_combined"),
        "val_data_dir": Path("/home/aiops/wangsd/TinyLlama/data/the_pile_deduplicated_EleutherAI_combined"),
    }
    assert config["batch_size"] > 0, "batch_size must be positive"

    # train_loader, val_loader, test_loader = create_wikitext_dataset(config)

    train_loader, val_loader = create_dataloaders(
        batch_size=config["batch_size"],
        block_size=config["l_max"],
        fabric=fabric,
        train_data_dir=config["train_data_dir"],
        val_data_dir=config["val_data_dir"],
        seed=3412,
    )

    # Evaluate the perplexity
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Plan
    total_loss = 0.0
    total_tokens = 0

    loaders = [val_loader, train_loader]

    for loader, loader_name in zip(loaders, loader_names):
        if loader_name == "train":
            continue
        # print("loader_name is ", loader_name)
        # Iterate through the test data loader
        for batch_index, batch in enumerate(loader):
            input_ids = batch[:,:-1].to(device)  # Move input to GPU
            output_ids = batch[:,1:].to(device)  # Move output to GPU

            # print(f"input_ids.shape is {input_ids.shape}")
            # print(f"output_ids.shape is {output_ids.shape}")
            
            # Forward pass to get the logits
            with torch.no_grad():
                logits = model(input_ids)
            
            loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), output_ids.view(-1), reduction='sum')
            
            # Update the total loss and token count
            total_loss += loss.item()
            total_tokens += input_ids.numel()

            perplexity = np.exp(total_loss / total_tokens)

            if batch_index > 16:
                break
        ppl_over_loaders[loader_name].append(perplexity)

            # print(f"Running l_max = {l_max}\n  perplexity: {perplexity:.2f}\n  total loss is {total_loss}\n  total tokens is {total_tokens}")
for loader_name in loader_names:
    print(f"loader_name is {loader_name}, ppl_over_loaders is {ppl_over_loaders[loader_name]}")

The pretrained model I'm using is: state-spaces/mamba-370m


  return self.fget.__get__(instance, owner)()


loader_name is val, ppl_over_loaders is [38.365222204445494, 25.992814713117472, 20.00329615886128, 17.047069564983918, 11.035977720353646, 10.925790582248762, 8.88301029536941, 7.1713857470179105, 8.41482932120078, 9.128650253042753, 14.138112667928835, 207.5466573880326]
loader_name is train, ppl_over_loaders is []


In [7]:
import pandas as pd

# Store the validation perplexities, together with the sequence length each one
# was measured at, in a CSV named after the model.
sequence_lengths = [2 ** power for power in range(4, l_max_power_range + 1)]
# Drop the leading "state-spaces/" from the model identifier.
model_tag = pretrained_model_name[len("state-spaces/"):]

df = pd.DataFrame(ppl_over_loaders, columns=["val"]).assign(length=sequence_lengths)
df.to_csv(f"ppl_pile_{model_tag}.csv", index=False)

In [None]:
# Runtime note: 23 min (presumably the wall-clock time of the perplexity sweep above - confirm).